From 4414a40c440599210b75fe64f1c000a2dd685208 Mon Sep 17 00:00:00 2001 From: tvdboom Date: Mon, 28 Sep 2020 19:04:21 +0200 Subject: [PATCH] drop windows & max jobs --- .travis.yml | 22 ++++------------------ docs/index.html | 2 +- docs/search/search_index.json | 2 +- docs/sitemap.xml.gz | Bin 746 -> 746 bytes docs/user_guide/index.html | 2 +- docs_sources/user_guide.md | 2 +- 6 files changed, 8 insertions(+), 22 deletions(-) diff --git a/.travis.yml b/.travis.yml index 33b40d426..c07633d5b 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,22 +1,8 @@ language: python -jobs: - include: - - name: "Python on Xenial Linux" - python: - - "3.6" - - "3.7" - - "3.8" - - name: "Python 3.7.4 on macOS" - os: osx - osx_image: xcode12u # Python 3.7.4 running on macOS 10.14.4 - language: shell # 'language: python' is an error on Travis CI macOS - - name: "Python 3.8.0 on Windows" - os: windows # Windows 10.0.17134 N/A Build 17134 - language: shell # 'language: python' is an error on Travis CI Windows - before_install: - - choco install python --version 3.8.0 - - python -m pip install --upgrade pip - env: PATH=/c/Python38:/c/Python38/Scripts:$PATH +python: + - "3.6" # Current default Python on Travis CI + - "3.7" + - "3.8" before_install: - pip install -U pip - pip install -U pytest diff --git a/docs/index.html b/docs/index.html index b6a9cb218..a4b1d4f00 100644 --- a/docs/index.html +++ b/docs/index.html @@ -552,5 +552,5 @@ diff --git a/docs/search/search_index.json b/docs/search/search_index.json index a122b52f2..969fd4ba6 100644 --- a/docs/search/search_index.json +++ b/docs/search/search_index.json @@ -1 +1 @@ -{"config":{"lang":["en"],"min_search_length":3,"prebuild_index":false,"separator":"[\\s\\-]+"},"docs":[{"location":"","text":"Automated Tool for Optimized Modelling There is no magic formula in data science that can tell us which type of machine learning algorithm will perform best for a specific use-case. 
Different models are better suited for different types of data and different problems. At best, you can follow some rough guide on how to approach problems with regard to which model to try on your data, but these are often more confusing than helpful. Best practices tell us to start with a simple model (e.g. linear regression) and build up to more complicated models (e.g. logistic regression -> random forest -> multilayer perceptron) if you are not satisfied with the results. Unfortunately, different models require different data cleaning steps, different type/amount of features, tuning a new set of hyperparameters, etc. Refactoring the code for this purpose can be quite boring and time consuming. Because of this, many data scientists end up just using the model best known to them and fine-tuning this particular model without ever trying different ones. This can result in poor performance (because the model is just not the right one for the task) or in poor time management (because you could have achieved a similar performance with a simpler/faster model). ATOM is here to help us solve these issues. With just a few lines of code, you can perform basic data cleaning steps, select relevant features and compare the performance of multiple models on a given dataset. ATOM should be able to provide quick insights on which algorithms perform best for the task at hand and provide an indication of the feasibility of the ML solution. It is important to realize that ATOM is not here to replace all the work a data scientist has to do before getting his model into production. ATOM doesn't spit out production-ready models just by tuning some parameters in its API. After helping you to determine the right model, you will most probably need to fine-tune it using use-case specific features and data cleaning steps in order to achieve maximum performance. So, this sounds a bit like AutoML, how is ATOM different than auto-sklearn or TPOT ? 
Well, ATOM does AutoML in the sense that it helps you find the best model for a specific task, but contrary to the aforementioned packages, it does not actively search for the best model. It simply runs all of them and let you pick the one that you think suites you best. AutoML packages are often black boxes: if you provide data, it will magically return a working model. Although it works great, they often produce complicated pipelines with low explainability, hard to sell to the business. In this, ATOM excels. Every step of the pipeline is accounted for, and using the provided plotting methods, it\u2019s easy to demonstrate why a model is better/worse than the other. Note A data scientist with domain knowledge can outperform ATOM if he applies usecase-specific feature engineering or data cleaning steps! Example steps taken by ATOM's pipeline: Data Cleaning Handle missing values Encode categorical features Remove outliers Balance the dataset Feature engineering Create new non-linear features Remove multi-collinear features Remove features with too low variance Select the most promising features based on a statistical test Train and validate multiple models Select hyperparameters using a Bayesian Optimization approach Train and test the models on the provided data Perform bagging to assess the robustness of the output Analyze the results Get the model scores on various metrics Make plots to compare the model performances","title":"Home"},{"location":"#automated-tool-for-optimized-modelling","text":"There is no magic formula in data science that can tell us which type of machine learning algorithm will perform best for a specific use-case. Different models are better suited for different types of data and different problems. At best, you can follow some rough guide on how to approach problems with regard to which model to try on your data, but these are often more confusing than helpful. Best practices tell us to start with a simple model (e.g. 
linear regression) and build up to more complicated models (e.g. logistic regression -> random forest -> multilayer perceptron) if you are not satisfied with the results. Unfortunately, different models require different data cleaning steps, different type/amount of features, tuning a new set of hyperparameters, etc. Refactoring the code for this purpose can be quite boring and time consuming. Because of this, many data scientists end up just using the model best known to them and fine-tuning this particular model without ever trying different ones. This can result in poor performance (because the model is just not the right one for the task) or in poor time management (because you could have achieved a similar performance with a simpler/faster model). ATOM is here to help us solve these issues. With just a few lines of code, you can perform basic data cleaning steps, select relevant features and compare the performance of multiple models on a given dataset. ATOM should be able to provide quick insights on which algorithms perform best for the task at hand and provide an indication of the feasibility of the ML solution. It is important to realize that ATOM is not here to replace all the work a data scientist has to do before getting his model into production. ATOM doesn't spit out production-ready models just by tuning some parameters in its API. After helping you to determine the right model, you will most probably need to fine-tune it using use-case specific features and data cleaning steps in order to achieve maximum performance. So, this sounds a bit like AutoML, how is ATOM different than auto-sklearn or TPOT ? Well, ATOM does AutoML in the sense that it helps you find the best model for a specific task, but contrary to the aforementioned packages, it does not actively search for the best model. It simply runs all of them and let you pick the one that you think suites you best. 
AutoML packages are often black boxes: if you provide data, it will magically return a working model. Although it works great, they often produce complicated pipelines with low explainability, hard to sell to the business. In this, ATOM excels. Every step of the pipeline is accounted for, and using the provided plotting methods, it\u2019s easy to demonstrate why a model is better/worse than the other. Note A data scientist with domain knowledge can outperform ATOM if he applies usecase-specific feature engineering or data cleaning steps! Example steps taken by ATOM's pipeline: Data Cleaning Handle missing values Encode categorical features Remove outliers Balance the dataset Feature engineering Create new non-linear features Remove multi-collinear features Remove features with too low variance Select the most promising features based on a statistical test Train and validate multiple models Select hyperparameters using a Bayesian Optimization approach Train and test the models on the provided data Perform bagging to assess the robustness of the output Analyze the results Get the model scores on various metrics Make plots to compare the model performances","title":"Automated Tool for Optimized Modelling"},{"location":"dependencies/","text":"Python As of the moment, ATOM supports Python 3.6 , 3.7 and 3.8 . Packages ATOM is built on top of several existing Python libraries. The required packages are necessary for its correct functioning. Additionally, you can install some optional packages to use machine learning estimators not provided by sklearn. 
Required numpy (>=1.17.2) scipy (>=1.4.1) pandas (>=1.0.3) tqdm (>=4.35.0) joblib (>=0.16.0) typeguard (>=2.7.1) tabulate (>=0.8.6) scikit-learn (>=0.23.1) scikit-optimize (>=0.7.4) pandas-profiling (>=2.3.0) category-encoders (>=2.1.0) imbalanced-learn (>=0.5.0) featuretools (>=0.17.0) gplearn (>=0.4.1) matplotlib (>=3.3.0) seaborn (>=0.9.0) shap (>=0.36.0) Optional xgboost (>=0.90) lightgbm (>=2.3.0) catboost (>=0.19.1) Support ATOM recognizes the support from JetBrains by providing core project contributors with a set of developer tools free of charge.","title":"Dependencies"},{"location":"dependencies/#python","text":"As of the moment, ATOM supports Python 3.6 , 3.7 and 3.8 .","title":"Python"},{"location":"dependencies/#packages","text":"ATOM is built on top of several existing Python libraries. The required packages are necessary for its correct functioning. Additionally, you can install some optional packages to use machine learning estimators not provided by sklearn.","title":"Packages"},{"location":"dependencies/#required","text":"numpy (>=1.17.2) scipy (>=1.4.1) pandas (>=1.0.3) tqdm (>=4.35.0) joblib (>=0.16.0) typeguard (>=2.7.1) tabulate (>=0.8.6) scikit-learn (>=0.23.1) scikit-optimize (>=0.7.4) pandas-profiling (>=2.3.0) category-encoders (>=2.1.0) imbalanced-learn (>=0.5.0) featuretools (>=0.17.0) gplearn (>=0.4.1) matplotlib (>=3.3.0) seaborn (>=0.9.0) shap (>=0.36.0)","title":"Required"},{"location":"dependencies/#optional","text":"xgboost (>=0.90) lightgbm (>=2.3.0) catboost (>=0.19.1)","title":"Optional"},{"location":"dependencies/#support","text":"ATOM recognizes the support from JetBrains by providing core project contributors with a set of developer tools free of charge.","title":"Support"},{"location":"getting_started/","text":"Installation Note Since atom was already taken, download the package under the name atom-ml ! 
Install ATOM's newest release easily via pip : $ pip install -U atom-ml or via conda : $ conda install -c conda-forge atom-ml Usage Call the ATOMClassifier or ATOMRegressor class and provide the data you want to use: from sklearn.datasets import load_breast_cancer from atom import ATOMClassifier X, y = load_breast_cancer(return_X_y=True) atom = ATOMClassifier(X, y, logger='auto', n_jobs=2, verbose=2) ATOM has multiple data cleaning methods to help you prepare the data for modelling: atom.impute(strat_num='knn', strat_cat='most_frequent', min_frac_rows=0.1) atom.encode(strategy='Target', max_onehot=8, frac_to_other=0.05) atom.feature_selection(strategy='PCA', n_features=12) Train and evaluate the models you want to compare: atom.run(models=['LR', 'LDA', 'XGB', 'lSVM'], metric='f1', n_calls=25, n_initial_points=10, bagging=4) Make plots to analyze the results: atom.plot_bagging(figsize=(9, 6), filename='bagging_results.png') atom.LDA.plot_confusion_matrix(normalize=True, filename='cm.png')","title":"Getting started"},{"location":"getting_started/#installation","text":"Note Since atom was already taken, download the package under the name atom-ml ! 
Install ATOM's newest release easily via pip : $ pip install -U atom-ml or via conda : $ conda install -c conda-forge atom-ml","title":"Installation"},{"location":"getting_started/#usage","text":"Call the ATOMClassifier or ATOMRegressor class and provide the data you want to use: from sklearn.datasets import load_breast_cancer from atom import ATOMClassifier X, y = load_breast_cancer(return_X_y=True) atom = ATOMClassifier(X, y, logger='auto', n_jobs=2, verbose=2) ATOM has multiple data cleaning methods to help you prepare the data for modelling: atom.impute(strat_num='knn', strat_cat='most_frequent', min_frac_rows=0.1) atom.encode(strategy='Target', max_onehot=8, frac_to_other=0.05) atom.feature_selection(strategy='PCA', n_features=12) Train and evaluate the models you want to compare: atom.run(models=['LR', 'LDA', 'XGB', 'lSVM'], metric='f1', n_calls=25, n_initial_points=10, bagging=4) Make plots to analyze the results: atom.plot_bagging(figsize=(9, 6), filename='bagging_results.png') atom.LDA.plot_confusion_matrix(normalize=True, filename='cm.png')","title":"Usage"},{"location":"license/","text":"MIT License Copyright (c) 2020 tvdboom Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the \"Software\"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.","title":"License"},{"location":"license/#mit-license","text":"Copyright (c) 2020 tvdboom Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the \"Software\"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.","title":"MIT License"},{"location":"user_guide/","text":"Introduction There is no magic formula in data science that can tell us which type of machine learning algorithm will perform best for a specific use-case. Different models are better suited for different types of data and different problems. At best, you can follow some rough guide on how to approach problems with regard to which model to try on your data, but these are often more confusing than helpful. Best practices tell us to start with a simple model (e.g. linear regression) and build up to more complicated models (e.g. 
logistic regression -> random forest -> multilayer perceptron) if you are not satisfied with the results. Unfortunately, different models require different data cleaning steps, different type/amount of features, tuning a new set of hyperparameters, etc. Refactoring the code for this purpose can be quite boring and time consuming. Because of this, many data scientists end up just using the model best known to them and fine-tuning this particular model without ever trying different ones. This can result in poor performance (because the model is just not the right one for the task) or in poor time management (because you could have achieved a similar performance with a simpler/faster model). ATOM is here to help us solve these issues. With just a few lines of code, you can perform basic data cleaning steps, select relevant features and compare the performance of multiple models on a given dataset. ATOM should be able to provide quick insights on which algorithms perform best for the task at hand and provide an indication of the feasibility of the ML solution. It is important to realize that ATOM is not here to replace all the work a data scientist has to do before getting his model into production. ATOM doesn't spit out production-ready models just by tuning some parameters in its API. After helping you to determine the right model, you will most probably need to fine-tune it using use-case specific features and data cleaning steps in order to achieve maximum performance. So, this sounds a bit like AutoML, how is ATOM different than auto-sklearn or TPOT ? Well, ATOM does AutoML in the sense that it helps you find the best model for a specific task, but contrary to the aforementioned packages, it does not actively search for the best model. It simply runs all of them and let you pick the one that you think suites you best. AutoML packages are often black boxes: if you provide data, it will magically return a working model. 
Although it works great, they often produce complicated pipelines with low explainability, hard to sell to the business. In this, ATOM excels. Every step of the pipeline is accounted for, and using the provided plotting methods, it\u2019s easy to demonstrate why a model is better/worse than the other. Nomenclature In this documentation we will consistently use terms to refer to certain concepts related to the ATOM package. ATOM : Refers to this package. task : Refers to one of the three supervised machine learning approaches that ATOM supports: binary classification multiclass classification regression category : Refers to one of the unique values in a column, i.e. a binary classifier has 2 categories in the target column. missing values : Refers to None , NaN and inf values. categorical columns : Refers to all columns with dtype.kind not in ifu . atom : Refers to an ATOMClassifier or ATOMRegressor instance (note that all examples use it as variable name for the instance). model : Refers to one of the model instances. estimator : Actual estimator corresponding to a model. Implemented by an external package. BO : Bayesian optimization algorithm used for hyperparameter optimization. training : Refers to an instance of one of the classes that train and evaluate the models. The classes are: ATOMClassifier ATOMRegressor TrainerClassifier TrainerRegressor SuccessiveHalvingClassifier SuccessiveHavingRegressor TrainSizingClassifier TrainSizingRegressor Note Note that atom instances are also training instances! First steps You can quickly install atom using pip or conda , see the installation guide . ATOM contains a variety of classes to perform data cleaning, feature engineering, model training and much more. The easiest way to use all these classes on the same dataset is through one of the main classes: ATOMClassifier for binary or multiclass classification tasks. ATOMRegressor for regression tasks. 
These two classes are convenient wrappers for all the possibilities this package provides. Like a Pipeline , they assemble several steps that can be cross-validated together while setting different parameters. There are some important differences with sklearn's API: atom is initialized with the data you want to manipulate. This data can be accessed at any moment through atom 's data attributes . The classes in ATOM's API are reached through atom 's methods. For example, calling the encode method, will initialize an Encoder instance, fit it on the training set and transform the whole dataset. The transformations are applied immediately after calling the method (there is no fit method). This approach gives the user a clearer overview and more control over every step in the pipeline. The pipeline does not have to end with an estimator. ATOM can be just for data cleaning or feature engineering purposes only. Let's get started with an example! First, initialize atom and provide it the data you want to use. atom = ATOMClassifier(X, y) Apply data cleaning methods through the class. For example, calling the impute method will handle all missing values in the dataset. atom.impute(strat_num='median', strat_cat='most_frequent', min_frac_rows=0.1) Select the best hyperparameters and fit a Random Forest and AdaBoost model. atom.run(['RF', 'AdaB'], metric='accuracy', n_calls=25, n_initial_points=10) Analyze the results: atom.feature_importances(show=10, filename='feature_importance_plot') atom.plot_prc(title='Precision-recall curve comparison plot') Data cleaning More often than not, you need to do some data cleaning before fitting your dataset to a model. Usually, this involves importing different libraries and writing many lines of code. Since ATOM is all about fast exploration and experimentation, it provides various data cleaning classes to apply the most common transformations fast and easy. 
Scaling the feature set Standardization of a dataset is a common requirement for many machine learning estimators: they might behave badly if the individual features do not more or less look like standard normally distributed data (e.g. Gaussian with 0 mean and unit variance). The Scaler class scales data to mean=0 and std=1. It can be accessed from atom through the scale method. Standard data cleaning There are many data cleaning steps that are useful to perform on any dataset before modelling. These are general rules that apply on every use-case and every task. The StandardCleaner class is a convenient tool to apply such steps. It is automatically called when initializing atom . Use the class' parameters to choose which transformations to perform. The available steps are: Remove columns with prohibited data types. Strip categorical features from white spaces. Remove categorical columns with maximal cardinality. Remove columns with minimum cardinality. Remove rows with missing values in the target column. Label-encode the target column. Imputing missing values For various reasons, many real world datasets contain missing values, often encoded as blanks, NaNs or other placeholders. Such datasets however are incompatible with ATOM's models which assume that all values in an array are numerical, and that all have and hold meaning. The Imputer class handles missing values in the dataset by either dropping or imputing the value. It can be accessed from atom through the impute method. Tip Use atom 's missing attribute for an overview of the missing values in the dataset. Encoding categorical features Many datasets will contain categorical features. Their variables are typically stored as text values which represent various traits. Some examples include color (\u201cRed\u201d, \u201cYellow\u201d, \u201cBlue\u201d), size (\u201cSmall\u201d, \u201cMedium\u201d, \u201cLarge\u201d) or geographic designations (city or country). 
Regardless of what the value is used for, the challenge is determining how to use this data in the analysis. ATOM's models don't support direct manipulation of this kind of data. Use the Encoder class to encode categorical features to numerical values. It can be accessed from atom through the encode method. Tip Use atom 's categorical attribute for a list of the categorical columns in the dataset. Handling outliers When modeling, it is important to clean the data sample to ensure that the observations best represent the problem. Sometimes a dataset can contain extreme values that are outside the range of what is expected and unlike the other data. These are called outliers. Often, machine learning modeling and model skill in general can be improved by understanding and even removing these outlier values. The Outliers class can drop or impute outliers in the dataset. It can be accessed from atom through the outliers method. Balancing the data One of the common issues found in datasets that are used for classification is imbalanced classes. Data imbalance usually reflects an unequal distribution of classes within a dataset. For example, in a credit card fraud detection dataset, most of the transactions are non-fraud and a very few cases are fraud. This leaves us with a very unbalanced ratio of fraud vs non-fraud cases. The Balancer class can oversample the minority category or undersample the majority category. It can be accessed from atom through the balance method. Feature engineering \"Applied machine learning\" is basically feature engineering. ~ Andrew Ng. Feature engineering is the process of creating new features from the existing ones, in order to capture relationships with the target column that the first set of features didn't had on their own. This process is very important to improve the performance of machine learning algorithms. 
Although feature engineering works best when the data scientist applies use-case specific transformations, there are ways to do this in an automated manner, without prior domain knowledge. One of the problems of creating new features without human expert intervention, is that many of the newly created features can be useless, i.e. they do not help the algorithm to make better predictions. Even worse, having useless features can drop your performance. To avoid this, we perform feature selection, a process in which we select the relevant features in the dataset. See here an example. Generating new features The FeatureGenerator class creates new non-linear features based on the original feature set. It can be accessed from atom through the feature_generation method. You can choose between two strategies: Deep Feature Synthesis and Genetic Feature Generation. Deep Feature Synthesis Deep feature synthesis (DFS) applies the selected operators on the features in the dataset. For example, if the operator is 'log', it will create the new feature LOG(old_feature) and if the operator is 'mul', it will create the new feature old_feature_1 x old_feature_2 . The operators can be chosen through the operators parameter. Available options are: add: Sum two features together. sub: Subtract two features from each other. mul: Multiply two features with each other. div: Divide two features with each other. sqrt: Take the square root of a feature. log: Take the logarithm of a feature. sin: Calculate the sine of a feature. cos: Calculate the cosine of a feature. tan: Calculate the tangent of a feature. ATOM's implementation of DFS uses the featuretools package. Tip DFS can create many new features and not all of them will be useful. Use FeatureSelector to reduce the number of features! Warning Using the div, log or sqrt operators can return new features with inf or NaN values. Check the warnings that may pop up or use atom 's missing property. 
Warning When using DFS with n_jobs>1 , make sure to protect your code with if __name__ == \"__main__\" . Featuretools uses dask , which uses python multiprocessing for parallelization. The spawn method on multiprocessing starts a new python process, which requires it to import the __main__ module before it can do its task. Genetic Feature Generation Genetic feature generation (GFG) uses genetic programming , a branch of evolutionary programming, to determine which features are successful and create new ones based on those. Where DFS' method can be seen as some kind of \"brute force\" for feature engineering, GFG tries to improve its features with every generation of the algorithm. GFG uses the same operators as DFS, but instead of only applying the transformations once, it evolves them further, creating complicated non-linear combinations of features with many transformations. The new features are given the name Feature N for the N-th feature. You can access the genetic feature's fitness and description (how they are calculated) through the genetic_features attribute. ATOM uses the SymbolicTransformer class from the gplearn package for the genetic algorithm. Read more about this implementation here . Warning GFG can be slow for very large populations! Selecting useful features The FeatureSelector class provides tooling to select the relevant features from a dataset. It can be accessed from atom through the feature_selection method. The following strategies are implemented: univariate, PCA, SFM, RFE and RFECV. Univariate Univariate feature selection works by selecting the best features based on univariate statistical F-test. The test is provided via the solver parameter. It takes any function taking two arrays (X, y), and returning arrays (scores, p-values). Read more in sklearn's documentation . Principal Components Analysis Applying PCA will reduce the dimensionality of the dataset by maximizing the variance of each dimension. 
The new features will be called Component 0, Component 1, etc... The dataset will be scaled before applying the transformation (if it wasn't already). Read more in sklearn's documentation . Selection from model SFM uses an estimator with feature_importances_ or coef_ attributes to select the best features in a dataset based on importance weights. The estimator is provided through the solver parameter and can be already fitted. ATOM allows you to use one of its pre-defined models , e.g. solver='RF' . If you didn't call the FeatureSelector through atom , don't forget to indicate the estimator's task adding _class or _reg after the name, e.g. RF_class to use a random forest classifier. Read more in sklearn's documentation . Recursive feature elimination Select features by recursively considering smaller and smaller sets of features. First, the estimator is trained on the initial set of features and the importance of each feature is obtained either through a coef_ attribute or through a feature_importances_ attribute. Then, the least important features are pruned from current set of features. That procedure is recursively repeated on the pruned set until the desired number of features to select is eventually reached. Note that, since RFE needs to fit the model again every iteration, this method can be fairly slow. RFECV applies the same algorithm as RFE but uses a cross-validated metric (under the scoring parameter, see RFECV ) to assess every step's performance. Also, where RFE returns the number of features selected by n_features , RFECV returns the number of features that achieved the optimal score on the specified metric. Note that this is not always equal to the amount specified by n_features . Read more in sklearn's documentation . Removing features with low variance Variance is the expectation of the squared deviation of a random variable from its mean. Features with low variance have many values repeated, which means the model will not learn much from them. 
FeatureSelector removes all features where the same value is repeated in at least max_frac_repeated fraction of the rows. The default option is to remove a feature if all values in it are the same. Read more in sklearn's documentation . Removing features with multi-collinearity Two features that are highly correlated are redundant, i.e. two will not contribute more to the model than only one of them. FeatureSelector will drop a feature that has a Pearson correlation coefficient larger than max_correlation with another feature. A correlation of 1 means the two columns are equal. A dataframe of the removed features and their correlation values can be accessed through the collinear attribute. Tip Use the plot_feature_importance method to examine how much a specific feature contributes to the final predictions. If the model doesn't have a feature_importances_ attribute, use plot_permutation_importance instead. Warning The RFE and RFECV strategies don't work when the solver is a CatBoost model due to incompatibility of the APIs. Training The training phase is where the models are fitted and evaluated. After this, the models are attached to the training instance and you can use the plotting and predicting methods. The pipeline applies the following steps iteratively for all models: The optimal hyperparameters are selected. The model is trained on the training set and evaluated on the test set. The bagging algorithm is applied. There are three approaches to run the training. Direct training: TrainerClassifier TrainerRegressor Training via successive halving : SuccessiveHalvingClassifier SuccessiveHavingRegressor Training via train sizing : TrainSizingClassifier TrainSizingRegressor The direct fashion repeats the aforementioned steps only once, while the other two approaches repeats them more than once. Every approach can be directly called from atom through the run , successive_halving and train_sizing methods respectively. 
A couple of things to take into account: If an exception is encountered while fitting an estimator, the pipeline will automatically skip the model and jump to the next model and save the exception in the errors attribute. Note that in that case there will be no model for that estimator. When showing the final results, a !! indicates the highest score and a ~ indicates that the model is possibly overfitting (training set has a score at least 20% higher than the test set). The winning model (the one with the highest mean_bagging or metric_test ) will be attached to the winner attribute. Models ATOM provides 27 models for classification and regression tasks that can be used to fit the data in the pipeline. After fitting, every model is attached to the training instance as an attribute. Models are called through the models parameter using their corresponding acronym's, e.g. atom.run(models='RF') to run a Random forest model. Metric ATOM uses sklearn's scorers for model selection and evaluation. A scorer consists of a metric function and some parameters that define the scorer's properties such as it's a score or loss function or if the function needs probability estimates or rounded predictions (see make_scorer ). ATOM lets you define the scorer for the pipeline in three ways: The metric parameter is one of sklearn's predefined scorers (as string). The metric parameter is a score (or loss) function with signature metric(y, y_pred, **kwargs). In this case, use the greater_is_better , needs_proba and needs_threshold parameters to specify the scorer's properties. The metric parameter is a scorer object. Note that all scorers follow the convention that higher return values are better than lower return values. Thus metrics which measure the distance between the model and the data (i.e. loss functions), like max_error or mean_squared_error , will return the negated value of the metric. 
Custom scorer acronyms Since some of sklearn's scorers have quite long names and ATOM is all about lazy fast experimentation, the package provides acronyms for some of the most commonly used ones. These acronyms are case insensitive can be used for the metric parameter instead of the scorer's full name, e.g. atom.run('LR', metric='BA') will use balanced_accuracy . The available acronyms are: 'AP' for 'average_precision' 'BA' for 'balanced_accuracy' 'AUC' for 'roc_auc' 'EV' for 'explained_variance' 'ME' for 'max_error' 'MAE' for 'neg_mean_absolute_error' 'MSE' for 'neg_mean_squared_error' 'RMSE' for 'neg_root_mean_squared_error' 'MSLE' for 'neg_mean_squared_log_error' 'MEDAE' for 'neg_median_absolute_error' 'POISSON' for 'neg_mean_poisson_deviance' 'GAMMA' for 'neg_mean_gamma_deviance' Multi-metric runs Sometimes it is useful to measure the performance of the models in more than one way. ATOM lets you run the pipeline with multiple metrics at the same time. To do so, provide the metric parameter with a list of desired metrics, e.g. atom.run('LDA', metric=['r2', 'mse']) . If you provide metric functions, don't forget to also provide lists to the greater_is_better , needs_proba and needs_threshold parameters, where the n-th value in the list corresponds to the n-th function. If you leave them as a single value, that value will apply to every provided metric. When fitting multi-metric runs, the resulting scores will return a list of metrics. For example, if you provided three metrics to the pipeline, atom.knn.metric_bo could return [0.8734, 0.6672, 0.9001]. It is also important to note that only the first metric of a multi-metric run is used to evaluate every step of the bayesian optimization and to select the winning model. Tip Some plots let you choose which of the metrics to show using the metric parameter. Hyperparameter optimization In order to achieve maximum performance, we need to tune an estimator's hyperparameters before training it. 
ATOM provides hyperparameter tuning using a bayesian optimization (BO) approach implemented by skopt . The BO is optimized on the first metric provided with the metric parameter. Each step is either computed by cross-validation on the complete training set or by randomly splitting the training set every iteration into a (sub) training set and a validation set. This process can create some data leakage but ensures maximal use of the provided data. The test set, however, does not contain any leakage and will be used to determine the final score of every model. Note that, if the dataset is relatively small, the BO's best score can consistently be lower than the final score on the test set (despite the leakage) due to the considerable fewer instances on which it is trained. There are many possibilities to tune the BO to your liking. Use n_calls and n_initial_points to determine the number of iterations that are performed randomly at the start (exploration) and the number of iterations spent optimizing (exploitation). If n_calls is equal to n_initial_points , every iteration of the BO will select its hyperparameters randomly. This means the algorithm is technically performing a random search . Note The n_calls parameter includes the iterations in n_initial_points . Calling atom.run('LR', n_calls=20, n_intial_points=10) will run 20 iterations of which the first 10 are random. Other settings can be changed through the bo_params parameter, a dictionary where every key-value combination can be used to further customize the BO. By default, the hyperparameters and corresponding dimensions per model are predefined by ATOM. Use the dimensions key to use custom ones. Use an array for only one model and a dictionary with the model names as keys if there are multiple models in the pipeline. Note that the provided search space dimensions must be compliant with skopt's API. 
atom.run('LR', n_calls=10, bo_params={'dimensions': [Integer(100, 1000, name='max_iter')]}) The majority of skopt's callbacks to stop the optimizer early can be accessed through bo_params . You can include other callbacks using the callbacks key. atom.run('LR', n_calls=10, bo_params={'max_time': 1000, 'callbacks': custom_callback()}) You can also include other optimizer's parameters as key-value pairs. atom.run('LR', n_calls=10, bo_params={'acq_func': 'EI'}) Bagging After fitting the estimator, you can asses the robustness of the model using bootstrap aggregating (bagging). This technique creates several new data sets selecting random samples from the training set (with replacement) and evaluates them on the test set. This way we get a distribution of the performance of the model. The number of sets can be chosen through the bagging parameter. Tip Use the plot_bagging method to plot the bagging scores in a convenient boxplot. Early stopping XGBoost , LighGBM and CatBoost allow in-training evaluation. This means that the estimator is evaluated after every round of the training. Use the early_stopping key in bo_params to stop the training early if it didn't improve in the last early_stopping rounds. This can save the pipeline much time that would otherwise be wasted on an estimator that is unlikely to improve further. Note that this technique will be applied both during the BO and at the final fit on the complete training set. After fitting, the model will get the evals attribute, a dictionary of the train and test performances per round (also if early stopping wasn't applied). Tip Use the plot_evals method to plot the in-training evaluation on the train and test set. Successive halving Successive halving is a bandit-based algorithm that fits N models to 1/N of the data. The best half are selected to go to the next iteration where the process is repeated. This continues until only one model remains, which is fitted on the complete dataset. 
Beware that a model's performance can depend greatly on the amount of data on which it is trained. For this reason, we recommend only to use this technique with similar models, e.g. only using tree-based models. Use successive halving through the SuccessiveHalvingClassifier / SuccessiveHalvingRegressor classes or from atom via the successive_halving method. After running the pipeline, the results attribute will be multi-index, where the first index indicates the iteration and the second the model's acronym. Tip Use the plot_successive_halving method to see every model's performance per iteration of the successive halving. Train sizing When training models, there is usually a trade-off between model performance and computation time that is regulated by the number of samples in the training set. Train sizing can be used to create insights in this trade-off and help determine the optimal size of the training set, fitting the models multiple times, ever increasing the number of samples in the training set. Use train sizing through the TrainSizingClassifier / TrainSizingRegressor classes or from atom via the train_sizing method. The number of iterations and the number of samples per training can be specified with the train_sizes parameter. After running the pipeline, the results attribute will be multi-index, where the first index indicates the iteration and the second the model's acronym. Tip Use the plot_learning_curve method to see the model's performance per size of the training set. Predicting After running a successful pipeline, it is possible you would like to apply all used transformations onto new data, or make predictions using one of the trained models. Just like a sklearn estimator, you can call the prediction methods from a fitted training instance, e.g. atom.predict(X) . Calling the method without specifying a model will use the winning model in the pipeline (under attribute winner ). To use a different model, simply call the method from a model , e.g. 
atom.KNN.predict(X) . If called from atom , the prediction methods will transform the provided data through all the transformers in the pipeline before making the predictions. By default, this excludes outlier handling and balancing the dataset since these steps should only be applied on the training set. Use the method's kwargs to select which transformations to use in every call. The available prediction methods are a selection of the most common methods for estimators in sklearn's API: transform Transform new data through all the pre-processing steps in the pipeline. predict Transform the data and make predictions on new data. predict_proba Transform the data and make probabilistic predictions on new data. predict_log_proba Transform the data and make logarithmic probability predictions on new data. decision_function Transform the data and evaluate the decision function on new data. score Transform the data and return the model's score on new data. Except for transform, the prediction methods can be calculated on the train and test set. You can access them through the model 's prediction attributes , e.g. atom.mnb.predict_train or atom.mnb.predict_test . Keep in mind that the results are not calculated until the attribute is called for the first time. This mechanism avoids having to calculate attributes that are never used, saving time and memory. Note Many of the plots use the prediction attributes. This can considerably increase the size of the class for large datasets. Use the reset_prediction_attributes method if you need to free some memory! Plots After fitting the models to the data, it's time to analyze the results. ATOM provides many plotting methods to compare the model performances. Descriptions and examples can be found in the API section. ATOM uses the packages matplotlib , seaborn and shap for plotting. The plot methods can be called from a training directly, e.g. atom.plot_roc() , or from one of the models , e.g. atom.LGB.plot_roc() . 
If called from training , it will make the plot for all models in the pipeline. This can be useful to compare the results of multiple models. If called from a model , it will make the plot for only that model. Use this option if you want information just for that specific model or to make a plot less crowded. Parameters Apart from the plot-specific parameters they may have, all plots have four parameters in common: The title parameter allows you to add a custom title to the plot. The figsize parameter adjust the plot's size. The filename parameter is used to save the plot. The display parameter determines whether the plot is rendered. Aesthetics The plot aesthetics can be customized using the plot attributes, e.g. atom.style = 'white' . These attributes can be called from any instance with plotting methods. Note that the plot attributes are attached to the class and not the instance. This means that changing the attribute will also change it for all other instances in the module. ATOM's default values are: style: 'darkgrid' palette: 'GnBu_r_d' title_fontsize: 20 label_fontsize: 16 tick_fontsize: 12 SHAP The SHAP (SHapley Additive exPlanations) python package uses a game theoretic approach to explain the output of any machine learning model. It connects optimal credit allocation with local explanations using the classic Shapley values from game theory and their related extensions. ATOM implements methods to plot 4 of shap's plotting functions directly from its API. The explainer will be chosen automatically based on the model's type. For kernelExplainer, the data used to estimate the expected values is the complete training set when <100 rows, else its summarized with a set of 10 weighted K-means, each weighted by the number of points they represent. The four plots are: force_plot , dependence_plot , summary_plot and decision_plot . Since the plots are not made by ATOM, we can't draw multiple models in the same figure. 
Selecting more than one model will raise an exception. To avoid this, call the plot from a model , e.g. atom.xgb.force_plot() . Note You can recognize the SHAP plots by the fact that they end (instead of start) with plot. Available plots A list of available plots can be find hereunder. Note that not all plots can be called from every class and that their availability can depend on the task at hand. plot_correlation Plot the data's correlation matrix. plot_pipeline Plot a diagram of every estimator in atom's pipeline. plot_pca Plot the explained variance ratio vs the number of components. plot_components Plot the explained variance ratio per components. plot_rfecv Plot the RFECV results. plot_successive_halving Plot of the models' scores per iteration of the successive halving. plot_learning_curve Plot the model's learning curve. plot_bagging Plot a boxplot of the bagging's results. plot_bo Plot the bayesian optimization scoring. plot_evals Plot evaluation curves for the train and test set. plot_roc Plot the Receiver Operating Characteristics curve. plot_prc Plot the precision-recall curve. plot_permutation_importance Plot the feature permutation importance of models. plot_feature_importance Plot a tree-based model's feature importance. plot_partial_dependence Plot the partial dependence of features. plot_errors Plot a model's prediction errors. plot_residuals Plot a model's residuals. plot_confusion_matrix Plot a model's confusion matrix. plot_threshold Plot a metric's performance against threshold values. plot_probabilities Plot the probability distribution of the categories in the target column. plot_calibration Plot the calibration curve for a binary classifier. plot_gains Plot the cumulative gains curve. plot_lift Plot the lift curve. force_plot Plot SHAP's force plot. dependence_plot Plot SHAP's dependence plot. summary_plot Plot SHAP's summary plot. 
decision_plot Plot SHAP's decision plot.","title":"User guide"},{"location":"user_guide/#introduction","text":"There is no magic formula in data science that can tell us which type of machine learning algorithm will perform best for a specific use-case. Different models are better suited for different types of data and different problems. At best, you can follow some rough guide on how to approach problems with regard to which model to try on your data, but these are often more confusing than helpful. Best practices tell us to start with a simple model (e.g. linear regression) and build up to more complicated models (e.g. logistic regression -> random forest -> multilayer perceptron) if you are not satisfied with the results. Unfortunately, different models require different data cleaning steps, different type/amount of features, tuning a new set of hyperparameters, etc. Refactoring the code for this purpose can be quite boring and time consuming. Because of this, many data scientists end up just using the model best known to them and fine-tuning this particular model without ever trying different ones. This can result in poor performance (because the model is just not the right one for the task) or in poor time management (because you could have achieved a similar performance with a simpler/faster model). ATOM is here to help us solve these issues. With just a few lines of code, you can perform basic data cleaning steps, select relevant features and compare the performance of multiple models on a given dataset. ATOM should be able to provide quick insights on which algorithms perform best for the task at hand and provide an indication of the feasibility of the ML solution. It is important to realize that ATOM is not here to replace all the work a data scientist has to do before getting his model into production. ATOM doesn't spit out production-ready models just by tuning some parameters in its API. 
After helping you to determine the right model, you will most probably need to fine-tune it using use-case specific features and data cleaning steps in order to achieve maximum performance. So, this sounds a bit like AutoML, how is ATOM different than auto-sklearn or TPOT ? Well, ATOM does AutoML in the sense that it helps you find the best model for a specific task, but contrary to the aforementioned packages, it does not actively search for the best model. It simply runs all of them and let you pick the one that you think suites you best. AutoML packages are often black boxes: if you provide data, it will magically return a working model. Although it works great, they often produce complicated pipelines with low explainability, hard to sell to the business. In this, ATOM excels. Every step of the pipeline is accounted for, and using the provided plotting methods, it\u2019s easy to demonstrate why a model is better/worse than the other.","title":"Introduction"},{"location":"user_guide/#nomenclature","text":"In this documentation we will consistently use terms to refer to certain concepts related to the ATOM package. ATOM : Refers to this package. task : Refers to one of the three supervised machine learning approaches that ATOM supports: binary classification multiclass classification regression category : Refers to one of the unique values in a column, i.e. a binary classifier has 2 categories in the target column. missing values : Refers to None , NaN and inf values. categorical columns : Refers to all columns with dtype.kind not in ifu . atom : Refers to an ATOMClassifier or ATOMRegressor instance (note that all examples use it as variable name for the instance). model : Refers to one of the model instances. estimator : Actual estimator corresponding to a model. Implemented by an external package. BO : Bayesian optimization algorithm used for hyperparameter optimization. training : Refers to an instance of one of the classes that train and evaluate the models. 
The classes are: ATOMClassifier ATOMRegressor TrainerClassifier TrainerRegressor SuccessiveHalvingClassifier SuccessiveHavingRegressor TrainSizingClassifier TrainSizingRegressor Note Note that atom instances are also training instances!","title":"Nomenclature"},{"location":"user_guide/#first-steps","text":"You can quickly install atom using pip or conda , see the installation guide . ATOM contains a variety of classes to perform data cleaning, feature engineering, model training and much more. The easiest way to use all these classes on the same dataset is through one of the main classes: ATOMClassifier for binary or multiclass classification tasks. ATOMRegressor for regression tasks. These two classes are convenient wrappers for all the possibilities this package provides. Like a Pipeline , they assemble several steps that can be cross-validated together while setting different parameters. There are some important differences with sklearn's API: atom is initialized with the data you want to manipulate. This data can be accessed at any moment through atom 's data attributes . The classes in ATOM's API are reached through atom 's methods. For example, calling the encode method, will initialize an Encoder instance, fit it on the training set and transform the whole dataset. The transformations are applied immediately after calling the method (there is no fit method). This approach gives the user a clearer overview and more control over every step in the pipeline. The pipeline does not have to end with an estimator. ATOM can be just for data cleaning or feature engineering purposes only. Let's get started with an example! First, initialize atom and provide it the data you want to use. atom = ATOMClassifier(X, y) Apply data cleaning methods through the class. For example, calling the impute method will handle all missing values in the dataset. 
atom.impute(strat_num='median', strat_cat='most_frequent', min_frac_rows=0.1) Select the best hyperparameters and fit a Random Forest and AdaBoost model. atom.run(['RF', 'AdaB'], metric='accuracy', n_calls=25, n_initial_points=10) Analyze the results: atom.feature_importances(show=10, filename='feature_importance_plot') atom.plot_prc(title='Precision-recall curve comparison plot')","title":"First steps"},{"location":"user_guide/#data-cleaning","text":"More often than not, you need to do some data cleaning before fitting your dataset to a model. Usually, this involves importing different libraries and writing many lines of code. Since ATOM is all about fast exploration and experimentation, it provides various data cleaning classes to apply the most common transformations fast and easy.","title":"Data cleaning"},{"location":"user_guide/#scaling-the-feature-set","text":"Standardization of a dataset is a common requirement for many machine learning estimators: they might behave badly if the individual features do not more or less look like standard normally distributed data (e.g. Gaussian with 0 mean and unit variance). The Scaler class scales data to mean=0 and std=1. It can be accessed from atom through the scale method.","title":"Scaling the feature set"},{"location":"user_guide/#standard-data-cleaning","text":"There are many data cleaning steps that are useful to perform on any dataset before modelling. These are general rules that apply on every use-case and every task. The StandardCleaner class is a convenient tool to apply such steps. It is automatically called when initializing atom . Use the class' parameters to choose which transformations to perform. The available steps are: Remove columns with prohibited data types. Strip categorical features from white spaces. Remove categorical columns with maximal cardinality. Remove columns with minimum cardinality. Remove rows with missing values in the target column. 
Label-encode the target column.","title":"Standard data cleaning"},{"location":"user_guide/#imputing-missing-values","text":"For various reasons, many real world datasets contain missing values, often encoded as blanks, NaNs or other placeholders. Such datasets however are incompatible with ATOM's models which assume that all values in an array are numerical, and that all have and hold meaning. The Imputer class handles missing values in the dataset by either dropping or imputing the value. It can be accessed from atom through the impute method. Tip Use atom 's missing attribute for an overview of the missing values in the dataset.","title":"Imputing missing values"},{"location":"user_guide/#encoding-categorical-features","text":"Many datasets will contain categorical features. Their variables are typically stored as text values which represent various traits. Some examples include color (\u201cRed\u201d, \u201cYellow\u201d, \u201cBlue\u201d), size (\u201cSmall\u201d, \u201cMedium\u201d, \u201cLarge\u201d) or geographic designations (city or country). Regardless of what the value is used for, the challenge is determining how to use this data in the analysis. ATOM's models don't support direct manipulation of this kind of data. Use the Encoder class to encode categorical features to numerical values. It can be accessed from atom through the encode method. Tip Use atom 's categorical attribute for a list of the categorical columns in the dataset.","title":"Encoding categorical features"},{"location":"user_guide/#handling-outliers","text":"When modeling, it is important to clean the data sample to ensure that the observations best represent the problem. Sometimes a dataset can contain extreme values that are outside the range of what is expected and unlike the other data. These are called outliers. Often, machine learning modeling and model skill in general can be improved by understanding and even removing these outlier values. 
The Outliers class can drop or impute outliers in the dataset. It can be accessed from atom through the outliers method.","title":"Handling outliers"},{"location":"user_guide/#balancing-the-data","text":"One of the common issues found in datasets that are used for classification is imbalanced classes. Data imbalance usually reflects an unequal distribution of classes within a dataset. For example, in a credit card fraud detection dataset, most of the transactions are non-fraud and a very few cases are fraud. This leaves us with a very unbalanced ratio of fraud vs non-fraud cases. The Balancer class can oversample the minority category or undersample the majority category. It can be accessed from atom through the balance method.","title":"Balancing the data"},{"location":"user_guide/#feature-engineering","text":"\"Applied machine learning\" is basically feature engineering. ~ Andrew Ng. Feature engineering is the process of creating new features from the existing ones, in order to capture relationships with the target column that the first set of features didn't had on their own. This process is very important to improve the performance of machine learning algorithms. Although feature engineering works best when the data scientist applies use-case specific transformations, there are ways to do this in an automated manner, without prior domain knowledge. One of the problems of creating new features without human expert intervention, is that many of the newly created features can be useless, i.e. they do not help the algorithm to make better predictions. Even worse, having useless features can drop your performance. To avoid this, we perform feature selection, a process in which we select the relevant features in the dataset. See here an example.","title":"Feature engineering"},{"location":"user_guide/#generating-new-features","text":"The FeatureGenerator class creates new non-linear features based on the original feature set. 
It can be accessed from atom through the feature_generation method. You can choose between two strategies: Deep Feature Synthesis and Genetic Feature Generation. Deep Feature Synthesis Deep feature synthesis (DFS) applies the selected operators on the features in the dataset. For example, if the operator is 'log', it will create the new feature LOG(old_feature) and if the operator is 'mul', it will create the new feature old_feature_1 x old_feature_2 . The operators can be chosen through the operators parameter. Available options are: add: Sum two features together. sub: Subtract two features from each other. mul: Multiply two features with each other. div: Divide two features with each other. srqt: Take the square root of a feature. log: Take the logarithm of a feature. sin: Calculate the sine of a feature. cos: Calculate the cosine of a feature. tan: Calculate the tangent of a feature. ATOM's implementation of DFS uses the featuretools package. Tip DFS can create many new features and not all of them will be useful. Use FeatureSelector to reduce the number of features! Warning Using the div, log or sqrt operators can return new features with inf or NaN values. Check the warnings that may pop up or use atom 's missing property. Warning When using DFS with n_jobs>1 , make sure to protect your code with if __name__ == \"__main__\" . Featuretools uses dask , which uses python multiprocessing for parallelization. The spawn method on multiprocessing starts a new python process, which requires it to import the __main__ module before it can do its task. Genetic Feature Generation Genetic feature generation (GFG) uses genetic programming , a branch of evolutionary programming, to determine which features are successful and create new ones based on those. Where DFS' method can be seen as some kind of \"brute force\" for feature engineering, GFG tries to improve its features with every generation of the algorithm. 
GFG uses the same operators as DFS, but instead of only applying the transformations once, it evolves them further, creating complicated non-linear combinations of features with many transformations. The new features are given the name Feature N for the N-th feature. You can access the genetic feature's fitness and description (how they are calculated) through the genetic_features attribute. ATOM uses the SymbolicTransformer class from the gplearn package for the genetic algorithm. Read more about this implementation here . Warning GFG can be slow for very large populations!","title":"Generating new features"},{"location":"user_guide/#selecting-useful-features","text":"The FeatureSelector class provides tooling to select the relevant features from a dataset. It can be accessed from atom through the feature_selection method. The following strategies are implemented: univariate, PCA, SFM, RFE and RFECV. Univariate Univariate feature selection works by selecting the best features based on univariate statistical F-test. The test is provided via the solver parameter. It takes any function taking two arrays (X, y), and returning arrays (scores, p-values). Read more in sklearn's documentation . Principal Components Analysis Applying PCA will reduce the dimensionality of the dataset by maximizing the variance of each dimension. The new features will be called Component 0, Component 1, etc... The dataset will be scaled before applying the transformation (if it wasn't already). Read more in sklearn's documentation . Selection from model SFM uses an estimator with feature_importances_ or coef_ attributes to select the best features in a dataset based on importance weights. The estimator is provided through the solver parameter and can be already fitted. ATOM allows you to use one its pre-defined models , e.g. solver='RF' . If you didn't call the FeatureSeletor through atom , don't forget to indicate the estimator's task adding _class or _reg after the name, e.g. 
RF_class to use a random forest classifier. Read more in sklearn's documentation . Recursive feature elimination Select features by recursively considering smaller and smaller sets of features. First, the estimator is trained on the initial set of features and the importance of each feature is obtained either through a coef_ attribute or through a feature_importances_ attribute. Then, the least important features are pruned from current set of features. That procedure is recursively repeated on the pruned set until the desired number of features to select is eventually reached. Note that, since RFE needs to fit the model again every iteration, this method can be fairly slow. RFECV applies the same algorithm as RFE but uses a cross-validated metric (under the scoring parameter, see RFECV ) to assess every step's performance. Also, where RFE returns the number of features selected by n_features , RFECV returns the number of features that achieved the optimal score on the specified metric. Note that this is not always equal to the amount specified by n_features . Read more in sklearn's documentation . Removing features with low variance Variance is the expectation of the squared deviation of a random variable from its mean. Features with low variance have many values repeated, which means the model will not learn much from them. FeatureSelector removes all features where the same value is repeated in at least max_frac_repeated fraction of the rows. The default option is to remove a feature if all values in it are the same. Read more in sklearn's documentation . Removing features with multi-collinearity Two features that are highly correlated are redundant, i.e. two will not contribute more to the model than only one of them. FeatureSelector will drop a feature that has a Pearson correlation coefficient larger than max_correlation with another feature. A correlation of 1 means the two columns are equal. 
A dataframe of the removed features and their correlation values can be accessed through the collinear attribute. Tip Use the plot_feature_importance method to examine how much a specific feature contributes to the final predictions. If the model doesn't have a feature_importances_ attribute, use plot_permutation_importance instead. Warning The RFE and RFECV strategies don't work when the solver is a CatBoost model due to incompatibility of the APIs.","title":"Selecting useful features"},{"location":"user_guide/#training","text":"The training phase is where the models are fitted and evaluated. After this, the models are attached to the training instance and you can use the plotting and predicting methods. The pipeline applies the following steps iteratively for all models: The optimal hyperparameters are selected. The model is trained on the training set and evaluated on the test set. The bagging algorithm is applied. There are three approaches to run the training. Direct training: TrainerClassifier TrainerRegressor Training via successive halving : SuccessiveHalvingClassifier SuccessiveHavingRegressor Training via train sizing : TrainSizingClassifier TrainSizingRegressor The direct fashion repeats the aforementioned steps only once, while the other two approaches repeats them more than once. Every approach can be directly called from atom through the run , successive_halving and train_sizing methods respectively. A couple of things to take into account: If an exception is encountered while fitting an estimator, the pipeline will automatically skip the model and jump to the next model and save the exception in the errors attribute. Note that in that case there will be no model for that estimator. When showing the final results, a !! indicates the highest score and a ~ indicates that the model is possibly overfitting (training set has a score at least 20% higher than the test set). 
The winning model (the one with the highest mean_bagging or metric_test ) will be attached to the winner attribute.","title":"Training"},{"location":"user_guide/#models","text":"ATOM provides 27 models for classification and regression tasks that can be used to fit the data in the pipeline. After fitting, every model is attached to the training instance as an attribute. Models are called through the models parameter using their corresponding acronym's, e.g. atom.run(models='RF') to run a Random forest model.","title":"Models"},{"location":"user_guide/#metric","text":"ATOM uses sklearn's scorers for model selection and evaluation. A scorer consists of a metric function and some parameters that define the scorer's properties such as it's a score or loss function or if the function needs probability estimates or rounded predictions (see make_scorer ). ATOM lets you define the scorer for the pipeline in three ways: The metric parameter is one of sklearn's predefined scorers (as string). The metric parameter is a score (or loss) function with signature metric(y, y_pred, **kwargs). In this case, use the greater_is_better , needs_proba and needs_threshold parameters to specify the scorer's properties. The metric parameter is a scorer object. Note that all scorers follow the convention that higher return values are better than lower return values. Thus metrics which measure the distance between the model and the data (i.e. loss functions), like max_error or mean_squared_error , will return the negated value of the metric. Custom scorer acronyms Since some of sklearn's scorers have quite long names and ATOM is all about lazy fast experimentation, the package provides acronyms for some of the most commonly used ones. These acronyms are case insensitive can be used for the metric parameter instead of the scorer's full name, e.g. atom.run('LR', metric='BA') will use balanced_accuracy . 
The available acronyms are: 'AP' for 'average_precision' 'BA' for 'balanced_accuracy' 'AUC' for 'roc_auc' 'EV' for 'explained_variance' 'ME' for 'max_error' 'MAE' for 'neg_mean_absolute_error' 'MSE' for 'neg_mean_squared_error' 'RMSE' for 'neg_root_mean_squared_error' 'MSLE' for 'neg_mean_squared_log_error' 'MEDAE' for 'neg_median_absolute_error' 'POISSON' for 'neg_mean_poisson_deviance' 'GAMMA' for 'neg_mean_gamma_deviance' Multi-metric runs Sometimes it is useful to measure the performance of the models in more than one way. ATOM lets you run the pipeline with multiple metrics at the same time. To do so, provide the metric parameter with a list of desired metrics, e.g. atom.run('LDA', metric=['r2', 'mse']) . If you provide metric functions, don't forget to also provide lists to the greater_is_better , needs_proba and needs_threshold parameters, where the n-th value in the list corresponds to the n-th function. If you leave them as a single value, that value will apply to every provided metric. When fitting multi-metric runs, the resulting scores will return a list of metrics. For example, if you provided three metrics to the pipeline, atom.knn.metric_bo could return [0.8734, 0.6672, 0.9001]. It is also important to note that only the first metric of a multi-metric run is used to evaluate every step of the bayesian optimization and to select the winning model. Tip Some plots let you choose which of the metrics to show using the metric parameter.","title":"Metric"},{"location":"user_guide/#hyperparameter-optimization","text":"In order to achieve maximum performance, we need to tune an estimator's hyperparameters before training it. ATOM provides hyperparameter tuning using a bayesian optimization (BO) approach implemented by skopt . The BO is optimized on the first metric provided with the metric parameter. 
Each step is either computed by cross-validation on the complete training set or by randomly splitting the training set every iteration into a (sub) training set and a validation set. This process can create some data leakage but ensures maximal use of the provided data. The test set, however, does not contain any leakage and will be used to determine the final score of every model. Note that, if the dataset is relatively small, the BO's best score can consistently be lower than the final score on the test set (despite the leakage) due to the considerable fewer instances on which it is trained. There are many possibilities to tune the BO to your liking. Use n_calls and n_initial_points to determine the number of iterations that are performed randomly at the start (exploration) and the number of iterations spent optimizing (exploitation). If n_calls is equal to n_initial_points , every iteration of the BO will select its hyperparameters randomly. This means the algorithm is technically performing a random search . Note The n_calls parameter includes the iterations in n_initial_points . Calling atom.run('LR', n_calls=20, n_intial_points=10) will run 20 iterations of which the first 10 are random. Other settings can be changed through the bo_params parameter, a dictionary where every key-value combination can be used to further customize the BO. By default, the hyperparameters and corresponding dimensions per model are predefined by ATOM. Use the dimensions key to use custom ones. Use an array for only one model and a dictionary with the model names as keys if there are multiple models in the pipeline. Note that the provided search space dimensions must be compliant with skopt's API. atom.run('LR', n_calls=10, bo_params={'dimensions': [Integer(100, 1000, name='max_iter')]}) The majority of skopt's callbacks to stop the optimizer early can be accessed through bo_params . You can include other callbacks using the callbacks key. 
atom.run('LR', n_calls=10, bo_params={'max_time': 1000, 'callbacks': custom_callback()}) You can also include other optimizer's parameters as key-value pairs. atom.run('LR', n_calls=10, bo_params={'acq_func': 'EI'})","title":"Hyperparameter optimization"},{"location":"user_guide/#bagging","text":"After fitting the estimator, you can asses the robustness of the model using bootstrap aggregating (bagging). This technique creates several new data sets selecting random samples from the training set (with replacement) and evaluates them on the test set. This way we get a distribution of the performance of the model. The number of sets can be chosen through the bagging parameter. Tip Use the plot_bagging method to plot the bagging scores in a convenient boxplot.","title":"Bagging"},{"location":"user_guide/#early-stopping","text":"XGBoost , LighGBM and CatBoost allow in-training evaluation. This means that the estimator is evaluated after every round of the training. Use the early_stopping key in bo_params to stop the training early if it didn't improve in the last early_stopping rounds. This can save the pipeline much time that would otherwise be wasted on an estimator that is unlikely to improve further. Note that this technique will be applied both during the BO and at the final fit on the complete training set. After fitting, the model will get the evals attribute, a dictionary of the train and test performances per round (also if early stopping wasn't applied). Tip Use the plot_evals method to plot the in-training evaluation on the train and test set.","title":"Early stopping"},{"location":"user_guide/#successive-halving","text":"Successive halving is a bandit-based algorithm that fits N models to 1/N of the data. The best half are selected to go to the next iteration where the process is repeated. This continues until only one model remains, which is fitted on the complete dataset. 
Beware that a model's performance can depend greatly on the amount of data on which it is trained. For this reason, we recommend only to use this technique with similar models, e.g. only using tree-based models. Use successive halving through the SuccessiveHalvingClassifier / SuccessiveHalvingRegressor classes or from atom via the successive_halving method. After running the pipeline, the results attribute will be multi-index, where the first index indicates the iteration and the second the model's acronym. Tip Use the plot_successive_halving method to see every model's performance per iteration of the successive halving.","title":"Successive halving"},{"location":"user_guide/#train-sizing","text":"When training models, there is usually a trade-off between model performance and computation time that is regulated by the number of samples in the training set. Train sizing can be used to create insights in this trade-off and help determine the optimal size of the training set, fitting the models multiple times, ever increasing the number of samples in the training set. Use train sizing through the TrainSizingClassifier / TrainSizingRegressor classes or from atom via the train_sizing method. The number of iterations and the number of samples per training can be specified with the train_sizes parameter. After running the pipeline, the results attribute will be multi-index, where the first index indicates the iteration and the second the model's acronym. Tip Use the plot_learning_curve method to see the model's performance per size of the training set.","title":"Train sizing"},{"location":"user_guide/#predicting","text":"After running a successful pipeline, it is possible you would like to apply all used transformations onto new data, or make predictions using one of the trained models. Just like a sklearn estimator, you can call the prediction methods from a fitted training instance, e.g. atom.predict(X) . 
Calling the method without specifying a model will use the winning model in the pipeline (under attribute winner ). To use a different model, simply call the method from a model , e.g. atom.KNN.predict(X) . If called from atom , the prediction methods will transform the provided data through all the transformers in the pipeline before making the predictions. By default, this excludes outlier handling and balancing the dataset since these steps should only be applied on the training set. Use the method's kwargs to select which transformations to use in every call. The available prediction methods are a selection of the most common methods for estimators in sklearn's API: transform Transform new data through all the pre-processing steps in the pipeline. predict Transform the data and make predictions on new data. predict_proba Transform the data and make probabilistic predictions on new data. predict_log_proba Transform the data and make logarithmic probability predictions on new data. decision_function Transform the data and evaluate the decision function on new data. score Transform the data and return the model's score on new data. Except for transform, the prediction methods can be calculated on the train and test set. You can access them through the model 's prediction attributes , e.g. atom.mnb.predict_train or atom.mnb.predict_test . Keep in mind that the results are not calculated until the attribute is called for the first time. This mechanism avoids having to calculate attributes that are never used, saving time and memory. Note Many of the plots use the prediction attributes. This can considerably increase the size of the class for large datasets. Use the reset_prediction_attributes method if you need to free some memory!","title":"Predicting"},{"location":"user_guide/#plots","text":"After fitting the models to the data, it's time to analyze the results. ATOM provides many plotting methods to compare the model performances. 
Descriptions and examples can be found in the API section. ATOM uses the packages matplotlib , seaborn and shap for plotting. The plot methods can be called from a training directly, e.g. atom.plot_roc() , or from one of the models , e.g. atom.LGB.plot_roc() . If called from training , it will make the plot for all models in the pipeline. This can be useful to compare the results of multiple models. If called from a model , it will make the plot for only that model. Use this option if you want information just for that specific model or to make a plot less crowded.","title":"Plots"},{"location":"user_guide/#parameters","text":"Apart from the plot-specific parameters they may have, all plots have four parameters in common: The title parameter allows you to add a custom title to the plot. The figsize parameter adjust the plot's size. The filename parameter is used to save the plot. The display parameter determines whether the plot is rendered.","title":"Parameters"},{"location":"user_guide/#aesthetics","text":"The plot aesthetics can be customized using the plot attributes, e.g. atom.style = 'white' . These attributes can be called from any instance with plotting methods. Note that the plot attributes are attached to the class and not the instance. This means that changing the attribute will also change it for all other instances in the module. ATOM's default values are: style: 'darkgrid' palette: 'GnBu_r_d' title_fontsize: 20 label_fontsize: 16 tick_fontsize: 12","title":"Aesthetics"},{"location":"user_guide/#shap","text":"The SHAP (SHapley Additive exPlanations) python package uses a game theoretic approach to explain the output of any machine learning model. It connects optimal credit allocation with local explanations using the classic Shapley values from game theory and their related extensions. ATOM implements methods to plot 4 of shap's plotting functions directly from its API. The explainer will be chosen automatically based on the model's type. 
For kernelExplainer, the data used to estimate the expected values is the complete training set when <100 rows, else its summarized with a set of 10 weighted K-means, each weighted by the number of points they represent. The four plots are: force_plot , dependence_plot , summary_plot and decision_plot . Since the plots are not made by ATOM, we can't draw multiple models in the same figure. Selecting more than one model will raise an exception. To avoid this, call the plot from a model , e.g. atom.xgb.force_plot() . Note You can recognize the SHAP plots by the fact that they end (instead of start) with plot.","title":"SHAP"},{"location":"user_guide/#available-plots","text":"A list of available plots can be find hereunder. Note that not all plots can be called from every class and that their availability can depend on the task at hand. plot_correlation Plot the data's correlation matrix. plot_pipeline Plot a diagram of every estimator in atom's pipeline. plot_pca Plot the explained variance ratio vs the number of components. plot_components Plot the explained variance ratio per components. plot_rfecv Plot the RFECV results. plot_successive_halving Plot of the models' scores per iteration of the successive halving. plot_learning_curve Plot the model's learning curve. plot_bagging Plot a boxplot of the bagging's results. plot_bo Plot the bayesian optimization scoring. plot_evals Plot evaluation curves for the train and test set. plot_roc Plot the Receiver Operating Characteristics curve. plot_prc Plot the precision-recall curve. plot_permutation_importance Plot the feature permutation importance of models. plot_feature_importance Plot a tree-based model's feature importance. plot_partial_dependence Plot the partial dependence of features. plot_errors Plot a model's prediction errors. plot_residuals Plot a model's residuals. plot_confusion_matrix Plot a model's confusion matrix. plot_threshold Plot a metric's performance against threshold values. 
plot_probabilities Plot the probability distribution of the categories in the target column. plot_calibration Plot the calibration curve for a binary classifier. plot_gains Plot the cumulative gains curve. plot_lift Plot the lift curve. force_plot Plot SHAP's force plot. dependence_plot Plot SHAP's dependence plot. summary_plot Plot SHAP's summary plot. decision_plot Plot SHAP's decision plot.","title":"Available plots"},{"location":"API/models/","text":"Models After fitting, every model class is attached to the training instance as an attribute. We refer to these \"subclasses\" as models (see the nomenclature ). The classes contain a variety of attributes and methods to help you understand how the underlying estimator performed. They can be accessed using the models' acronyms , e.g. atom.LGB to access LightGBM's model . The available models and their corresponding acronyms are: 'GP' for Gaussian Process 'GNB' for Gaussian Naive Bayes 'MNB' for Multinomial Naive Bayes 'BNB' for Bernoulli Naive Bayes 'OLS' for Ordinary Least Squares 'Ridge' for Ridge classification/regression 'Lasso' for Lasso regression 'EN' for Elastic Net regression 'BR' for Bayesian Regression 'LR' for Logistic Regression 'LDA' for Linear Discriminant Analysis 'QDA' for Quadratic Discriminant Analysis 'KNN' for K-Nearest Neighbors 'Tree' for Decision Tree 'Bag' for Bagging 'ET' for Extra-Trees 'RF' for Random Forest 'AdaB' for AdaBoost 'GBM' for Gradient Boosting Machine 'XGB' for XGBoost 'LGB' for LightGBM 'CatB' for CatBoost 'lSVM' for Linear-SVM 'kSVM' for Kernel-SVM 'PA' for Passive Aggressive 'SGD' for Stochastic Gradient Descent 'MLP' for Multilayer Perceptron Tip You can also use lowercase to call the models , e.g. atom.lgb.plot_roc() . Warning The models should not be initialized by the user! Only use them through the training instances. Attributes Data attributes You can use the same data attributes as the training instances to check the dataset that was used to fit a particular model. 
These can differ from each other if the model needs scaled features and the data wasn't already scaled. Note that, unlike with the training instances, the data can not be updated from the models (i.e. the data attributes have no @setter ). Attributes: dataset: pd.DataFrame Complete dataset in the pipeline. train: pd.DataFrame Training set. test: pd.DataFrame Test set. X: pd.DataFrame Feature set. y: pd.Series Target column. X_train: pd.DataFrame Training features. y_train: pd.Series Training target. X_test: pd.DataFrame Test features. y_test: pd.Series Test target. shape: tuple Dataset's shape in the form (rows x columns). columns: list List of columns in the dataset. target: str Name of the target column. categories: list Sorted list of the unique categories in the target column. n_categories: int Number of unique categories in the target column. Utility attributes Attributes: bo: pd.DataFrame Dataframe containing the information of every step taken by the BO. Columns include: 'params': Parameters used in the model. 'model': Model used for this iteration (fitted on last cross-validation). 'score': Score of the chosen metric. List of scores for multi-metric. 'time_iteration': Time spent on this iteration. 'time': Total ime spent since the start of the BO. best_params: dict Dictionary of the best combination of hyperparameters found by the BO. estimator: class Estimator instance with the best combination of hyperparameters fitted on the complete training set. time_bo: str Time it took to run the bayesian optimization algorithm. metric_bo: float or list Best metric score(s) on the BO. time_fit: str Time it took to train the model on the complete training set and calculate the metric(s) on the test set. metric_train: float or list Metric score(s) on the training set. metric_test: float or list Metric score(s) on the test set. evals: dict Dictionary of the metric calculated during training. 
The metric is provided by the model's package and is different for every model and every task. Only for models that allow in-training evaluation (XGB, LGB, CatB). Available keys: 'metric': Name of the metric. 'train': List of scores calculated on the training set. 'test': List of scores calculated on the test set. metric_bagging: list Array of the bagging's results. mean_bagging: float Mean of the bagging's results. std_bagging: float Standard deviation of the bagging's results. Prediction attributes The prediction attributes are not calculated until the attribute is called for the first time. This mechanism avoids having to calculate attributes that are never used, saving time and memory. Prediction attributes: predict_train: np.ndarray Predictions of the model on the training set. predict_test: np.ndarray Predictions of the model on the test set. predict_proba_train: np.ndarray Predicted probabilities of the model on the training set. Only for estimators with a predict_proba method. predict_proba_test: np.ndarray Predicted probabilities of the model on the test set. Only for estimators with a predict_proba method. predict_log_proba_train: np.ndarray Predicted log probabilities of the model on the training set. Only for estimators with a predict_proba method. predict_log_proba_test: np.ndarray Predicted log probabilities of the model on the test set. Only for estimators with a predict_proba method. decision_function_train: np.ndarray Decision function scores on the training set. Only for estimators with a decision_function method. decision_function_test: np.ndarray Decision function scores on the test set. Only for estimators with a decision_function method. score_train: np.float64 Model's score on the training set. score_test: np.float64 Model's score on the test set. Plot attributes Attributes: style: str Plotting style. See seaborn's documentation . palette: str Color palette. See seaborn's documentation . title_fontsize: int Fontsize for the plot's title. 
label_fontsize: int Fontsize for labels and legends. tick_fontsize: int Fontsize for the ticks along the plot's axes. Methods The majority of the plots and prediction methods can be called directly from the models , e.g. atom.xgb.plot_roc() or atom.xgb.predict_proba(X) . The remaining utility methods can be found hereunder: calibrate Calibrate the model. reset_prediction_attributes Clear all the prediction attributes. scoring Get the scoring of a specific metric on the test set. save_estimator Save the estimator to a pickle file. method calibrate (**kwargs) [source] Applies probability calibration on the winning model. The calibration is done using the CalibratedClassifierCV class from sklearn. The estimator will be trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the estimator attribute. After calibrating, all prediction attributes will reset. Parameters: **kwargs Additional keyword arguments for the CalibratedClassifierCV instance. Using cv='prefit' will use the trained model and fit the calibrator on the test set. Note that doing this will result in data leakage in the test set. Use this only if you have another, independent set for testing. method reset_prediction_attributes () [source] Clear all the prediction attributes. Use this method to free some memory before saving the class. method scoring (metric=None, dataset='test') [source] Get the scoring of a specific metric on the test set. Parameters: metric: str or None, optional (default=None) Name of the metric to calculate. Choose from any of sklearn's SCORERS or one of the following custom metrics: 'cm' or 'confusion_matrix' for an array of the confusion matrix. 'tn' for true negatives. 'fp' for false positives. 'fn' for false negatives. 'lift' for the lift metric. 'fpr' for the false positive rate. 'tpr' for true positive rate. 'sup' for the support metric. 
If None, returns the final results for the model (ignores the dataset parameter). dataset: str, optional (default='test') Data set on which to calculate the metric. Options are 'train' or 'test'. method save_estimator (filename=None) [source] Save the estimator to a pickle file. Parameters: filename: str or None, optional (default=None) Name of the file to save. If None or 'auto', the default name is used.","title":"Models"},{"location":"API/models/#models","text":"After fitting, every model class is attached to the training instance as an attribute. We refer to these \"subclasses\" as models (see the nomenclature ). The classes contain a variety of attributes and methods to help you understand how the underlying estimator performed. They can be accessed using the models' acronyms , e.g. atom.LGB to access LightGBM's model . The available models and their corresponding acronyms are: 'GP' for Gaussian Process 'GNB' for Gaussian Naive Bayes 'MNB' for Multinomial Naive Bayes 'BNB' for Bernoulli Naive Bayes 'OLS' for Ordinary Least Squares 'Ridge' for Ridge classification/regression 'Lasso' for Lasso regression 'EN' for Elastic Net regression 'BR' for Bayesian Regression 'LR' for Logistic Regression 'LDA' for Linear Discriminant Analysis 'QDA' for Quadratic Discriminant Analysis 'KNN' for K-Nearest Neighbors 'Tree' for Decision Tree 'Bag' for Bagging 'ET' for Extra-Trees 'RF' for Random Forest 'AdaB' for AdaBoost 'GBM' for Gradient Boosting Machine 'XGB' for XGBoost 'LGB' for LightGBM 'CatB' for CatBoost 'lSVM' for Linear-SVM 'kSVM' for Kernel-SVM 'PA' for Passive Aggressive 'SGD' for Stochastic Gradient Descent 'MLP' for Multilayer Perceptron Tip You can also use lowercase to call the models , e.g. atom.lgb.plot_roc() . Warning The models should not be initialized by the user! 
Only use them through the training instances.","title":"Models"},{"location":"API/models/#attributes","text":"","title":"Attributes"},{"location":"API/models/#data-attributes","text":"You can use the same data attributes as the training instances to check the dataset that was used to fit a particular model. These can differ from each other if the model needs scaled features and the data wasn't already scaled. Note that, unlike with the training instances, the data can not be updated from the models (i.e. the data attributes have no @setter ). Attributes: dataset: pd.DataFrame Complete dataset in the pipeline. train: pd.DataFrame Training set. test: pd.DataFrame Test set. X: pd.DataFrame Feature set. y: pd.Series Target column. X_train: pd.DataFrame Training features. y_train: pd.Series Training target. X_test: pd.DataFrame Test features. y_test: pd.Series Test target. shape: tuple Dataset's shape in the form (rows x columns). columns: list List of columns in the dataset. target: str Name of the target column. categories: list Sorted list of the unique categories in the target column. n_categories: int Number of unique categories in the target column.","title":"Data attributes"},{"location":"API/models/#utility-attributes","text":"Attributes: bo: pd.DataFrame Dataframe containing the information of every step taken by the BO. Columns include: 'params': Parameters used in the model. 'model': Model used for this iteration (fitted on last cross-validation). 'score': Score of the chosen metric. List of scores for multi-metric. 'time_iteration': Time spent on this iteration. 'time': Total ime spent since the start of the BO. best_params: dict Dictionary of the best combination of hyperparameters found by the BO. estimator: class Estimator instance with the best combination of hyperparameters fitted on the complete training set. time_bo: str Time it took to run the bayesian optimization algorithm. metric_bo: float or list Best metric score(s) on the BO. 
time_fit: str Time it took to train the model on the complete training set and calculate the metric(s) on the test set. metric_train: float or list Metric score(s) on the training set. metric_test: float or list Metric score(s) on the test set. evals: dict Dictionary of the metric calculated during training. The metric is provided by the model's package and is different for every model and every task. Only for models that allow in-training evaluation (XGB, LGB, CatB). Available keys: 'metric': Name of the metric. 'train': List of scores calculated on the training set. 'test': List of scores calculated on the test set. metric_bagging: list Array of the bagging's results. mean_bagging: float Mean of the bagging's results. std_bagging: float Standard deviation of the bagging's results.","title":"Utility attributes"},{"location":"API/models/#prediction-attributes","text":"The prediction attributes are not calculated until the attribute is called for the first time. This mechanism avoids having to calculate attributes that are never used, saving time and memory. Prediction attributes: predict_train: np.ndarray Predictions of the model on the training set. predict_test: np.ndarray Predictions of the model on the test set. predict_proba_train: np.ndarray Predicted probabilities of the model on the training set. Only for estimators with a predict_proba method. predict_proba_test: np.ndarray Predicted probabilities of the model on the test set. Only for estimators with a predict_proba method. predict_log_proba_train: np.ndarray Predicted log probabilities of the model on the training set. Only for estimators with a predict_proba method. predict_log_proba_test: np.ndarray Predicted log probabilities of the model on the test set. Only for estimators with a predict_proba method. decision_function_train: np.ndarray Decision function scores on the training set. Only for estimators with a decision_function method. 
decision_function_test: np.ndarray Decision function scores on the test set. Only for estimators with a decision_function method. score_train: np.float64 Model's score on the training set. score_test: np.float64 Model's score on the test set.","title":"Prediction attributes"},{"location":"API/models/#plot-attributes","text":"Attributes: style: str Plotting style. See seaborn's documentation . palette: str Color palette. See seaborn's documentation . title_fontsize: int Fontsize for the plot's title. label_fontsize: int Fontsize for labels and legends. tick_fontsize: int Fontsize for the ticks along the plot's axes.","title":"Plot attributes"},{"location":"API/models/#methods","text":"The majority of the plots and prediction methods can be called directly from the models , e.g. atom.xgb.plot_roc() or atom.xgb.predict_proba(X) . The remaining utility methods can be found hereunder: calibrate Calibrate the model. reset_prediction_attributes Clear all the prediction attributes. scoring Get the scoring of a specific metric on the test set. save_estimator Save the estimator to a pickle file. method calibrate (**kwargs) [source] Applies probability calibration on the winning model. The calibration is done using the CalibratedClassifierCV class from sklearn. The estimator will be trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the estimator attribute. After calibrating, all prediction attributes will reset. Parameters: **kwargs Additional keyword arguments for the CalibratedClassifierCV instance. Using cv='prefit' will use the trained model and fit the calibrator on the test set. Note that doing this will result in data leakage in the test set. Use this only if you have another, independent set for testing. method reset_prediction_attributes () [source] Clear all the prediction attributes. Use this method to free some memory before saving the class. 
method scoring (metric=None, dataset='test') [source] Get the scoring of a specific metric on the test set. Parameters: metric: str or None, optional (default=None) Name of the metric to calculate. Choose from any of sklearn's SCORERS or one of the following custom metrics: 'cm' or 'confusion_matrix' for an array of the confusion matrix. 'tn' for true negatives. 'fp' for false positives. 'fn' for false negatives. 'lift' for the lift metric. 'fpr' for the false positive rate. 'tpr' for true positive rate. 'sup' for the support metric. If None, returns the final results for the model (ignores the dataset parameter). dataset: str, optional (default='test') Data set on which to calculate the metric. Options are 'train' or 'test'. method save_estimator (filename=None) [source] Save the estimator to a pickle file. Parameters: filename: str or None, optional (default=None) Name of the file to save. If None or 'auto', the default name is used.","title":"Methods"},{"location":"API/ATOM/atomclassifier/","text":"ATOMClassifier class atom.api. ATOMClassifier (X, y=-1, n_rows=1, test_size=0.2, logger=None, n_jobs=1, warnings=True, verbose=0, random_state=None) [source] ATOMClassifier is ATOM's wrapper for binary and multiclass classification tasks. Use this class to easily apply all data transformations and model management provided by the package on a given dataset. Note that contrary to scikit-learn's API, the ATOMClassifier object already contains the dataset on which we want to perform the analysis. Calling a method will automatically apply it on the dataset it contains. The class initializer always calls StandardCleaner with default parameters. The following data types can't (yet) be handled properly and are therefore removed: 'datetime64', 'datetime64[ns]', 'timedelta[ns]'. You can predict , plot and call any model from the ATOMClassifier instance. Read more in the user guide . 
Parameters: X: dict, sequence, np.array or pd.DataFrame Dataset containing the features, with shape=(n_samples, n_features). y: int, str, sequence, np.array or pd.Series, optional (default=-1) If int: Position of the target column in X. The default value selects the last column. If string: Name of the target column in X Else: Data target column with shape=(n_samples,) n_rows: int or float, optional (default=1) if <=1: Fraction of the data to use. if >1: Number of rows to use. test_size: float, optional (default=0.2) Split fraction for the training and test set. n_jobs: int, optional (default=1) Number of cores to use for parallel processing. If >0: Number of cores to use. If -1: Use all available cores. If <-1: Use available_cores - 1 + n_jobs. Beware that using multiple processes on the same machine may cause memory issues for large datasets. verbose: int, optional (default=0) Verbosity level of the class. Possible values are: 0 to not print anything. 1 to print basic information. 2 to print detailed information. warnings: bool or str, optional (default=True) If True: Default warning action (equal to 'default' when string). If False: Suppress all warnings (equal to 'ignore' when string). If str: One of the possible actions in python's warnings environment. Note that changing this parameter will affect the PYTHONWARNINGS environment. Note that ATOM can't manage warnings that go directly from C++ code to the stdout/stderr. logger: bool, str, class or None, optional (default=None) If None: Doesn't save a logging file. If bool: True for logging file with default name. False for no logger. If str: Name of the logging file. 'auto' for default name. If class: python Logger object. Note that warnings will not be saved to the logger in any case. random_state: int or None, optional (default=None) Seed used by the random number generator. If None, the random number generator is the RandomState instance used by numpy.random .
Attributes Data attributes The dataset within ATOM's pipeline can be accessed at any time through multiple properties, e.g. calling atom.train will return the training set. The data can also be changed through these properties, e.g. atom.test = atom.test.drop(0) will drop the first row from the test set. This will also update the other data attributes. Attributes: dataset: pd.DataFrame Complete dataset in the pipeline. train: pd.DataFrame Training set. test: pd.DataFrame Test set. X: pd.DataFrame Feature set. y: pd.Series Target column. X_train: pd.DataFrame Training features. y_train: pd.Series Training target. X_test: pd.DataFrame Test features. y_test: pd.Series Test target. shape: tuple Dataset's shape in the form (rows x columns). columns: list List of columns in the dataset. target: str Name of the target column. categories: list Sorted list of the unique categories in the target column. n_categories: int Number of unique categories in the target column. mapping: dict Dictionary of the target categories mapped to their respective encoded integer. missing: pd.Series Returns columns with number of missing values. n_missing: int Number of columns with missing values. categorical: list Returns columns with categorical features. n_categorical: int Number of columns with categorical features. scaled: bool Returns whether the feature set is scaled. Utility attributes Attributes: profile: ProfileReport Profile created by pandas-profiling after calling the report method. genetic_features: pd.DataFrame Dataframe of the non-linear features created by the feature_generation method. Columns include: name: Name of the feature (automatically created). description: Operators used to create this feature. fitness: Fitness score. collinear: pd.DataFrame Dataframe of the collinear features removed by the feature_selection method. Columns include: drop_feature: name of the feature dropped by the method. correlated feature: Name of the correlated feature(s). 
correlation_value: Pearson correlation coefficient(s) of the feature pairs. models: list List of models in the pipeline. metric: str or list Metric(s) used to fit the models. errors: dict Dictionary of the encountered exceptions (if any). winner: model Model subclass that performed best on the test set. results: pd.DataFrame Dataframe of the training results with the model acronyms as index. Columns can include: name: Name of the model. metric_bo: Best score achieved during the BO. time_bo: Time spent on the BO. metric_train: Metric score on the training set. metric_test: Metric score on the test set. time_fit: Time spent fitting and evaluating. mean_bagging: Mean score of the bagging's results. std_bagging: Standard deviation score of the bagging's results. time_bagging: Time spent on the bagging algorithm. time: Total time spent on the whole run. Plot attributes Attributes: style: str Plotting style. See seaborn's documentation . palette: str Color palette. See seaborn's documentation . title_fontsize: int Fontsize for the plot's title. label_fontsize: int Fontsize for labels and legends. tick_fontsize: int Fontsize for the ticks along the plot's axes. Utility methods The ATOM class contains a variety of methods to help you handle the data and inspect the pipeline. calibrate Calibrate the winning model. clear Remove a model from the pipeline. log Save information to the logger and print to stdout. report Get an extensive profile analysis of the data. save Save the ATOMClassifier instance to a pickle file. scoring Print the scoring of the models for a specific metric. stats Print out a list of basic statistics on the dataset. method calibrate (**kwargs) [source] Applies probability calibration on the winning model. The calibration is done with the CalibratedClassifierCV class from sklearn. The model will be trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the estimator attribute. 
After calibrating, all prediction attributes of the winning model will reset. Parameters: **kwargs Additional keyword arguments for the CalibratedClassifierCV instance. Using cv='prefit' will use the trained model and fit the calibrator on the test set. Note that doing this will result in data leakage in the test set. Use this only if you have another, independent set for testing. method clear (models='all') [source] Removes all traces of a model in the pipeline (except for the errors attribute). If all models in the pipeline are removed, the metric is reset. Use this method to remove unwanted models from the pipeline or to clear memory before saving the instance. Parameters: models: str or sequence, optional (default='all') Model(s) to clear from the pipeline. If 'all', clear all models. method log (msg, level=0) [source] Write a message to the logger and print it to stdout. Parameters: msg: str Message to write to the logger and print to stdout. level: int, optional (default=0) Minimum verbosity level in order to print the message. method report (dataset='dataset', n_rows=None, filename=None) [source] Get an extensive profile analysis of the data. The report is rendered in HTML5 and CSS3 and saved to the profile attribute. Note that this method can be slow for n_rows > 10k. Parameters: dataset: str, optional (default='dataset') Name of the data set to get the profile from. n_rows: int or None, optional (default=None) Number of (randomly picked) rows to process. None for all rows. filename: str or None, optional (default=None) Name of the file when saved (as .html). None to not save anything. method save (filename=None, save_data=True) [source] Save the instance to a pickle file. Remember that the class contains the complete dataset as property, so the file can become large for big datasets! To avoid this, use save_data=False . Parameters: filename: str or None, optional (default=None) Name to save the file with. If None or 'auto', use the name of the class.
save_data: bool, optional (default=True) Whether to save the data as an attribute of the instance. If False, remember to update the data immediately after loading the pickle using the dataset's @setter . method scoring (metric=None, dataset='test') [source] Print the scoring of the models for a specific metric. If a model shows a XXX , it means the metric failed for that specific model. This can happen if either the metric is unavailable for the task or if the model does not have a predict_proba method while the metric requires it. Parameters: metric: str or None, optional (default=None) Name of the metric to calculate. Choose from any of sklearn's SCORERS or one of the following custom metrics: 'cm' or 'confusion_matrix' for an array of the confusion matrix. 'tn' for true negatives. 'fp' for false positives. 'fn' for false negatives. 'lift' for the lift metric. 'fpr' for the false positive rate. 'tpr' for true positive rate. 'sup' for the support metric. If None, returns the final results for the model (ignores the dataset parameter). dataset: str, optional (default='test') Data set on which to calculate the metric. Options are 'train' or 'test'. method stats () [source] Print out a list of basic information on the dataset. Data cleaning ATOM provides data cleaning methods to scale your features and handle missing values, categorical columns, outliers and unbalanced datasets. Calling on one of them will automatically apply the method on the dataset in the pipeline. Tip Use the report method to examine the data and help you determine suitable parameters for the data cleaning methods. scale Scale all the features to mean=0 and std=1. impute Handle missing values in the dataset. encode Encode categorical features. outliers Remove or replace outliers in the training set. balance Balance the target categories in the training set. method scale () [source] Scale the features to mean=0 and std=1.
method impute (strat_num='drop', strat_cat='drop', min_frac_rows=0.5, min_frac_cols=0.5, missing=None) [source] Handle missing values according to the selected strategy. Also removes rows and columns with too many missing values. The imputer is fitted only on the training set to avoid data leakage. See Imputer for a description of the parameters. Note that since the Imputer can remove rows from both train and test set, the set's sizes may change to keep ATOM's test_size ratio. method encode (strategy='LeaveOneOut', max_onehot=10, frac_to_other=None) [source] Perform encoding of categorical features. The encoding type depends on the number of unique values in the column: If n_unique=2, use Label-encoding. If 2 < n_unique <= max_onehot, use OneHot-encoding. If n_unique > max_onehot, use strategy -encoding. Also replaces classes with low occurrences with the value 'other' in order to prevent too high cardinality. Categorical features are defined as all columns whose dtype.kind not in 'ifu'. Will raise an error if it encounters missing values or unknown categories when transforming. The encoder is fitted only on the training set to avoid data leakage. See Encoder for a description of the parameters. method outliers (strategy='drop', max_sigma=3, include_target=False) [source] Remove or replace outliers in the training set. Outliers are defined as values that lie further than max_sigma * standard_deviation away from the mean of the column. Only outliers from the training set are removed to maintain an original sample of target values in the test set. Ignores categorical columns. See Outliers for a description of the parameters. method balance (strategy='ADASYN', **kwargs) [source] Balance the number of instances per target category in the training set. Only the training set is balanced in order to maintain the original distribution of target categories in the test set. See Balancer for a description of the parameters. 
Feature engineering To further pre-process the data you can create new non-linear features transforming the existing ones or, if your dataset is too large, remove features using one of the provided strategies. feature_generation Create new features from combinations of existing ones. feature_selection Remove features according to the selected strategy. method feature_generation (strategy='DFS', n_features=None, generations=20, population=500, operators=None) [source] Use Deep feature Synthesis or a genetic algorithm to create new combinations of existing features to capture the non-linear relations between the original features. See FeatureGenerator for a description of the parameters. Attributes created by the class are attached to the ATOM instance. method feature_selection (strategy=None, solver=None, n_features=None, max_frac_repeated=1., max_correlation=1., **kwargs) [source] Remove features according to the selected strategy. Ties between features with equal scores will be broken in an unspecified way. Also removes features with too low variance and finds pairs of collinear features based on the Pearson correlation coefficient. For each pair above the specified limit (in terms of absolute value), it removes one of the two. See FeatureSelector for a description of the parameters. Plotting methods and attributes created by the class are attached to the instance. Note When strategy='univariate' and solver=None, f_classif will be used as default solver. When strategy is one of 'SFM', 'RFE' or 'RFECV' and the solver is one of ATOM's models, the algorithm will automatically select the classifier (no need to add _class to the solver). When strategy is one of 'SFM', 'RFE' or 'RFECV' and solver=None, ATOM will use the winning model (if it exists) as solver. When strategy='RFECV', ATOM will use the metric in the pipeline (if it exists) as the scoring parameter (only if not specified manually). 
Training The training methods are where the models are fitted to the data and their performance is evaluated according to the selected metric. ATOMClassifier contains three methods to call the training classes from the ATOM package. All relevant attributes and methods from the training classes are attached to ATOMClassifier for convenience. These include the errors, winner and results attributes, the models , and the prediction and plotting methods. run Fit the models to the data in a direct fashion. successive_halving Fit the models to the data in a successive halving fashion. train_sizing Fit the models to the data in a train sizing fashion. method run (models, metric=None, greater_is_better=True, needs_proba=False, needs_threshold=False, n_calls=10, n_initial_points=5, bo_params={}, bagging=None) [source] Runs a TrainerClassifier instance. Using this class through atom allows subsequent runs with different models without losing previous information. method successive_halving (models, metric=None, greater_is_better=True, needs_proba=False, needs_threshold=False, skip_iter=0, n_calls=0, n_initial_points=5, bo_params={}, bagging=None) [source] Runs a SuccessiveHalvingClassifier instance. method train_sizing (models, metric=None, greater_is_better=True, needs_proba=False, needs_threshold=False, train_sizes=np.linspace(0.2, 1.0, 5), n_calls=0, n_initial_points=5, bo_params={}, bagging=None) [source] Runs a TrainSizingClassifier instance. 
Example from sklearn.datasets import load_breast_cancer from atom import ATOMClassifier X, y = load_breast_cancer(return_X_y=True) # Initialize class atom = ATOMClassifier(X, y, logger='auto', n_jobs=2, verbose=2) # Apply data cleaning methods atom.outliers(strategy='min_max', max_sigma=2) atom.balance(strategy='smote', sampling_strategy=0.7) # Fit the models to the data atom.run(models=['QDA', 'CatB'], metric='precision', n_calls=25, n_initial_points=10, bo_params={'cv': 1}, bagging=4) # Analyze the results print(f\"The winning model is: {atom.winner.name}\") print(atom.results) # Make some plots atom.palette = 'Blues' atom.plot_roc(figsize=(9, 6), filename='roc.png') atom.CatB.plot_feature_importance(filename='catboost_feature_importance.png') # Run an extra model atom.run(models='LR', metric='precision', n_calls=25, n_initial_points=10, bo_params={'cv': 1}, bagging=4) # Get the predictions for the best model on new data predictions = atom.predict(X_new)","title":"ATOMClassifier"},{"location":"API/ATOM/atomclassifier/#atomclassifier","text":"class atom.api. ATOMClassifier (X, y=-1, n_rows=1, test_size=0.2, logger=None, n_jobs=1, warnings=True, verbose=0, random_state=None) [source] ATOMClassifier is ATOM's wrapper for binary and multiclass classification tasks. Use this class to easily apply all data transformations and model management provided by the package on a given dataset. Note that contrary to scikit-learn's API, the ATOMClassifier object already contains the dataset on which we want to perform the analysis. Calling a method will automatically apply it on the dataset it contains. The class initializer always calls StandardCleaner with default parameters. The following data types can't (yet) be handled properly and are therefore removed: 'datetime64', 'datetime64[ns]', 'timedelta[ns]'. You can predict , plot and call any model from the ATOMClassifier instance. Read more in the user guide . 
Parameters: X: dict, sequence, np.array or pd.DataFrame Dataset containing the features, with shape=(n_samples, n_features). y: int, str, sequence, np.array or pd.Series, optional (default=-1) If int: Position of the target column in X. The default value selects the last column. If string: Name of the target column in X Else: Data target column with shape=(n_samples,) n_rows: int or float, optional (default=1) if <=1: Fraction of the data to use. if >1: Number of rows to use. test_size: float, optional (default=0.2) Split fraction for the training and test set. n_jobs: int, optional (default=1) Number of cores to use for parallel processing. If >0: Number of cores to use. If -1: Use all available cores. If <-1: Use available_cores - 1 + n_jobs. Beware that using multiple processes on the same machine may cause memory issues for large datasets. verbose: int, optional (default=0) Verbosity level of the class. Possible values are: 0 to not print anything. 1 to print basic information. 2 to print detailed information. warnings: bool or str, optional (default=True) If True: Default warning action (equal to 'default' when string). If False: Suppress all warnings (equal to 'ignore' when string). If str: One of the possible actions in python's warnings environment. Note that changing this parameter will affect the PYTHONWARNINGS environment. Note that ATOM can't manage warnings that go directly from C++ code to the stdout/stderr. logger: bool, str, class or None, optional (default=None) If None: Doesn't save a logging file. If bool: True for logging file with default name. False for no logger. If str: Name of the logging file. 'auto' for default name. If class: python Logger object. Note that warnings will not be saved to the logger in any case. random_state: int or None, optional (default=None) Seed used by the random number generator.
If None, the random number generator is the RandomState instance used by numpy.random .","title":"ATOMClassifier"},{"location":"API/ATOM/atomclassifier/#attributes","text":"","title":"Attributes"},{"location":"API/ATOM/atomclassifier/#data-attributes","text":"The dataset within ATOM's pipeline can be accessed at any time through multiple properties, e.g. calling atom.train will return the training set. The data can also be changed through these properties, e.g. atom.test = atom.test.drop(0) will drop the first row from the test set. This will also update the other data attributes. Attributes: dataset: pd.DataFrame Complete dataset in the pipeline. train: pd.DataFrame Training set. test: pd.DataFrame Test set. X: pd.DataFrame Feature set. y: pd.Series Target column. X_train: pd.DataFrame Training features. y_train: pd.Series Training target. X_test: pd.DataFrame Test features. y_test: pd.Series Test target. shape: tuple Dataset's shape in the form (rows x columns). columns: list List of columns in the dataset. target: str Name of the target column. categories: list Sorted list of the unique categories in the target column. n_categories: int Number of unique categories in the target column. mapping: dict Dictionary of the target categories mapped to their respective encoded integer. missing: pd.Series Returns columns with number of missing values. n_missing: int Number of columns with missing values. categorical: list Returns columns with categorical features. n_categorical: int Number of columns with categorical features. scaled: bool Returns whether the feature set is scaled.","title":"Data attributes"},{"location":"API/ATOM/atomclassifier/#utility-attributes","text":"Attributes: profile: ProfileReport Profile created by pandas-profiling after calling the report method. genetic_features: pd.DataFrame Dataframe of the non-linear features created by the feature_generation method. Columns include: name: Name of the feature (automatically created). 
description: Operators used to create this feature. fitness: Fitness score. collinear: pd.DataFrame Dataframe of the collinear features removed by the feature_selection method. Columns include: drop_feature: name of the feature dropped by the method. correlated feature: Name of the correlated feature(s). correlation_value: Pearson correlation coefficient(s) of the feature pairs. models: list List of models in the pipeline. metric: str or list Metric(s) used to fit the models. errors: dict Dictionary of the encountered exceptions (if any). winner: model Model subclass that performed best on the test set. results: pd.DataFrame Dataframe of the training results with the model acronyms as index. Columns can include: name: Name of the model. metric_bo: Best score achieved during the BO. time_bo: Time spent on the BO. metric_train: Metric score on the training set. metric_test: Metric score on the test set. time_fit: Time spent fitting and evaluating. mean_bagging: Mean score of the bagging's results. std_bagging: Standard deviation score of the bagging's results. time_bagging: Time spent on the bagging algorithm. time: Total time spent on the whole run.","title":"Utility attributes"},{"location":"API/ATOM/atomclassifier/#plot-attributes","text":"Attributes: style: str Plotting style. See seaborn's documentation . palette: str Color palette. See seaborn's documentation . title_fontsize: int Fontsize for the plot's title. label_fontsize: int Fontsize for labels and legends. tick_fontsize: int Fontsize for the ticks along the plot's axes.","title":"Plot attributes"},{"location":"API/ATOM/atomclassifier/#utility-methods","text":"The ATOM class contains a variety of methods to help you handle the data and inspect the pipeline. calibrate Calibrate the winning model. clear Remove a model from the pipeline. log Save information to the logger and print to stdout. report Get an extensive profile analysis of the data. save Save the ATOMClassifier instance to a pickle file. 
scoring Print the scoring of the models for a specific metric. stats Print out a list of basic statistics on the dataset. method calibrate (**kwargs) [source] Applies probability calibration on the winning model. The calibration is done with the CalibratedClassifierCV class from sklearn. The model will be trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the estimator attribute. After calibrating, all prediction attributes of the winning model will reset. Parameters: **kwargs Additional keyword arguments for the CalibratedClassifierCV instance. Using cv='prefit' will use the trained model and fit the calibrator on the test set. Note that doing this will result in data leakage in the test set. Use this only if you have another, independent set for testing. method clear (models='all') [source] Removes all traces of a model in the pipeline (except for the errors attribute). If all models in the pipeline are removed, the metric is reset. Use this method to remove unwanted models from the pipeline or to clear memory before saving the instance. Parameters: models: str or sequence, optional (default='all') Model(s) to clear from the pipeline. If 'all', clear all models. method log (msg, level=0) [source] Write a message to the logger and print it to stdout. Parameters: msg: str Message to write to the logger and print to stdout. level: int, optional (default=0) Minimum verbosity level in order to print the message. method report (dataset='dataset', n_rows=None, filename=None) [source] Get an extensive profile analysis of the data. The report is rendered in HTML5 and CSS3 and saved to the profile attribute. Note that this method can be slow for n_rows > 10k. Parameters: dataset: str, optional (default='dataset') Name of the data set to get the profile from. n_rows: int or None, optional (default=None) Number of (randomly picked) rows to process. None for all rows.
filename: str or None, optional (default=None) Name of the file when saved (as .html). None to not save anything. method save (filename=None, save_data=True) [source] Save the instance to a pickle file. Remember that the class contains the complete dataset as property, so the file can become large for big datasets! To avoid this, use save_data=False . Parameters: filename: str or None, optional (default=None) Name to save the file with. If None or 'auto', use the name of the class. save_data: bool, optional (default=True) Whether to save the data as an attribute of the instance. If False, remember to update the data immediately after loading the pickle using the dataset's @setter . method scoring (metric=None, dataset='test') [source] Print the scoring of the models for a specific metric. If a model shows a XXX , it means the metric failed for that specific model. This can happen if either the metric is unavailable for the task or if the model does not have a predict_proba method while the metric requires it. Parameters: metric: str or None, optional (default=None) Name of the metric to calculate. Choose from any of sklearn's SCORERS or one of the following custom metrics: 'cm' or 'confusion_matrix' for an array of the confusion matrix. 'tn' for true negatives. 'fp' for false positives. 'fn' for false negatives. 'lift' for the lift metric. 'fpr' for the false positive rate. 'tpr' for true positive rate. 'sup' for the support metric. If None, returns the final results for the model (ignores the dataset parameter). dataset: str, optional (default='test') Data set on which to calculate the metric. Options are 'train' or 'test'. method stats () [source] Print out a list of basic information on the dataset.","title":"Utility methods"},{"location":"API/ATOM/atomclassifier/#data-cleaning","text":"ATOM provides data cleaning methods to scale your features and handle missing values, categorical columns, outliers and unbalanced datasets. 
Calling on one of them will automatically apply the method on the dataset in the pipeline. Tip Use the report method to examine the data and help you determine suitable parameters for the data cleaning methods. scale Scale all the features to mean=0 and std=1. impute Handle missing values in the dataset. encode Encode categorical features. outliers Remove or replace outliers in the training set. balance Balance the target categories in the training set. method scale () [source] Scale the features to mean=0 and std=1. method impute (strat_num='drop', strat_cat='drop', min_frac_rows=0.5, min_frac_cols=0.5, missing=None) [source] Handle missing values according to the selected strategy. Also removes rows and columns with too many missing values. The imputer is fitted only on the training set to avoid data leakage. See Imputer for a description of the parameters. Note that since the Imputer can remove rows from both train and test set, the set's sizes may change to keep ATOM's test_size ratio. method encode (strategy='LeaveOneOut', max_onehot=10, frac_to_other=None) [source] Perform encoding of categorical features. The encoding type depends on the number of unique values in the column: If n_unique=2, use Label-encoding. If 2 < n_unique <= max_onehot, use OneHot-encoding. If n_unique > max_onehot, use strategy -encoding. Also replaces classes with low occurrences with the value 'other' in order to prevent too high cardinality. Categorical features are defined as all columns whose dtype.kind not in 'ifu'. Will raise an error if it encounters missing values or unknown categories when transforming. The encoder is fitted only on the training set to avoid data leakage. See Encoder for a description of the parameters. method outliers (strategy='drop', max_sigma=3, include_target=False) [source] Remove or replace outliers in the training set. Outliers are defined as values that lie further than max_sigma * standard_deviation away from the mean of the column.
Only outliers from the training set are removed to maintain an original sample of target values in the test set. Ignores categorical columns. See Outliers for a description of the parameters. method balance (strategy='ADASYN', **kwargs) [source] Balance the number of instances per target category in the training set. Only the training set is balanced in order to maintain the original distribution of target categories in the test set. See Balancer for a description of the parameters.","title":"Data cleaning"},{"location":"API/ATOM/atomclassifier/#feature-engineering","text":"To further pre-process the data you can create new non-linear features transforming the existing ones or, if your dataset is too large, remove features using one of the provided strategies. feature_generation Create new features from combinations of existing ones. feature_selection Remove features according to the selected strategy. method feature_generation (strategy='DFS', n_features=None, generations=20, population=500, operators=None) [source] Use Deep feature Synthesis or a genetic algorithm to create new combinations of existing features to capture the non-linear relations between the original features. See FeatureGenerator for a description of the parameters. Attributes created by the class are attached to the ATOM instance. method feature_selection (strategy=None, solver=None, n_features=None, max_frac_repeated=1., max_correlation=1., **kwargs) [source] Remove features according to the selected strategy. Ties between features with equal scores will be broken in an unspecified way. Also removes features with too low variance and finds pairs of collinear features based on the Pearson correlation coefficient. For each pair above the specified limit (in terms of absolute value), it removes one of the two. See FeatureSelector for a description of the parameters. Plotting methods and attributes created by the class are attached to the instance. 
Note When strategy='univariate' and solver=None, f_classif will be used as default solver. When strategy is one of 'SFM', 'RFE' or 'RFECV' and the solver is one of ATOM's models, the algorithm will automatically select the classifier (no need to add _class to the solver). When strategy is one of 'SFM', 'RFE' or 'RFECV' and solver=None, ATOM will use the winning model (if it exists) as solver. When strategy='RFECV', ATOM will use the metric in the pipeline (if it exists) as the scoring parameter (only if not specified manually).","title":"Feature engineering"},{"location":"API/ATOM/atomclassifier/#training","text":"The training methods are where the models are fitted to the data and their performance is evaluated according to the selected metric. ATOMClassifier contains three methods to call the training classes from the ATOM package. All relevant attributes and methods from the training classes are attached to ATOMClassifier for convenience. These include the errors, winner and results attributes, the models , and the prediction and plotting methods. run Fit the models to the data in a direct fashion. successive_halving Fit the models to the data in a successive halving fashion. train_sizing Fit the models to the data in a train sizing fashion. method run (models, metric=None, greater_is_better=True, needs_proba=False, needs_threshold=False, n_calls=10, n_initial_points=5, bo_params={}, bagging=None) [source] Runs a TrainerClassifier instance. Using this class through atom allows subsequent runs with different models without losing previous information. method successive_halving (models, metric=None, greater_is_better=True, needs_proba=False, needs_threshold=False, skip_iter=0, n_calls=0, n_initial_points=5, bo_params={}, bagging=None) [source] Runs a SuccessiveHalvingClassifier instance. 
method train_sizing (models, metric=None, greater_is_better=True, needs_proba=False, needs_threshold=False, train_sizes=np.linspace(0.2, 1.0, 5), n_calls=0, n_initial_points=5, bo_params={}, bagging=None) [source] Runs a TrainSizingClassifier instance.","title":"Training"},{"location":"API/ATOM/atomclassifier/#example","text":"from sklearn.datasets import load_breast_cancer from atom import ATOMClassifier X, y = load_breast_cancer(return_X_y=True) # Initialize class atom = ATOMClassifier(X, y, logger='auto', n_jobs=2, verbose=2) # Apply data cleaning methods atom.outliers(strategy='min_max', max_sigma=2) atom.balance(strategy='smote', sampling_strategy=0.7) # Fit the models to the data atom.run(models=['QDA', 'CatB'], metric='precision', n_calls=25, n_initial_points=10, bo_params={'cv': 1}, bagging=4) # Analyze the results print(f\"The winning model is: {atom.winner.name}\") print(atom.results) # Make some plots atom.palette = 'Blues' atom.plot_roc(figsize=(9, 6), filename='roc.png') atom.CatB.plot_feature_importance(filename='catboost_feature_importance.png') # Run an extra model atom.run(models='LR', metric='precision', n_calls=25, n_initial_points=10, bo_params={'cv': 1}, bagging=4) # Get the predictions for the best model on new data predictions = atom.predict(X_new)","title":"Example"},{"location":"API/ATOM/atomloader/","text":"ATOMLoader function ATOMLoader (filename=None, X=None, y=-1, transform_data=True, verbose=None) [source] Load a class instance from a pickle file. If the file is a training instance that was saved using save_data=False , you can load new data into it. If the file is an atom instance, you can also apply all data transformations in the pipeline to the provided data. Parameters: filename: str Name of the pickle file to load. X: dict, sequence, np.array, pd.DataFrame or None, optional (default=None) Data containing the features, with shape=(n_samples, n_features). 
Only use this parameter if the file is a training instance that was saved using save_data=False . See the save method. y: int, str, sequence, np.array or pd.Series, optional (default=-1) If int: Index of the target column in X. If string: Name of the target column in X. Else: Target column with shape=(n_samples,). This parameter is ignored if X=None. transform_data: bool, optional (default=True) Whether to transform the provided data through all the steps in the instance's pipeline. This parameter is ignored if the loaded file is not an atom instance. verbose: int or None, optional (default=None) Verbosity level of the transformations applied on the new data. If None, use the verbosity from the loaded instance. This parameter is ignored if the loaded file is not an atom instance. Example from atom import ATOMClassifier, ATOMLoader # Save an atom instance to a pickle file atom = ATOMClassifier(X, y) atom.encode(strategy='Helmert', max_onehot=12) atom.run('LR', metric='AP', n_calls=25, n_initial_points=10) atom.save('atom_lr', save_data=False) # Load the class and add the transformed data to the new instance atom_2 = ATOMLoader('atom_lr', X, y, verbose=0)","title":"ATOMLoader"},{"location":"API/ATOM/atomloader/#atomloader","text":"function ATOMLoader (filename=None, X=None, y=-1, transform_data=True, verbose=None) [source] Load a class instance from a pickle file. If the file is a training instance that was saved using save_data=False , you can load new data into it. If the file is an atom instance, you can also apply all data transformations in the pipeline to the provided data. Parameters: filename: str Name of the pickle file to load. X: dict, sequence, np.array, pd.DataFrame or None, optional (default=None) Data containing the features, with shape=(n_samples, n_features). Only use this parameter if the file is a training instance that was saved using save_data=False . See the save method. 
y: int, str, sequence, np.array or pd.Series, optional (default=-1) If int: Index of the target column in X. If string: Name of the target column in X. Else: Target column with shape=(n_samples,). This parameter is ignored if X=None. transform_data: bool, optional (default=True) Whether to transform the provided data through all the steps in the instance's pipeline. This parameter is ignored if the loaded file is not an atom instance. verbose: int or None, optional (default=None) Verbosity level of the transformations applied on the new data. If None, use the verbosity from the loaded instance. This parameter is ignored if the loaded file is not an atom instance.","title":"ATOMLoader"},{"location":"API/ATOM/atomloader/#example","text":"from atom import ATOMClassifier, ATOMLoader # Save an atom instance to a pickle file atom = ATOMClassifier(X, y) atom.encode(strategy='Helmert', max_onehot=12) atom.run('LR', metric='AP', n_calls=25, n_initial_points=10) atom.save('atom_lr', save_data=False) # Load the class and add the transformed data to the new instance atom_2 = ATOMLoader('atom_lr', X, y, verbose=0)","title":"Example"},{"location":"API/ATOM/atomregressor/","text":"ATOMRegressor class atom.api. ATOMRegressor (X, y=-1, n_rows=1, test_size=0.2, logger=None, n_jobs=1, warnings=True, verbose=0, random_state=None) [source] ATOMRegressor is ATOM's wrapper for regression tasks. Use this class to easily apply all data transformations and model management provided by the package on a given dataset. Note that contrary to scikit-learn's API, the ATOMRegressor object already contains the dataset on which we want to perform the analysis. Calling a method will automatically apply it on the dataset it contains. The class initializer always calls StandardCleaner with default parameters. The following data types can't (yet) be handled properly and are therefore removed: 'datetime64', 'datetime64[ns]', 'timedelta[ns]'. 
You can predict , plot and call any model from the ATOMRegressor instance. Read more in the user guide . Parameters: X: dict, sequence, np.array or pd.DataFrame Dataset containing the features, with shape=(n_samples, n_features). y: int, str, sequence, np.array or pd.Series, optional (default=-1) If int: Position of the target column in X. The default value selects the last column. If string: Name of the target column in X Else: Data target column with shape=(n_samples,) n_rows: int or float, optional (default=1) if <=1: Fraction of the data to use. if >1: Number of rows to use. test_size: float, optional (default=0.2) Split fraction for the training and test set. n_jobs: int, optional (default=1) Number of cores to use for parallel processing. If >0: Number of cores to use. If -1: Use all available cores. If <-1: Use available_cores - 1 + n_jobs. Beware that using multiple processes on the same machine may cause memory issues for large datasets. verbose: int, optional (default=0) Verbosity level of the class. Possible values are: 0 to not print anything. 1 to print basic information. 2 to print detailed information. warnings: bool or str, optional (default=True) If True: Default warning action (equal to 'default' when string). If False: Suppress all warnings (equal to 'ignore' when string). If str: One of the possible actions in python's warnings environment. Note that changing this parameter will affect the PYTHONWARNINGS environment. Note that ATOM can't manage warnings that go directly from C++ code to the stdout/stderr. logger: bool, str, class or None, optional (default=None) If None: Doesn't save a logging file. If bool: True for logging file with default name. False for no logger. If str: Name of the logging file. 'auto' for default name. If class: python Logger object. Note that warnings will not be saved to the logger in any case. random_state: int or None, optional (default=None) Seed used by the random number generator.
If None, the random number generator is the RandomState instance used by numpy.random . Attributes Data attributes The dataset within ATOM's pipeline can be accessed at any time through multiple properties, e.g. calling atom.train will return the training set. The data can also be changed through these properties, e.g. atom.test = atom.test.drop(0) will drop the first row from the test set. This will also update the other data attributes. Attributes: dataset: pd.DataFrame Complete dataset in the pipeline. train: pd.DataFrame Training set. test: pd.DataFrame Test set. X: pd.DataFrame Feature set. y: pd.Series Target column. X_train: pd.DataFrame Training features. y_train: pd.Series Training target. X_test: pd.DataFrame Test features. y_test: pd.Series Test target. shape: tuple Dataset's shape in the form (rows x columns). columns: list List of columns in the dataset. target: str Name of the target column. Utility attributes Attributes: profile: ProfileReport Profile created by pandas-profiling after calling the report method. genetic_features: pd.DataFrame Dataframe of the non-linear features created by the feature_generation method. Columns include: name: Name of the feature (automatically created). description: Operators used to create this feature. fitness: Fitness score. collinear: pd.DataFrame Dataframe of the collinear features removed by the feature_selection method. Columns include: drop_feature: name of the feature dropped by the method. correlated feature: Name of the correlated feature(s). correlation_value: Pearson correlation coefficient(s) of the feature pairs. models: list List of models in the pipeline. metric: str or list Metric(s) used to fit the models. errors: dict Dictionary of the encountered exceptions (if any). winner: model Model subclass that performed best on the test set. results: pd.DataFrame Dataframe of the training results with the model acronyms as index. Columns can include: name: Name of the model. 
metric_bo: Best score achieved during the BO. time_bo: Time spent on the BO. metric_train: Metric score on the training set. metric_test: Metric score on the test set. time_fit: Time spent fitting and evaluating. mean_bagging: Mean score of the bagging's results. std_bagging: Standard deviation score of the bagging's results. time_bagging: Time spent on the bagging algorithm. time: Total time spent on the whole run. Plot attributes Attributes: style: str Plotting style. See seaborn's documentation . palette: str Color palette. See seaborn's documentation . title_fontsize: int Fontsize for the plot's title. label_fontsize: int Fontsize for labels and legends. tick_fontsize: int Fontsize for the ticks along the plot's axes. Utility methods The ATOM class contains a variety of methods to help you handle the data and inspect the pipeline. clear Remove a model from the pipeline. log Save information to the logger and print to stdout. report Get an extensive profile analysis of the data. save Save the ATOMRegressor instance to a pickle file. scoring Print the scoring of the models for a specific metric. stats Print out a list of basic statistics on the dataset. method clear (models='all') [source] Removes all traces of a model in the pipeline (except for the errors attribute). If all models in the pipeline are removed, the metric is reset. Use this method to remove unwanted models from the pipeline or to clear memory before saving the instance. Parameters: models: str or sequence, optional (default='all') Model(s) to clear from the pipeline. If 'all', clear all models. method log (msg, level=0) [source] Write a message to the logger and print it to stdout. Parameters: msg: str Message to write to the logger and print to stdout. level: int, optional (default=0) Minimum verbosity level in order to print the message. method report (dataset='dataset', n_rows=None, filename=None) [source] Get an extensive profile analysis of the data. 
The report is rendered in HTML5 and CSS3 and saved to the profile attribute. Note that this method can be slow for n_rows > 10k. Parameters: dataset: str, optional (default='dataset') Name of the data set to get the profile from. n_rows: int or None, optional (default=None) Number of (randomly picked) rows to process. None for all rows. filename: str or None, optional (default=None) Name of the file when saved (as .html). None to not save anything. method save (filename=None, save_data=True) [source] Save the instance to a pickle file. Remember that the class contains the complete dataset as property, so the file can become large for big datasets! To avoid this, use save_data=False . Parameters: filename: str or None, optional (default=None) Name to save the file with. If None or 'auto', use the name of the class. save_data: bool, optional (default=True) Whether to save the data as an attribute of the instance. If False, remember to update the data immediately after loading the pickle using the dataset's @setter . method scoring (metric=None, dataset='test') [source] Print the scoring of the models for a specific metric. If a model shows a XXX , it means the metric failed for that specific model. This can happen if either the metric is unavailable for the task or if the model does not have a predict_proba method while the metric requires it. Parameters: metric: str or None, optional (default=None) Name of the metric to calculate. Choose from any of sklearn's SCORERS . If None, returns the final results for the model (ignores the dataset parameter). dataset: str, optional (default='test') Data set on which to calculate the metric. Options are 'train' or 'test'. method stats () [source] Print out a list of basic information on the dataset. Data cleaning ATOM provides data cleaning methods to scale your features and handle missing values, categorical columns and outliers. Calling on one of them will automatically apply the method on the dataset in the pipeline.
Tip Use the report method to examine the data and help you determine suitable parameters for the data cleaning methods. scale Scale all the features to mean=0 and std=1. impute Handle missing values in the dataset. encode Encode categorical features. outliers Remove or replace outliers in the training set. method scale () [source] Scale the features to mean=0 and std=1. method impute (strat_num='drop', strat_cat='drop', min_frac_rows=0.5, min_frac_cols=0.5, missing=None) [source] Handle missing values according to the selected strategy. Also removes rows and columns with too many missing values. The imputer is fitted only on the training set to avoid data leakage. See Imputer for a description of the parameters. Note that since the Imputer can remove rows from both train and test set, the set's sizes may change to keep ATOM's test_size ratio. method encode (strategy='LeaveOneOut', max_onehot=10, frac_to_other=None) [source] Perform encoding of categorical features. The encoding type depends on the number of unique values in the column: If n_unique=2, use Label-encoding. If 2 < n_unique <= max_onehot, use OneHot-encoding. If n_unique > max_onehot, use strategy -encoding. Also replaces classes with low occurrences with the value 'other' in order to prevent too high cardinality. Categorical features are defined as all columns whose dtype.kind not in 'ifu'. Will raise an error if it encounters missing values or unknown categories when transforming. The encoder is fitted only on the training set to avoid data leakage. See Encoder for a description of the parameters. method outliers (strategy='drop', max_sigma=3, include_target=False) [source] Remove or replace outliers in the training set. Outliers are defined as values that lie further than max_sigma * standard_deviation away from the mean of the column. Only outliers from the training set are removed to maintain an original sample of target values in the test set. Ignores categorical columns.
See Outliers for a description of the parameters. Feature engineering To further pre-process the data you can create new non-linear features transforming the existing ones or, if your dataset is too large, remove features using one of the provided strategies. feature_generation Create new features from combinations of existing ones. feature_selection Remove features according to the selected strategy. method feature_generation (strategy='DFS', n_features=None, generations=20, population=500, operators=None) [source] Use Deep feature Synthesis or a genetic algorithm to create new combinations of existing features to capture the non-linear relations between the original features. See FeatureGenerator for a description of the parameters. Attributes created by the class are attached to the ATOM instance. method feature_selection (strategy=None, solver=None, n_features=None, max_frac_repeated=1., max_correlation=1., **kwargs) [source] Remove features according to the selected strategy. Ties between features with equal scores will be broken in an unspecified way. Also removes features with too low variance and finds pairs of collinear features based on the Pearson correlation coefficient. For each pair above the specified limit (in terms of absolute value), it removes one of the two. See FeatureSelector for a description of the parameters. Plotting methods and attributes created by the class are attached to the instance. Note When strategy='univariate' and solver=None, f_regression will be used as default solver. When strategy is one of 'SFM', 'RFE' or 'RFECV' and the solver is one of ATOM's models, the algorithm will automatically select the regressor (no need to add _reg to the solver). When strategy is one of 'SFM', 'RFE' or 'RFECV' and solver=None, ATOM will use the winning model (if it exists) as solver. When strategy='RFECV', ATOM will use the metric in the pipeline (if it exists) as the scoring parameter (only if not specified manually).
Training The training methods are where the models are fitted to the data and their performance is evaluated according to the selected metric. ATOMRegressor contains three methods to call the training classes from the ATOM package. All relevant attributes and methods from the training classes are attached to ATOMRegressor for convenience. These include the errors, winner and results attributes, the models , and the prediction and plotting methods. run Fit the models to the data in a direct fashion. successive_halving Fit the models to the data in a successive halving fashion. train_sizing Fit the models to the data in a train sizing fashion. method run (models, metric=None, greater_is_better=True, needs_proba=False, needs_threshold=False, n_calls=10, n_initial_points=5, bo_params={}, bagging=None) [source] Runs a TrainerRegressor instance. Using this class through atom allows subsequent runs with different models without losing previous information. method successive_halving (models, metric=None, greater_is_better=True, needs_proba=False, needs_threshold=False, skip_iter=0, n_calls=0, n_initial_points=5, bo_params={}, bagging=None) [source] Runs a SuccessiveHalvingRegressor instance. method train_sizing (models, metric=None, greater_is_better=True, needs_proba=False, needs_threshold=False, train_sizes=np.linspace(0.2, 1.0, 5), n_calls=0, n_initial_points=5, bo_params={}, bagging=None) [source] Runs a TrainSizingRegressor instance. 
Example from sklearn.datasets import load_boston from atom import ATOMRegressor X, y = load_boston(return_X_y=True) # Initialize class atom = ATOMRegressor(X, y, logger='auto', n_jobs=2, verbose=2) # Apply data cleaning methods atom.outliers(strategy='min_max', max_sigma=2, include_target=True) # Fit the models to the data atom.run(models=['OLS', 'BR', 'CatB'], metric='MSE', n_calls=25, n_initial_points=10, bo_params={'cv': 1}, bagging=4) # Analyze the results print(f\"The winning model is: {atom.winner.name}\") print(atom.results) # Make some plots atom.palette = 'Blues' atom.plot_errors(figsize=(9, 6), filename='errors.png') atom.CatB.plot_feature_importance(filename='catboost_feature_importance.png') # Run an extra model atom.run(models='MLP', metric='MSE', n_calls=25, n_initial_points=10, bo_params={'cv': 1}, bagging=4) # Get the predictions for the best model on new data predictions = atom.predict(X_new)","title":"ATOMRegressor"},{"location":"API/ATOM/atomregressor/#atomregressor","text":"class atom.api. ATOMRegressor (X, y=-1, n_rows=1, test_size=0.2, logger=None, n_jobs=1, warnings=True, verbose=0, random_state=None) [source] ATOMRegressor is ATOM's wrapper for regression tasks. Use this class to easily apply all data transformations and model management provided by the package on a given dataset. Note that contrary to scikit-learn's API, the ATOMRegressor object already contains the dataset on which we want to perform the analysis. Calling a method will automatically apply it on the dataset it contains. The class initializer always calls StandardCleaner with default parameters. The following data types can't (yet) be handled properly and are therefore removed: 'datetime64', 'datetime64[ns]', 'timedelta[ns]'. You can predict , plot and call any model from the ATOMRegressor instance. Read more in the user guide . Parameters: X: dict, sequence, np.array or pd.DataFrame Dataset containing the features, with shape=(n_samples, n_features). 
y: int, str, sequence, np.array or pd.Series, optional (default=-1) If int: Position of the target column in X. The default value selects the last column. If string: Name of the target column in X Else: Data target column with shape=(n_samples,) n_rows: int or float, optional (default=1) if <=1: Fraction of the data to use. if >1: Number of rows to use. test_size: float, optional (default=0.2) Split fraction for the training and test set. n_jobs: int, optional (default=1) Number of cores to use for parallel processing. If >0: Number of cores to use. If -1: Use all available cores. If <-1: Use available_cores - 1 + n_jobs. Beware that using multiple processes on the same machine may cause memory issues for large datasets. verbose: int, optional (default=0) Verbosity level of the class. Possible values are: 0 to not print anything. 1 to print basic information. 2 to print detailed information. warnings: bool or str, optional (default=True) If True: Default warning action (equal to 'default' when string). If False: Suppress all warnings (equal to 'ignore' when string). If str: One of the possible actions in python's warnings environment. Note that changing this parameter will affect the PYTHONWARNINGS environment. Note that ATOM can't manage warnings that go directly from C++ code to the stdout/stderr. logger: bool, str, class or None, optional (default=None) If None: Doesn't save a logging file. If bool: True for logging file with default name. False for no logger. If str: Name of the logging file. 'auto' for default name. If class: python Logger object. Note that warnings will not be saved to the logger in any case. random_state: int or None, optional (default=None) Seed used by the random number generator.
If None, the random number generator is the RandomState instance used by numpy.random .","title":"ATOMRegressor"},{"location":"API/ATOM/atomregressor/#attributes","text":"","title":"Attributes"},{"location":"API/ATOM/atomregressor/#data-attributes","text":"The dataset within ATOM's pipeline can be accessed at any time through multiple properties, e.g. calling atom.train will return the training set. The data can also be changed through these properties, e.g. atom.test = atom.test.drop(0) will drop the first row from the test set. This will also update the other data attributes. Attributes: dataset: pd.DataFrame Complete dataset in the pipeline. train: pd.DataFrame Training set. test: pd.DataFrame Test set. X: pd.DataFrame Feature set. y: pd.Series Target column. X_train: pd.DataFrame Training features. y_train: pd.Series Training target. X_test: pd.DataFrame Test features. y_test: pd.Series Test target. shape: tuple Dataset's shape in the form (rows x columns). columns: list List of columns in the dataset. target: str Name of the target column.","title":"Data attributes"},{"location":"API/ATOM/atomregressor/#utility-attributes","text":"Attributes: profile: ProfileReport Profile created by pandas-profiling after calling the report method. genetic_features: pd.DataFrame Dataframe of the non-linear features created by the feature_generation method. Columns include: name: Name of the feature (automatically created). description: Operators used to create this feature. fitness: Fitness score. collinear: pd.DataFrame Dataframe of the collinear features removed by the feature_selection method. Columns include: drop_feature: name of the feature dropped by the method. correlated feature: Name of the correlated feature(s). correlation_value: Pearson correlation coefficient(s) of the feature pairs. models: list List of models in the pipeline. metric: str or list Metric(s) used to fit the models. errors: dict Dictionary of the encountered exceptions (if any). 
winner: model Model subclass that performed best on the test set. results: pd.DataFrame Dataframe of the training results with the model acronyms as index. Columns can include: name: Name of the model. metric_bo: Best score achieved during the BO. time_bo: Time spent on the BO. metric_train: Metric score on the training set. metric_test: Metric score on the test set. time_fit: Time spent fitting and evaluating. mean_bagging: Mean score of the bagging's results. std_bagging: Standard deviation score of the bagging's results. time_bagging: Time spent on the bagging algorithm. time: Total time spent on the whole run.","title":"Utility attributes"},{"location":"API/ATOM/atomregressor/#plot-attributes","text":"Attributes: style: str Plotting style. See seaborn's documentation . palette: str Color palette. See seaborn's documentation . title_fontsize: int Fontsize for the plot's title. label_fontsize: int Fontsize for labels and legends. tick_fontsize: int Fontsize for the ticks along the plot's axes.","title":"Plot attributes"},{"location":"API/ATOM/atomregressor/#utility-methods","text":"The ATOM class contains a variety of methods to help you handle the data and inspect the pipeline. clear Remove a model from the pipeline. log Save information to the logger and print to stdout. report Get an extensive profile analysis of the data. save Save the ATOMRegressor instance to a pickle file. scoring Print the scoring of the models for a specific metric. stats Print out a list of basic statistics on the dataset. method clear (models='all') [source] Removes all traces of a model in the pipeline (except for the errors attribute). If all models in the pipeline are removed, the metric is reset. Use this method to remove unwanted models from the pipeline or to clear memory before saving the instance. Parameters: models: str or sequence, optional (default='all') Model(s) to clear from the pipeline. If 'all', clear all models. 
method log (msg, level=0) [source] Write a message to the logger and print it to stdout. Parameters: msg: str Message to write to the logger and print to stdout. level: int, optional (default=0) Minimum verbosity level in order to print the message. method report (dataset='dataset', n_rows=None, filename=None) [source] Get an extensive profile analysis of the data. The report is rendered in HTML5 and CSS3 and saved to the profile attribute. Note that this method can be slow for n_rows > 10k. Parameters: dataset: str, optional (default='dataset') Name of the data set to get the profile from. n_rows: int or None, optional (default=None) Number of (randomly picked) rows to process. None for all rows. filename: str or None, optional (default=None) Name of the file when saved (as .html). None to not save anything. method save (filename=None, save_data=True) [source] Save the instance to a pickle file. Remember that the class contains the complete dataset as property, so the file can become large for big datasets! To avoid this, use save_data=False . Parameters: filename: str or None, optional (default=None) Name to save the file with. If None or 'auto', use the name of the class. save_data: bool, optional (default=True) Whether to save the data as an attribute of the instance. If False, remember to update the data immediately after loading the pickle using the dataset's @setter . method scoring (metric=None, dataset='test') [source] Print the scoring of the models for a specific metric. If a model shows a XXX , it means the metric failed for that specific model. This can happen if either the metric is unavailable for the task or if the model does not have a predict_proba method while the metric requires it. Parameters: metric: str or None, optional (default=None) Name of the metric to calculate. Choose from any of sklearn's SCORERS . If None, returns the final results for the model (ignores the dataset parameter).
dataset: str, optional (default='test') Data set on which to calculate the metric. Options are 'train' or 'test'. method stats () [source] Print out a list of basic information on the dataset.","title":"Utility methods"},{"location":"API/ATOM/atomregressor/#data-cleaning","text":"ATOM provides data cleaning methods to scale your features and handle missing values, categorical columns and outliers. Calling on one of them will automatically apply the method on the dataset in the pipeline. Tip Use the report method to examine the data and help you determine suitable parameters for the data cleaning methods. scale Scale all the features to mean=0 and std=1. impute Handle missing values in the dataset. encode Encode categorical features. outliers Remove or replace outliers in the training set. method scale () [source] Scale the features to mean=0 and std=1. method impute (strat_num='drop', strat_cat='drop', min_frac_rows=0.5, min_frac_cols=0.5, missing=None) [source] Handle missing values according to the selected strategy. Also removes rows and columns with too many missing values. The imputer is fitted only on the training set to avoid data leakage. See Imputer for a description of the parameters. Note that since the Imputer can remove rows from both train and test set, the set's sizes may change to keep ATOM's test_size ratio. method encode (strategy='LeaveOneOut', max_onehot=10, frac_to_other=None) [source] Perform encoding of categorical features. The encoding type depends on the number of unique values in the column: If n_unique=2, use Label-encoding. If 2 < n_unique <= max_onehot, use OneHot-encoding. If n_unique > max_onehot, use strategy -encoding. Also replaces classes with low occurrences with the value 'other' in order to prevent too high cardinality. Categorical features are defined as all columns whose dtype.kind not in 'ifu'. Will raise an error if it encounters missing values or unknown categories when transforming.
The encoder is fitted only on the training set to avoid data leakage. See Encoder for a description of the parameters. method outliers (strategy='drop', max_sigma=3, include_target=False) [source] Remove or replace outliers in the training set. Outliers are defined as values that lie further than max_sigma * standard_deviation away from the mean of the column. Only outliers from the training set are removed to maintain an original sample of target values in the test set. Ignores categorical columns. See Outliers for a description of the parameters.","title":"Data cleaning"},{"location":"API/ATOM/atomregressor/#feature-engineering","text":"To further pre-process the data you can create new non-linear features transforming the existing ones or, if your dataset is too large, remove features using one of the provided strategies. feature_generation Create new features from combinations of existing ones. feature_selection Remove features according to the selected strategy. method feature_generation (strategy='DFS', n_features=None, generations=20, population=500, operators=None) [source] Use Deep feature Synthesis or a genetic algorithm to create new combinations of existing features to capture the non-linear relations between the original features. See FeatureGenerator for a description of the parameters. Attributes created by the class are attached to the ATOM instance. method feature_selection (strategy=None, solver=None, n_features=None, max_frac_repeated=1., max_correlation=1., **kwargs) [source] Remove features according to the selected strategy. Ties between features with equal scores will be broken in an unspecified way. Also removes features with too low variance and finds pairs of collinear features based on the Pearson correlation coefficient. For each pair above the specified limit (in terms of absolute value), it removes one of the two. See FeatureSelector for a description of the parameters. 
Plotting methods and attributes created by the class are attached to the instance. Note When strategy='univariate' and solver=None, f_regression will be used as default solver. When strategy is one of 'SFM', 'RFE' or 'RFECV' and the solver is one of ATOM's models, the algorithm will automatically select the regressor (no need to add _reg to the solver). When strategy is one of 'SFM', 'RFE' or 'RFECV' and solver=None, ATOM will use the winning model (if it exists) as solver. When strategy='RFECV', ATOM will use the metric in the pipeline (if it exists) as the scoring parameter (only if not specified manually).","title":"Feature engineering"},{"location":"API/ATOM/atomregressor/#training","text":"The training methods are where the models are fitted to the data and their performance is evaluated according to the selected metric. ATOMRegressor contains three methods to call the training classes from the ATOM package. All relevant attributes and methods from the training classes are attached to ATOMRegressor for convenience. These include the errors, winner and results attributes, the models , and the prediction and plotting methods. run Fit the models to the data in a direct fashion. successive_halving Fit the models to the data in a successive halving fashion. train_sizing Fit the models to the data in a train sizing fashion. method run (models, metric=None, greater_is_better=True, needs_proba=False, needs_threshold=False, n_calls=10, n_initial_points=5, bo_params={}, bagging=None) [source] Runs a TrainerRegressor instance. Using this class through atom allows subsequent runs with different models without losing previous information. method successive_halving (models, metric=None, greater_is_better=True, needs_proba=False, needs_threshold=False, skip_iter=0, n_calls=0, n_initial_points=5, bo_params={}, bagging=None) [source] Runs a SuccessiveHalvingRegressor instance.
method train_sizing (models, metric=None, greater_is_better=True, needs_proba=False, needs_threshold=False, train_sizes=np.linspace(0.2, 1.0, 5), n_calls=0, n_initial_points=5, bo_params={}, bagging=None) [source] Runs a TrainSizingRegressor instance.","title":"Training"},{"location":"API/ATOM/atomregressor/#example","text":"from sklearn.datasets import load_boston from atom import ATOMRegressor X, y = load_boston(return_X_y=True) # Initialize class atom = ATOMRegressor(X, y, logger='auto', n_jobs=2, verbose=2) # Apply data cleaning methods atom.outliers(strategy='min_max', max_sigma=2, include_target=True) # Fit the models to the data atom.run(models=['OLS', 'BR', 'CatB'], metric='MSE', n_calls=25, n_initial_points=10, bo_params={'cv': 1}, bagging=4) # Analyze the results print(f\"The winning model is: {atom.winner.name}\") print(atom.results) # Make some plots atom.palette = 'Blues' atom.plot_errors(figsize=(9, 6), filename='errors.png') atom.CatB.plot_feature_importance(filename='catboost_feature_importance.png') # Run an extra model atom.run(models='MLP', metric='MSE', n_calls=25, n_initial_points=10, bo_params={'cv': 1}, bagging=4) # Get the predictions for the best model on new data predictions = atom.predict(X_new)","title":"Example"},{"location":"API/data_cleaning/balancer/","text":"Balancer class atom.data_cleaning. Balancer (strategy='ADASYN', n_jobs=1, verbose=0, logger=None, random_state=None, **kwargs) [source] Balance the number of rows per target category. Use only for classification tasks. This class can be accessed from atom through the balance method. Read more in the user guide . Parameters: strategy: str, optional (default='ADASYN') Type of algorithm to use for oversampling or undersampling. Choose from one of the estimators available in the imbalanced-learn package. n_jobs: int, optional (default=1) Number of cores to use for parallel processing. If >0: Number of cores to use. If -1: Use all available cores. 
If <-1: Use available_cores - 1 + n_jobs. Beware that using multiple processes on the same machine may cause memory issues for large datasets. verbose: int, optional (default=0) Verbosity level of the class. Possible values are: 0 to not print anything. 1 to print basic information. 2 to print detailed information. logger: bool, str, class or None, optional (default=None) If None: Doesn't save a logging file. If bool: True for logging file with default name. False for no logger. If str: Name of the logging file. 'auto' to create an automatic name. If class: python Logger object. random_state: int or None, optional (default=None) Seed used by the random number generator. If None, the random number generator is the RandomState instance used by numpy.random . **kwargs Additional keyword arguments passed to the strategy estimator. Attributes Attributes: : class Estimator instance (attribute name in all lowercase) used to oversample/undersample the data, e.g. balancer.adasyn for the default option. mapping: dict Dictionary of the target values mapped to their respective encoded integer. Methods fit_transform Same as transform. get_params Get parameters for this estimator. log Write information to the logger and print to stdout. save Save the instance to a pickle file. set_params Set the parameters of this estimator. transform Transform the data. method fit_transform (X, y) [source] Oversample or undersample the data. Parameters: X: dict, sequence, np.array or pd.DataFrame Data containing the features, with shape=(n_samples, n_features). y: int, str, sequence, np.array or pd.Series If int: Index of the target column in X. If string: Name of the target column in X. Else: Target column with shape=(n_samples,). Returns: X: pd.DataFrame Transformed feature set. y: pd.Series Transformed target column. method get_params (deep=True) [source] Get parameters for this estimator.
Parameters: deep: bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators. Returns: params: dict Dictionary of the parameter names mapped to their values. method log (msg, level=0) [source] Write a message to the logger and print it to stdout. Parameters: msg: str Message to write to the logger and print to stdout. level: int, optional (default=0) Minimum verbosity level in order to print the message. method save (filename=None) [source] Save the instance to a pickle file. Parameters: filename: str or None, optional (default=None) Name to save the file with. None to save with default name. method set_params (**params) [source] Set the parameters of this estimator. Parameters: **params: dict Estimator parameters. Returns: self: Balancer Estimator instance. method transform (X, y) [source] Oversample or undersample the data. Parameters: X: dict, sequence, np.array or pd.DataFrame Data containing the features, with shape=(n_samples, n_features). y: int, str, sequence, np.array or pd.Series If int: Index of the target column in X. If string: Name of the target column in X. Else: Target column with shape=(n_samples,). Returns: X: pd.DataFrame Transformed feature set. y: pd.Series Transformed target column. Example from atom import ATOMClassifier atom = ATOMClassifier(X, y) atom.balance(strategy='NearMiss', sampling_strategy=0.7, n_neighbors=10) or from atom.data_cleaning import Balancer balancer = Balancer(strategy='NearMiss', sampling_strategy=0.7, n_neighbors=10) X_train, y_train = balancer.transform(X_train, y_train)","title":"Balancer"},{"location":"API/data_cleaning/balancer/#balancer","text":"class atom.data_cleaning. Balancer (strategy='ADASYN', n_jobs=1, verbose=0, logger=None, random_state=None, **kwargs) [source] Balance the number of rows per target category. Use only for classification tasks. This class can be accessed from atom through the balance method. Read more in the user guide .
Parameters: strategy: str, optional (default='ADASYN') Type of algorithm to use for oversampling or undersampling. Choose from one of the estimators available in the imbalanced-learn package. n_jobs: int, optional (default=1) Number of cores to use for parallel processing. If >0: Number of cores to use. If -1: Use all available cores. If <-1: Use available_cores - 1 + n_jobs. Beware that using multiple processes on the same machine may cause memory issues for large datasets. verbose: int, optional (default=0) Verbosity level of the class. Possible values are: 0 to not print anything. 1 to print basic information. 2 to print detailed information. logger: bool, str, class or None, optional (default=None) If None: Doesn't save a logging file. If bool: True for logging file with default name. False for no logger. If str: Name of the logging file. 'auto' to create an automatic name. If class: python Logger object. random_state: int or None, optional (default=None) Seed used by the random number generator. If None, the random number generator is the RandomState instance used by numpy.random . **kwargs Additional keyword arguments passed to the strategy estimator.","title":"Balancer"},{"location":"API/data_cleaning/balancer/#attributes","text":"Attributes: : class Estimator instance (attribute name in all lowercase) used to oversample/undersample the data, e.g. balancer.adasyn for the default option. mapping: dict Dictionary of the target values mapped to their respective encoded integer.","title":"Attributes"},{"location":"API/data_cleaning/balancer/#methods","text":"fit_transform Same as transform. get_params Get parameters for this estimator. log Write information to the logger and print to stdout. save Save the instance to a pickle file. set_params Set the parameters of this estimator. transform Transform the data. method fit_transform (X, y) [source] Oversample or undersample the data. 
Parameters: X: dict, sequence, np.array or pd.DataFrame Data containing the features, with shape=(n_samples, n_features). y: int, str, sequence, np.array or pd.Series If int: Index of the target column in X. If string: Name of the target column in X. Else: Target column with shape=(n_samples,). Returns: X: pd.DataFrame Transformed feature set. y: pd.Series Transformed target column. method get_params (deep=True) [source] Get parameters for this estimator. Parameters: deep: bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators. Returns: params: dict Dictionary of the parameter names mapped to their values. method log (msg, level=0) [source] Write a message to the logger and print it to stdout. Parameters: msg: str Message to write to the logger and print to stdout. level: int, optional (default=0) Minimum verbosity level in order to print the message. method save (filename=None) [source] Save the instance to a pickle file. Parameters: filename: str or None, optional (default=None) Name to save the file with. None to save with default name. method set_params (**params) [source] Set the parameters of this estimator. Parameters: **params: dict Estimator parameters. Returns: self: Balancer Estimator instance. method transform (X, y) [source] Oversample or undersample the data. Parameters: X: dict, sequence, np.array or pd.DataFrame Data containing the features, with shape=(n_samples, n_features). y: int, str, sequence, np.array or pd.Series If int: Index of the target column in X. If string: Name of the target column in X. Else: Target column with shape=(n_samples,). Returns: X: pd.DataFrame Transformed feature set.
y: pd.Series Transformed target column.","title":"Methods"},{"location":"API/data_cleaning/balancer/#example","text":"from atom import ATOMClassifier atom = ATOMClassifier(X, y) atom.balance(strategy='NearMiss', sampling_strategy=0.7, n_neighbors=10) or from atom.data_cleaning import Balancer balancer = Balancer(strategy='NearMiss', sampling_strategy=0.7, n_neighbors=10) X_train, y_train = balancer.transform(X_train, y_train)","title":"Example"},{"location":"API/data_cleaning/encoder/","text":"Encoder class atom.data_cleaning. Encoder (strategy='LeaveOneOut', max_onehot=10, frac_to_other=None, verbose=0, logger=None, **kwargs) [source] Perform encoding of categorical features. The encoding type depends on the number of unique values in the column: If n_unique=2, use Label-encoding. If 2 < n_unique <= max_onehot, use OneHot-encoding. If n_unique > max_onehot, use strategy -encoding. Also replaces classes with low occurrences with the value other in order to prevent too high cardinality. Categorical features are defined as all columns whose dtype.kind not in ifu . Will raise an error if it encounters missing values or unknown categories when transforming. This class can be accessed from atom through the encode method. Read more in the user guide . Parameters: strategy: str, optional (default='LeaveOneOut') Type of encoding to use for high cardinality features. Choose from one of the estimators available in the category-encoders package except for: OneHotEncoder: Use the max_onehot parameter. HashingEncoder: Incompatibility of APIs. max_onehot: int or None, optional (default=10) Maximum number of unique values in a feature to perform one-hot-encoding. If None, it will always use strategy when n_unique > 2. frac_to_other: float, optional (default=None) Categories with fewer occurrences than n_rows * frac_to_other are replaced with the string other . If None, skip this step. verbose: int, optional (default=0) Verbosity level of the class.
Possible values are: 0 to not print anything. 1 to print basic information. 2 to print detailed information. logger: bool, str, class or None, optional (default=None) If None: Doesn't save a logging file. If bool: True for logging file with default name. False for no logger. If str: Name of the logging file. 'auto' to create an automatic name. If class: python Logger object. **kwargs Additional keyword arguments passed to the strategy estimator. Tip Use atom 's categorical attribute for a list of the categorical columns in the dataset. Methods fit Fit the class. fit_transform Fit the class and return the transformed data. get_params Get parameters for this estimator. log Write information to the logger and print to stdout. save Save the instance to a pickle file. set_params Set the parameters of this estimator. transform Transform the data. method fit (X, y) [source] Fit the class. Parameters: X: dict, sequence, np.array or pd.DataFrame Data containing the features, with shape=(n_samples, n_features). y: int, str, sequence, np.array or pd.Series If int: Index of the target column in X. If string: Name of the target column in X. Else: Target column with shape=(n_samples,). Returns: self: Encoder Fitted instance of self. method fit_transform (X, y) [source] Fit the Encoder and return the encoded data. Parameters: X: dict, sequence, np.array or pd.DataFrame Data containing the features, with shape=(n_samples, n_features). y: int, str, sequence, np.array, pd.Series If int: Index of the target column in X. If string: Name of the target column in X. Else: Target column with shape=(n_samples,). Returns: X: pd.DataFrame Transformed feature set. method get_params (deep=True) [source] Get parameters for this estimator. Parameters: deep: bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators. Returns: params: dict Dictionary of the parameter names mapped to their values. 
method log (msg, level=0) [source] Write a message to the logger and print it to stdout. Parameters: msg: str Message to write to the logger and print to stdout. level: int, optional (default=0) Minimum verbosity level in order to print the message. method save (filename=None) [source] Save the instance to a pickle file. Parameters: filename: str or None, optional (default=None) Name to save the file with. None to save with default name. method set_params (**params) [source] Set the parameters of this estimator. Parameters: **params: dict Estimator parameters. Returns: self: Encoder Estimator instance. method transform (X, y=None) [source] Encode the data. Parameters: X: dict, sequence, np.array or pd.DataFrame Data containing the features, with shape=(n_samples, n_features). y: int, str, sequence, np.array, pd.Series or None, optional (default=None) Does nothing. Implemented for continuity of the API. Returns: X: pd.DataFrame Transformed feature set. Example from atom import ATOMClassifier atom = ATOMClassifier(X, y) atom.encode(strategy='CatBoost', max_onehot=5) or from atom.data_cleaning import Encoder encoder = Encoder(strategy='CatBoost', max_onehot=5) encoder.fit(X_train, y_train) X = encoder.transform(X)","title":"Encoder"},{"location":"API/data_cleaning/encoder/#encoder","text":"class atom.data_cleaning. Encoder (strategy='LeaveOneOut', max_onehot=10, frac_to_other=None, verbose=0, logger=None, **kwargs) [source] Perform encoding of categorical features. The encoding type depends on the number of unique values in the column: If n_unique=2, use Label-encoding. If 2 < n_unique <= max_onehot, use OneHot-encoding. If n_unique > max_onehot, use strategy -encoding. Also replaces classes with low occurrences with the value other in order to prevent too high cardinality. Categorical features are defined as all columns whose dtype.kind not in ifu . Will raise an error if it encounters missing values or unknown categories when transforming. 
This class can be accessed from atom through the encode method. Read more in the user guide . Parameters: strategy: str, optional (default='LeaveOneOut') Type of encoding to use for high cardinality features. Choose from one of the estimators available in the category-encoders package except for: OneHotEncoder: Use the max_onehot parameter. HashingEncoder: Incompatibility of APIs. max_onehot: int or None, optional (default=10) Maximum number of unique values in a feature to perform one-hot-encoding. If None, it will always use strategy when n_unique > 2. frac_to_other: float, optional (default=None) Categories with fewer occurrences than n_rows * frac_to_other are replaced with the string other . If None, skip this step. verbose: int, optional (default=0) Verbosity level of the class. Possible values are: 0 to not print anything. 1 to print basic information. 2 to print detailed information. logger: bool, str, class or None, optional (default=None) If None: Doesn't save a logging file. If bool: True for logging file with default name. False for no logger. If str: Name of the logging file. 'auto' to create an automatic name. If class: python Logger object. **kwargs Additional keyword arguments passed to the strategy estimator. Tip Use atom 's categorical attribute for a list of the categorical columns in the dataset.","title":"Encoder"},{"location":"API/data_cleaning/encoder/#methods","text":"fit Fit the class. fit_transform Fit the class and return the transformed data. get_params Get parameters for this estimator. log Write information to the logger and print to stdout. save Save the instance to a pickle file. set_params Set the parameters of this estimator. transform Transform the data. method fit (X, y) [source] Fit the class. Parameters: X: dict, sequence, np.array or pd.DataFrame Data containing the features, with shape=(n_samples, n_features). y: int, str, sequence, np.array or pd.Series If int: Index of the target column in X.
If string: Name of the target column in X. Else: Target column with shape=(n_samples,). Returns: self: Encoder Fitted instance of self. method fit_transform (X, y) [source] Fit the Encoder and return the encoded data. Parameters: X: dict, sequence, np.array or pd.DataFrame Data containing the features, with shape=(n_samples, n_features). y: int, str, sequence, np.array, pd.Series If int: Index of the target column in X. If string: Name of the target column in X. Else: Target column with shape=(n_samples,). Returns: X: pd.DataFrame Transformed feature set. method get_params (deep=True) [source] Get parameters for this estimator. Parameters: deep: bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators. Returns: params: dict Dictionary of the parameter names mapped to their values. method log (msg, level=0) [source] Write a message to the logger and print it to stdout. Parameters: msg: str Message to write to the logger and print to stdout. level: int, optional (default=0) Minimum verbosity level in order to print the message. method save (filename=None) [source] Save the instance to a pickle file. Parameters: filename: str or None, optional (default=None) Name to save the file with. None to save with default name. method set_params (**params) [source] Set the parameters of this estimator. Parameters: **params: dict Estimator parameters. Returns: self: Encoder Estimator instance. method transform (X, y=None) [source] Encode the data. Parameters: X: dict, sequence, np.array or pd.DataFrame Data containing the features, with shape=(n_samples, n_features). y: int, str, sequence, np.array, pd.Series or None, optional (default=None) Does nothing. Implemented for continuity of the API. 
Returns: X: pd.DataFrame Transformed feature set.","title":"Methods"},{"location":"API/data_cleaning/encoder/#example","text":"from atom import ATOMClassifier atom = ATOMClassifier(X, y) atom.encode(strategy='CatBoost', max_onehot=5) or from atom.data_cleaning import Encoder encoder = Encoder(strategy='CatBoost', max_onehot=5) encoder.fit(X_train, y_train) X = encoder.transform(X)","title":"Example"},{"location":"API/data_cleaning/imputer/","text":"Imputer class atom.data_cleaning. Imputer (strat_num='drop', strat_cat='drop', min_frac_rows=0.5, min_frac_cols=0.5, missing=None, verbose=0, logger=None) [source] Impute or remove missing values according to the selected strategy. Also removes rows and columns with too many missing values. This class can be accessed from atom through the impute method. Read more in the user guide . Parameters: strat_num: str, int or float, optional (default='drop') Imputing strategy for numerical columns. Choose from: 'drop': Drop rows containing missing values. 'mean': Impute with mean of column. 'median': Impute with median of column. 'knn': Impute using a K-Nearest Neighbors approach. 'most_frequent': Impute with most frequent value. int or float: Impute with provided numerical value. strat_cat: str, optional (default='drop') Imputing strategy for categorical columns. Choose from: 'drop': Drop rows containing missing values. 'most_frequent': Impute with most frequent value. str: Impute with provided string. min_frac_rows: float, optional (default=0.5) Minimum fraction of non-missing values in a row. If less, the row is removed. min_frac_cols: float, optional (default=0.5) Minimum fraction of non-missing values in a column. If less, the column is removed. missing: int, float or list, optional (default=None) List of values to treat as 'missing'. None to use the default values: [None, np.NaN, np.inf, -np.inf, '', '?', 'NA', 'nan', 'None', 'inf']. 
Note that np.NaN , None , np.inf and -np.inf will always be imputed since they are incompatible with most estimators. verbose: int, optional (default=0) Verbosity level of the class. Possible values are: 0 to not print anything. 1 to print basic information. 2 to print detailed information. logger: bool, str, class or None, optional (default=None) If None: Doesn't save a logging file. If bool: True for logging file with default name. False for no logger. If str: Name of the logging file. 'auto' to create an automatic name. If class: python Logger object. Tip Use atom 's missing attribute for an overview of the missing values in the dataset. Methods fit Fit the class. fit_transform Fit the class and return the transformed data. get_params Get parameters for this estimator. log Write information to the logger and print to stdout. save Save the instance to a pickle file. set_params Set the parameters of this estimator. transform Transform the data. method fit (X, y=None) [source] Fit the class. Parameters: X: dict, sequence, np.array or pd.DataFrame Data containing the features, with shape=(n_samples, n_features). y: int, str, sequence, np.array, pd.Series or None, optional (default=None) Does nothing. Implemented for continuity of the API. Returns: self: Imputer Fitted instance of self. method fit_transform (X, y=None) [source] Fit the Imputer and return the imputed data. Warning Leaving y=None can lead to inconsistencies in data length between X and y if rows are dropped during the transformation. Parameters: X: dict, sequence, np.array or pd.DataFrame Data containing the features, with shape=(n_samples, n_features). y: int, str, sequence, np.array or pd.Series If None: y is ignored in the transformation. If int: Index of the target column in X. If string: Name of the target column in X. Else: Target column with shape=(n_samples,). Returns: X: pd.DataFrame Transformed feature set. y: pd.Series Transformed target column. Only returned if provided. 
method get_params (deep=True) [source] Get parameters for this estimator. Parameters: deep: bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators. Returns: params: dict Dictionary of the parameter names mapped to their values. method log (msg, level=0) [source] Write a message to the logger and print it to stdout. Parameters: msg: str Message to write to the logger and print to stdout. level: int, optional (default=0) Minimum verbosity level in order to print the message. method save (filename=None) [source] Save the instance to a pickle file. Parameters: filename: str or None, optional (default=None) Name to save the file with. None to save with default name. method set_params (**params) [source] Set the parameters of this estimator. Parameters: **params: dict Estimator parameters. Returns: self: Imputer Estimator instance. method transform (X, y=None) [source] Impute the data. Warning Leaving y=None can lead to inconsistencies in data length between X and y if rows are dropped during the transformation. Parameters: X: dict, sequence, np.array or pd.DataFrame Data containing the features, with shape=(n_samples, n_features). y: int, str, sequence, np.array or pd.Series If None: y is ignored in the transformation. If int: Index of the target column in X. If string: Name of the target column in X. Else: Target column with shape=(n_samples,). Returns: X: pd.DataFrame Transformed feature set. y: pd.Series Transformed target column. Only returned if provided. Example from atom import ATOMClassifier atom = ATOMClassifier(X, y) atom.impute(strat_num='knn', strat_cat='drop', min_frac_cols=0.8) or from atom.data_cleaning import Imputer imputer = Imputer(strat_num='knn', strat_cat='drop', min_frac_cols=0.8) imputer.fit(X_train, y_train) X = imputer.transform(X)","title":"Imputer"},{"location":"API/data_cleaning/imputer/#imputer","text":"class atom.data_cleaning.
Imputer (strat_num='drop', strat_cat='drop', min_frac_rows=0.5, min_frac_cols=0.5, missing=None, verbose=0, logger=None) [source] Impute or remove missing values according to the selected strategy. Also removes rows and columns with too many missing values. This class can be accessed from atom through the impute method. Read more in the user guide . Parameters: strat_num: str, int or float, optional (default='drop') Imputing strategy for numerical columns. Choose from: 'drop': Drop rows containing missing values. 'mean': Impute with mean of column. 'median': Impute with median of column. 'knn': Impute using a K-Nearest Neighbors approach. 'most_frequent': Impute with most frequent value. int or float: Impute with provided numerical value. strat_cat: str, optional (default='drop') Imputing strategy for categorical columns. Choose from: 'drop': Drop rows containing missing values. 'most_frequent': Impute with most frequent value. str: Impute with provided string. min_frac_rows: float, optional (default=0.5) Minimum fraction of non-missing values in a row. If less, the row is removed. min_frac_cols: float, optional (default=0.5) Minimum fraction of non-missing values in a column. If less, the column is removed. missing: int, float or list, optional (default=None) List of values to treat as 'missing'. None to use the default values: [None, np.NaN, np.inf, -np.inf, '', '?', 'NA', 'nan', 'None', 'inf']. Note that np.NaN , None , np.inf and -np.inf will always be imputed since they are incompatible with most estimators. verbose: int, optional (default=0) Verbosity level of the class. Possible values are: 0 to not print anything. 1 to print basic information. 2 to print detailed information. logger: bool, str, class or None, optional (default=None) If None: Doesn't save a logging file. If bool: True for logging file with default name. False for no logger. If str: Name of the logging file. 'auto' to create an automatic name. If class: python Logger object. 
Tip Use atom 's missing attribute for an overview of the missing values in the dataset.","title":"Imputer"},{"location":"API/data_cleaning/imputer/#methods","text":"fit Fit the class. fit_transform Fit the class and return the transformed data. get_params Get parameters for this estimator. log Write information to the logger and print to stdout. save Save the instance to a pickle file. set_params Set the parameters of this estimator. transform Transform the data. method fit (X, y=None) [source] Fit the class. Parameters: X: dict, sequence, np.array or pd.DataFrame Data containing the features, with shape=(n_samples, n_features). y: int, str, sequence, np.array, pd.Series or None, optional (default=None) Does nothing. Implemented for continuity of the API. Returns: self: Imputer Fitted instance of self. method fit_transform (X, y=None) [source] Fit the Imputer and return the imputed data. Warning Leaving y=None can lead to inconsistencies in data length between X and y if rows are dropped during the transformation. Parameters: X: dict, sequence, np.array or pd.DataFrame Data containing the features, with shape=(n_samples, n_features). y: int, str, sequence, np.array or pd.Series If None: y is ignored in the transformation. If int: Index of the target column in X. If string: Name of the target column in X. Else: Target column with shape=(n_samples,). Returns: X: pd.DataFrame Transformed feature set. y: pd.Series Transformed target column. Only returned if provided. method get_params (deep=True) [source] Get parameters for this estimator. Parameters: deep: bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators. Returns: params: dict Dictionary of the parameter names mapped to their values. method log (msg, level=0) [source] Write a message to the logger and print it to stdout. Parameters: msg: str Message to write to the logger and print to stdout. 
level: int, optional (default=0) Minimum verbosity level in order to print the message. method save (filename=None) [source] Save the instance to a pickle file. Parameters: filename: str or None, optional (default=None) Name to save the file with. None to save with default name. method set_params (**params) [source] Set the parameters of this estimator. Parameters: **params: dict Estimator parameters. Returns: self: imputer Estimator instance. method transform (X, y=None) [source] Impute the data. Warning Leaving y=None can lead to inconsistencies in data length between X and y if rows are dropped during the transformation. Parameters: X: dict, sequence, np.array or pd.DataFrame Data containing the features, with shape=(n_samples, n_features). y: int, str, sequence, np.array or pd.Series If None: y is ignored in the transformation. If int: Index of the target column in X. If string: Name of the target column in X. Else: Target column with shape=(n_samples,) Returns: X: pd.DataFrame Transformed feature set. y: pd.Series Transformed target column. Only returned if provided.","title":"Methods"},{"location":"API/data_cleaning/imputer/#example","text":"from atom import ATOMClassifier atom = ATOMClassifier(X, y) atom.impute(strat_num='knn', strat_cat='drop', min_frac_cols=0.8) or from atom.data_cleaning import Imputer imputer = Imputer(strat_num='knn', strat_cat='drop', min_frac_cols=0.8) imputer.fit(X_train, y_train) X = imputer.transform(X)","title":"Example"},{"location":"API/data_cleaning/outliers/","text":"Outliers class atom.data_cleaning. Outliers (strategy='drop', max_sigma=3, include_target=False, verbose=0, logger=None) [source] Remove or replace outliers in the data. Outliers are defined as values that lie further than max_sigma * standard_deviation away from the mean of the column. Ignores categorical columns. This class can be accessed from atom through the outliers method. Read more in the user guide . 
Parameters: strategy: int, float or str, optional (default='drop') Strategy to apply on the outliers. Choose from: 'drop': Drop any row with outliers. 'min_max': Replace the outlier with the min or max of the column. Any numerical value with which to replace the outliers. max_sigma: int or float, optional (default=3) Maximum allowed standard deviations from the mean of the column. If more, it is considered an outlier. include_target: bool, optional (default=False) Whether to include the target column in the transformation. This can be useful for regression tasks. verbose: int, optional (default=0) Verbosity level of the class. Possible values are: 0 to not print anything. 1 to print basic information. 2 to print detailed information. logger: bool, str, class or None, optional (default=None) If None: Doesn't save a logging file. If bool: True for logging file with default name. False for no logger. If str: Name of the logging file. 'auto' to create an automatic name. If class: python Logger object. Methods fit_transform Same as transform. get_params Get parameters for this estimator. log Write information to the logger and print to stdout. save Save the instance to a pickle file. set_params Set the parameters of this estimator. transform Transform the data. method fit_transform (X, y=None) [source] Apply the outlier strategy on the data. Parameters: X: dict, sequence, np.array or pd.DataFrame Data containing the features, with shape=(n_samples, n_features). y: int, str, sequence, np.array, pd.Series or None, optional (default=None) If None: y is ignored in the transformation. If int: Index of the target column in X. If string: Name of the target column in X. Else: Target column with shape=(n_samples,). Returns: X: pd.DataFrame Transformed feature set. X: pd.Series Transformed target column. Only returned if provided. method get_params (deep=True) [source] Get parameters for this estimator. 
Parameters: deep: bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators. Returns: params: dict Dictionary of the parameter names mapped to their values. method log (msg, level=0) [source] Write a message to the logger and print it to stdout. Parameters: msg: str Message to write to the logger and print to stdout. level: int, optional (default=0) Minimum verbosity level in order to print the message. method save (filename=None) [source] Save the instance to a pickle file. Parameters: filename: str or None, optional (default=None) Name to save the file with. None to save with default name. method set_params (**params) [source] Set the parameters of this estimator. Parameters: **params: dict Estimator parameters. Returns: self: Outliers Estimator instance. method transform (X, y=None) [source] Apply the outlier strategy on the data. Parameters: X: dict, sequence, np.array or pd.DataFrame Data containing the features, with shape=(n_samples, n_features). y: int, str, sequence, np.array, pd.Series or None, optional (default=None) If None: y is ignored in the transformation. If int: Index of the target column in X. If string: Name of the target column in X. Else: Target column with shape=(n_samples,). Returns: X: pd.DataFrame Transformed feature set. X: pd.Series Transformed target column. Only returned if provided. Example from atom import ATOMRegressor atom = ATOMRegressor(X, y) atom.outliers(strategy='min_max', max_sigma=2, include_target=True) or from atom.data_cleaning import Outliers outliers = Outliers(strategy='min_max', max_sigma=2, include_target=True) X_train, y_train = outliers.transform(X_train, y_train)","title":"Outliers"},{"location":"API/data_cleaning/outliers/#outliers","text":"class atom.data_cleaning. Outliers (strategy='drop', max_sigma=3, include_target=False, verbose=0, logger=None) [source] Remove or replace outliers in the data. 
Outliers are defined as values that lie further than max_sigma * standard_deviation away from the mean of the column. Ignores categorical columns. This class can be accessed from atom through the outliers method. Read more in the user guide . Parameters: strategy: int, float or str, optional (default='drop') Strategy to apply on the outliers. Choose from: 'drop': Drop any row with outliers. 'min_max': Replace the outlier with the min or max of the column. Any numerical value with which to replace the outliers. max_sigma: int or float, optional (default=3) Maximum allowed standard deviations from the mean of the column. If more, it is considered an outlier. include_target: bool, optional (default=False) Whether to include the target column in the transformation. This can be useful for regression tasks. verbose: int, optional (default=0) Verbosity level of the class. Possible values are: 0 to not print anything. 1 to print basic information. 2 to print detailed information. logger: bool, str, class or None, optional (default=None) If None: Doesn't save a logging file. If bool: True for logging file with default name. False for no logger. If str: Name of the logging file. 'auto' to create an automatic name. If class: python Logger object.","title":"Outliers"},{"location":"API/data_cleaning/outliers/#methods","text":"fit_transform Same as transform. get_params Get parameters for this estimator. log Write information to the logger and print to stdout. save Save the instance to a pickle file. set_params Set the parameters of this estimator. transform Transform the data. method fit_transform (X, y=None) [source] Apply the outlier strategy on the data. Parameters: X: dict, sequence, np.array or pd.DataFrame Data containing the features, with shape=(n_samples, n_features). y: int, str, sequence, np.array, pd.Series or None, optional (default=None) If None: y is ignored in the transformation. If int: Index of the target column in X. If string: Name of the target column in X. 
Else: Target column with shape=(n_samples,). Returns: X: pd.DataFrame Transformed feature set. X: pd.Series Transformed target column. Only returned if provided. method get_params (deep=True) [source] Get parameters for this estimator. Parameters: deep: bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators. Returns: params: dict Dictionary of the parameter names mapped to their values. method log (msg, level=0) [source] Write a message to the logger and print it to stdout. Parameters: msg: str Message to write to the logger and print to stdout. level: int, optional (default=0) Minimum verbosity level in order to print the message. method save (filename=None) [source] Save the instance to a pickle file. Parameters: filename: str or None, optional (default=None) Name to save the file with. None to save with default name. method set_params (**params) [source] Set the parameters of this estimator. Parameters: **params: dict Estimator parameters. Returns: self: Outliers Estimator instance. method transform (X, y=None) [source] Apply the outlier strategy on the data. Parameters: X: dict, sequence, np.array or pd.DataFrame Data containing the features, with shape=(n_samples, n_features). y: int, str, sequence, np.array, pd.Series or None, optional (default=None) If None: y is ignored in the transformation. If int: Index of the target column in X. If string: Name of the target column in X. Else: Target column with shape=(n_samples,). Returns: X: pd.DataFrame Transformed feature set. X: pd.Series Transformed target column. 
Only returned if provided.","title":"Methods"},{"location":"API/data_cleaning/outliers/#example","text":"from atom import ATOMRegressor atom = ATOMRegressor(X, y) atom.outliers(strategy='min_max', max_sigma=2, include_target=True) or from atom.data_cleaning import Outliers outliers = Outliers(strategy='min_max', max_sigma=2, include_target=True) X_train, y_train = outliers.transform(X_train, y_train)","title":"Example"},{"location":"API/data_cleaning/scaler/","text":"Scaler class atom.data_cleaning. Scaler (verbose=0, logger=None) [source] Scales data to mean=0 and std=1. This method is equal to sklearn's StandardScaler except that it returns a dataframe when provided. This class can be accessed from atom through the scale method. Read more in the user guide . Parameters: verbose: int, optional (default=0) Verbosity level of the class. Possible values are: 0 to not print anything. 1 to print basic information. 2 to print detailed information. logger: bool, str, class or None, optional (default=None) If None: Doesn't save a logging file. If bool: True for logging file with default name. False for no logger. If str: Name of the logging file. 'auto' to create an automatic name. If class: python Logger object. Methods fit Fit the class. fit_transform Fit the class and return the transformed data. get_params Get parameters for this estimator. log Write information to the logger and print to stdout. save Save the instance to a pickle file. set_params Set the parameters of this estimator. transform Transform the data. method fit (X, y=None) [source] Fit the class. Parameters: X: dict, sequence, np.array or pd.DataFrame Data containing the features, with shape=(n_samples, n_features). y: int, str, sequence, np.array or pd.Series, optional (default=None) Does nothing. Implemented for continuity of the API. Returns: self: Scaler Fitted instance of self. method fit_transform (X, y=None) [source] Fit the Scaler and return the scaled data. 
Parameters: X: dict, sequence, np.array or pd.DataFrame Data containing the features, with shape=(n_samples, n_features). y: int, str, sequence, np.array or pd.Series, optional (default=None) Does nothing. Implemented for continuity of the API. Returns: X: pd.DataFrame Scaled feature set. method get_params (deep=True) [source] Get parameters for this estimator. Parameters: deep: bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators. Returns: params: dict Dictionary of the parameter names mapped to their values. method log (msg, level=0) [source] Write a message to the logger and print it to stdout. Parameters: msg: str Message to write to the logger and print to stdout. level: int, optional (default=0) Minimum verbosity level in order to print the message. method save (filename=None) [source] Save the instance to a pickle file. Parameters: filename: str or None, optional (default=None) Name to save the file with. None to save with default name. method set_params (**params) [source] Set the parameters of this estimator. Parameters: **params: dict Estimator parameters. Returns: self: Scaler Estimator instance. method transform (X, y=None) [source] Scale the data. Parameters: X: dict, sequence, np.array or pd.DataFrame Data containing the features, with shape=(n_samples, n_features). y: int, str, sequence, np.array or pd.Series, optional (default=None) Does nothing. Implemented for continuity of the API. Returns: X: pd.DataFrame Scaled feature set. Example from atom import ATOMRegressor atom = ATOMRegressor(X, y) atom.scale() or from atom.data_cleaning import Scaler scaler = Scaler() scaler.fit(X_train) X = scaler.transform(X)","title":"Scaler"},{"location":"API/data_cleaning/scaler/#scaler","text":"class atom.data_cleaning. Scaler (verbose=0, logger=None) [source] Scales data to mean=0 and std=1. This method is equal to sklearn's StandardScaler except that it returns a dataframe when provided. 
This class can be accessed from atom through the scale method. Read more in the user guide . Parameters: verbose: int, optional (default=0) Verbosity level of the class. Possible values are: 0 to not print anything. 1 to print basic information. 2 to print detailed information. logger: bool, str, class or None, optional (default=None) If None: Doesn't save a logging file. If bool: True for logging file with default name. False for no logger. If str: Name of the logging file. 'auto' to create an automatic name. If class: python Logger object.","title":"Scaler"},{"location":"API/data_cleaning/scaler/#methods","text":"fit Fit the class. fit_transform Fit the class and return the transformed data. get_params Get parameters for this estimator. log Write information to the logger and print to stdout. save Save the instance to a pickle file. set_params Set the parameters of this estimator. transform Transform the data. method fit (X, y=None) [source] Fit the class. Parameters: X: dict, sequence, np.array or pd.DataFrame Data containing the features, with shape=(n_samples, n_features). y: int, str, sequence, np.array or pd.Series, optional (default=None) Does nothing. Implemented for continuity of the API. Returns: self: Scaler Fitted instance of self. method fit_transform (X, y=None) [source] Fit the Scaler and return the scaled data. Parameters: X: dict, sequence, np.array or pd.DataFrame Data containing the features, with shape=(n_samples, n_features). y: int, str, sequence, np.array or pd.Series, optional (default=None) Does nothing. Implemented for continuity of the API. Returns: X: pd.DataFrame Scaled feature set. method get_params (deep=True) [source] Get parameters for this estimator. Parameters: deep: bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators. Returns: params: dict Dictionary of the parameter names mapped to their values. 
method log (msg, level=0) [source] Write a message to the logger and print it to stdout. Parameters: msg: str Message to write to the logger and print to stdout. level: int, optional (default=0) Minimum verbosity level in order to print the message. method save (filename=None) [source] Save the instance to a pickle file. Parameters: filename: str or None, optional (default=None) Name to save the file with. None to save with default name. method set_params (**params) [source] Set the parameters of this estimator. Parameters: **params: dict Estimator parameters. Returns: self: Scaler Estimator instance. method transform (X, y=None) [source] Scale the data. Parameters: X: dict, sequence, np.array or pd.DataFrame Data containing the features, with shape=(n_samples, n_features). y: int, str, sequence, np.array or pd.Series, optional (default=None) Does nothing. Implemented for continuity of the API. Returns: X: pd.DataFrame Scaled feature set.","title":"Methods"},{"location":"API/data_cleaning/scaler/#example","text":"from atom import ATOMRegressor atom = ATOMRegressor(X, y) atom.scale() or from atom.data_cleaning import Scaler scaler = Scaler() scaler.fit(X_train) X = scaler.transform(X)","title":"Example"},{"location":"API/data_cleaning/standard_cleaner/","text":"StandardCleaner class atom.data_cleaning. StandardCleaner (prohibited_types=[], strip_categorical=True, maximum_cardinality=True, minimum_cardinality=True, missing_target=True, map_target=None, verbose=0, logger=None) [source] Performs standard data cleaning steps on a dataset. Use the parameters to choose which transformations to perform. The available steps are: Remove columns with prohibited data types. Strip categorical features from white spaces. Remove categorical columns with maximal cardinality. Remove columns with minimum cardinality. Remove rows with missing values in the target column. Label-encode the target column. This class is automatically called when initializing atom . 
Read more in the user guide . Parameters: prohibited_types: str or sequence, optional (default=[]) Columns with any of these types will be removed from the dataset. strip_categorical: bool, optional (default=True) Whether to strip the spaces from values in the categorical columns. maximum_cardinality: bool, optional (default=True) Whether to remove categorical columns with maximum cardinality, i.e. the number of unique values is equal to the number of instances. Usually the case for names, IDs, etc... minimum_cardinality: bool, optional (default=True) Whether to remove columns with minimum cardinality, i.e. all values in the column are the same. missing_target: bool, optional (default=True) Whether to remove rows with missing values in the target column. Ignored if y is not provided. map_target: bool or None, optional (default=None) Whether to map the target column to numerical values. Should only be used for classification tasks. If None, infer task from the provided target column and set to True if it is classification. Ignored if y is not provided or if it already consists of ordered integers. verbose: int, optional (default=0) Verbosity level of the class. Possible values are: 0 to not print anything. 1 to print basic information. 2 to print detailed information. logger: bool, str, class or None, optional (default=None) If None: Doesn't save a logging file. If bool: True for logging file with default name. False for no logger. If str: Name of the logging file. 'auto' to create an automatic name. If class: python Logger object. Attributes Attributes: mapping: dict Dictionary of the target values mapped to their respective encoded integer. Only available if map_target was performed. Methods fit_transform Same as transform. get_params Get parameters for this estimator. log Write information to the logger and print to stdout. save Save the instance to a pickle file. set_params Set the parameters of this estimator. transform Transform the data. 
method fit_transform (X, y=None) [source] Apply the data cleaning steps on the data. Parameters: X: dict, sequence, np.array or pd.DataFrame Data containing the features, with shape=(n_samples, n_features). y: int, str, sequence, np.array, pd.Series or None, optional (default=None) If None: y is ignored in the transformation. If int: Index of the target column in X. If string: Name of the target column in X. Else: Target column with shape=(n_samples,). Returns: X: pd.DataFrame Transformed feature set. y: pd.Series Transformed target column. Only returned if provided. method get_params (deep=True) [source] Get parameters for this estimator. Parameters: deep: bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators. Returns: params: dict Dictionary of the parameter names mapped to their values. method log (msg, level=0) [source] Write a message to the logger and print it to stdout. Parameters: msg: str Message to write to the logger and print to stdout. level: int, optional (default=0) Minimum verbosity level in order to print the message. method save (filename=None) [source] Save the instance to a pickle file. Parameters: filename: str or None, optional (default=None) Name to save the file with. None to save with default name. method set_params (**params) [source] Set the parameters of this estimator. Parameters: **params: dict Estimator parameters. Returns: self: StandardCleaner Estimator instance. method transform (X, y=None) [source] Apply the data cleaning steps on the data. Parameters: X: dict, sequence, np.array or pd.DataFrame Data containing the features, with shape=(n_samples, n_features). y: int, str, sequence, np.array or pd.Series, optional (default=None) If None: y is ignored in the transformation. If int: Index of the target column in X. If string: Name of the target column in X. Else: Target column with shape=(n_samples,). Returns: X: pd.DataFrame Transformed feature set. 
y: pd.Series Transformed target column. Only returned if provided. Example from atom import ATOMClassifier # ATOM's initializer calls StandardCleaner automatically atom = ATOMClassifier(X, y) or from atom.data_cleaning import StandardCleaner cleaner = StandardCleaner(prohibited_types=['str'], missing_target=True) X, y = cleaner.transform(X, y)","title":"StandardCleaner"},{"location":"API/data_cleaning/standard_cleaner/#standardcleaner","text":"class atom.data_cleaning. StandardCleaner (prohibited_types=[], strip_categorical=True, maximum_cardinality=True, minimum_cardinality=True, missing_target=True, map_target=None, verbose=0, logger=None) [source] Performs standard data cleaning steps on a dataset. Use the parameters to choose which transformations to perform. The available steps are: Remove columns with prohibited data types. Strip categorical features from white spaces. Remove categorical columns with maximal cardinality. Remove columns with minimum cardinality. Remove rows with missing values in the target column. Label-encode the target column. This class is automatically called when initializing atom . Read more in the user guide . Parameters: prohibited_types: str or sequence, optional (default=[]) Columns with any of these types will be removed from the dataset. strip_categorical: bool, optional (default=True) Whether to strip the spaces from values in the categorical columns. maximum_cardinality: bool, optional (default=True) Whether to remove categorical columns with maximum cardinality, i.e. the number of unique values is equal to the number of instances. Usually the case for names, IDs, etc... minimum_cardinality: bool, optional (default=True) Whether to remove columns with minimum cardinality, i.e. all values in the column are the same. missing_target: bool, optional (default=True) Whether to remove rows with missing values in the target column. Ignored if y is not provided. 
map_target: bool or None, optional (default=None) Whether to map the target column to numerical values. Should only be used for classification tasks. If None, infer task from the provided target column and set to True if it is classification. Ignored if y is not provided or if it already consists of ordered integers. verbose: int, optional (default=0) Verbosity level of the class. Possible values are: 0 to not print anything. 1 to print basic information. 2 to print detailed information. logger: bool, str, class or None, optional (default=None) If None: Doesn't save a logging file. If bool: True for logging file with default name. False for no logger. If str: Name of the logging file. 'auto' to create an automatic name. If class: python Logger object.","title":"StandardCleaner"},{"location":"API/data_cleaning/standard_cleaner/#attributes","text":"Attributes: mapping: dict Dictionary of the target values mapped to their respective encoded integer. Only available if map_target was performed.","title":"Attributes"},{"location":"API/data_cleaning/standard_cleaner/#methods","text":"fit_transform Same as transform. get_params Get parameters for this estimator. log Write information to the logger and print to stdout. save Save the instance to a pickle file. set_params Set the parameters of this estimator. transform Transform the data. method fit_transform (X, y=None) [source] Apply the data cleaning steps on the data. Parameters: X: dict, sequence, np.array or pd.DataFrame Data containing the features, with shape=(n_samples, n_features). y: int, str, sequence, np.array, pd.Series or None, optional (default=None) If None: y is ignored in the transformation. If int: Index of the target column in X. If string: Name of the target column in X. Else: Target column with shape=(n_samples,). Returns: X: pd.DataFrame Transformed feature set. y: pd.Series Transformed target column. Only returned if provided. method get_params (deep=True) [source] Get parameters for this estimator. 
Parameters: deep: bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators. Returns: params: dict Dictionary of the parameter names mapped to their values. method log (msg, level=0) [source] Write a message to the logger and print it to stdout. Parameters: msg: str Message to write to the logger and print to stdout. level: int, optional (default=0) Minimum verbosity level in order to print the message. method save (filename=None) [source] Save the instance to a pickle file. Parameters: filename: str or None, optional (default=None) Name to save the file with. None to save with default name. method set_params (**params) [source] Set the parameters of this estimator. Parameters: **params: dict Estimator parameters. Returns: self: StandardCleaner Estimator instance. method transform (X, y=None) [source] Apply the data cleaning steps on the data. Parameters: X: dict, sequence, np.array or pd.DataFrame Data containing the features, with shape=(n_samples, n_features). y: int, str, sequence, np.array or pd.Series, optional (default=None) If None: y is ignored in the transformation. If int: Index of the target column in X. If string: Name of the target column in X. Else: Target column with shape=(n_samples,). Returns: X: pd.DataFrame Transformed feature set. y: pd.Series Transformed target column. Only returned if provided.","title":"Methods"},{"location":"API/data_cleaning/standard_cleaner/#example","text":"from atom import ATOMClassifier # ATOM's initializer calls StandardCleaner automatically atom = ATOMClassifier(X, y) or from atom.data_cleaning import StandardCleaner cleaner = StandardCleaner(prohibited_types=['str'], missing_target=True) X, y = cleaner.transform(X, y)","title":"Example"},{"location":"API/feature_engineering/feature_generator/","text":"FeatureGenerator class atom.feature_engineering. 
FeatureGenerator (strategy='DFS', n_features=None, generations=20, population=500, operators=None, n_jobs=1, verbose=0, logger=None, random_state=None) [source] Use Deep feature Synthesis or a genetic algorithm to create new combinations of existing features to capture the non-linear relations between the original features. This class can be accessed from atom through the feature_generation method. Read more in the user guide . Parameters: strategy: str, optional (default='DFS') Strategy to crate new features. Choose from: 'DFS' to use Deep Feature Synthesis. 'GFG' or 'genetic' to use Genetic Feature Generation. n_features: int or None, optional (default=None) Number of newly generated features to add to the dataset (if strategy='genetic', no more than 1% of the population). If None, select all created. generations: int, optional (default=20) Number of generations to evolve. Only if strategy='genetic'. population: int, optional (default=500) Number of programs in each generation. Only if strategy='genetic'. operators: str, sequence or None, optional (default=None) Name of the operators to be used on the features (for both strategies). None to use all. Valid options are: 'add', 'sub', 'mul', 'div', 'sqrt', 'log', 'sin', 'cos', 'tan'. n_jobs: int, optional (default=1) Number of cores to use for parallel processing. If >0: Number of cores to use. If -1: Use all available cores. If <-1: Use available_cores - 1 + n_jobs. Beware that using multiple processes on the same machine may cause memory issues for large datasets. verbose: int, optional (default=0) Verbosity level of the class. Possible values are: 0 to not print anything. 1 to print basic information. 2 to print detailed information. logger: bool, str, class or None, optional (default=None) If None: Doesn't save a logging file. If bool: True for logging file with default name. False for no logger. If str: Name of the logging file. 'auto' to create an automatic name. If class: python Logger object. 
random_state: int or None, optional (default=None) Seed used by the random number generator. If None, the random number generator is the RandomState instance used by numpy.random . Tip DFS can create many new features and not all of them will be useful. Use FeatureSelector to reduce the number of features! Warning Using the div, log or sqrt operators can return new features with inf or NaN values. Check the warnings that may pop up or use atom 's missing property. Warning When using DFS with n_jobs>1 , make sure to protect your code with if __name__ == \"__main__\" . Featuretools uses dask , which uses python multiprocessing for parallelization. The spawn method on multiprocessing starts a new python process, which requires it to import the __main__ module before it can do its task. Attributes Attributes: symbolic_transformer: class Instance used to calculate the genetic features, from SymbolicTransformer . Only if strategy='genetic'. genetic_features: pd.DataFrame Dataframe of the newly created non-linear features. Only if strategy='genetic'. Columns include: name: Name of the feature (automatically created). description: Operators used to create this feature. fitness: Fitness score. Methods fit Fit the class. fit_transform Fit the class and return the transformed data. get_params Get parameters for this estimator. log Write information to the logger and print to stdout. save Save the instance to a pickle file. set_params Set the parameters of this estimator. transform Transform the data. method fit (X, y) [source] Fit the class. Parameters: X: dict, sequence, np.array or pd.DataFrame Data containing the features, with shape=(n_samples, n_features). y: int, str, sequence, np.array or pd.Series If int: Index of the target column in X. If string: Name of the target column in X. Else: Target column with shape=(n_samples,). Returns: self: FeatureGenerator Fitted instance of self. 
method fit_transform (X, y) [source] Fit the FeatureGenerator and return the transformed data. Parameters: X: dict, sequence, np.array or pd.DataFrame Data containing the features, with shape=(n_samples, n_features). y: int, str, sequence, np.array, pd.Series If int: Index of the target column in X. If string: Name of the target column in X. Else: Target column with shape=(n_samples,). Returns: X: pd.DataFrame Feature set with the newly generated features. method get_params (deep=True) [source] Get parameters for this estimator. Parameters: deep: bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators. Returns: params: dict Dictionary of the parameter names mapped to their values. method log (msg, level=0) [source] Write a message to the logger and print it to stdout. Parameters: msg: str Message to write to the logger and print to stdout. level: int, optional (default=0) Minimum verbosity level in order to print the message. method save (filename=None) [source] Save the instance to a pickle file. Parameters: filename: str or None, optional (default=None) Name to save the file with. None to save with default name. method set_params (**params) [source] Set the parameters of this estimator. Parameters: **params: dict Estimator parameters. Returns: self: FeatureGenerator Estimator instance. method transform (X, y=None) [source] Generate new features. Parameters: X: dict, sequence, np.array or pd.DataFrame Data containing the features, with shape=(n_samples, n_features). y: int, str, sequence, np.array, pd.Series or None, optional (default=None) Does nothing. Implemented for continuity of the API. Returns: X: pd.DataFrame Feature set with the newly generated features. 
Example from atom import ATOMClassifier atom = ATOMClassifier(X, y) atom.feature_generation(strategy='genetic', n_features=3, generations=30, population=400) or from atom.feature_engineering import FeatureGenerator feature_generator = FeatureGenerator(strategy='genetic', n_features=3, generations=30, population=400) feature_generator.fit(X_train, y_train) X = feature_generator.transform(X)","title":"FeatureGenerator"},{"location":"API/feature_engineering/feature_generator/#featuregenerator","text":"class atom.feature_engineering. FeatureGenerator (strategy='DFS', n_features=None, generations=20, population=500, operators=None, n_jobs=1, verbose=0, logger=None, random_state=None) [source] Use Deep Feature Synthesis or a genetic algorithm to create new combinations of existing features to capture the non-linear relations between the original features. This class can be accessed from atom through the feature_generation method. Read more in the user guide . Parameters: strategy: str, optional (default='DFS') Strategy to create new features. Choose from: 'DFS' to use Deep Feature Synthesis. 'GFG' or 'genetic' to use Genetic Feature Generation. n_features: int or None, optional (default=None) Number of newly generated features to add to the dataset (if strategy='genetic', no more than 1% of the population). If None, select all created. generations: int, optional (default=20) Number of generations to evolve. Only if strategy='genetic'. population: int, optional (default=500) Number of programs in each generation. Only if strategy='genetic'. operators: str, sequence or None, optional (default=None) Name of the operators to be used on the features (for both strategies). None to use all. Valid options are: 'add', 'sub', 'mul', 'div', 'sqrt', 'log', 'sin', 'cos', 'tan'. n_jobs: int, optional (default=1) Number of cores to use for parallel processing. If >0: Number of cores to use. If -1: Use all available cores. If <-1: Use available_cores - 1 + n_jobs.
Beware that using multiple processes on the same machine may cause memory issues for large datasets. verbose: int, optional (default=0) Verbosity level of the class. Possible values are: 0 to not print anything. 1 to print basic information. 2 to print detailed information. logger: bool, str, class or None, optional (default=None) If None: Doesn't save a logging file. If bool: True for logging file with default name. False for no logger. If str: Name of the logging file. 'auto' to create an automatic name. If class: python Logger object. random_state: int or None, optional (default=None) Seed used by the random number generator. If None, the random number generator is the RandomState instance used by numpy.random . Tip DFS can create many new features and not all of them will be useful. Use FeatureSelector to reduce the number of features! Warning Using the div, log or sqrt operators can return new features with inf or NaN values. Check the warnings that may pop up or use atom 's missing property. Warning When using DFS with n_jobs>1 , make sure to protect your code with if __name__ == \"__main__\" . Featuretools uses dask , which uses python multiprocessing for parallelization. The spawn method on multiprocessing starts a new python process, which requires it to import the __main__ module before it can do its task.","title":"FeatureGenerator"},{"location":"API/feature_engineering/feature_generator/#attributes","text":"Attributes: symbolic_transformer: class Instance used to calculate the genetic features, from SymbolicTransformer . Only if strategy='genetic'. genetic_features: pd.DataFrame Dataframe of the newly created non-linear features. Only if strategy='genetic'. Columns include: name: Name of the feature (automatically created). description: Operators used to create this feature. fitness: Fitness score.","title":"Attributes"},{"location":"API/feature_engineering/feature_generator/#methods","text":"fit Fit the class. 
fit_transform Fit the class and return the transformed data. get_params Get parameters for this estimator. log Write information to the logger and print to stdout. save Save the instance to a pickle file. set_params Set the parameters of this estimator. transform Transform the data. method fit (X, y) [source] Fit the class. Parameters: X: dict, sequence, np.array or pd.DataFrame Data containing the features, with shape=(n_samples, n_features). y: int, str, sequence, np.array or pd.Series If int: Index of the target column in X. If string: Name of the target column in X. Else: Target column with shape=(n_samples,). Returns: self: FeatureGenerator Fitted instance of self. method fit_transform (X, y) [source] Fit the FeatureGenerator and return the transformed data. Parameters: X: dict, sequence, np.array or pd.DataFrame Data containing the features, with shape=(n_samples, n_features). y: int, str, sequence, np.array, pd.Series If int: Index of the target column in X. If string: Name of the target column in X. Else: Target column with shape=(n_samples,). Returns: X: pd.DataFrame Feature set with the newly generated features. method get_params (deep=True) [source] Get parameters for this estimator. Parameters: deep: bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators. Returns: params: dict Dictionary of the parameter names mapped to their values. method log (msg, level=0) [source] Write a message to the logger and print it to stdout. Parameters: msg: str Message to write to the logger and print to stdout. level: int, optional (default=0) Minimum verbosity level in order to print the message. method save (filename=None) [source] Save the instance to a pickle file. Parameters: filename: str or None, optional (default=None) Name to save the file with. None to save with default name. method set_params (**params) [source] Set the parameters of this estimator. Parameters: **params: dict Estimator parameters. 
Returns: self: FeatureGenerator Estimator instance. method transform (X, y=None) [source] Generate new features. Parameters: X: dict, sequence, np.array or pd.DataFrame Data containing the features, with shape=(n_samples, n_features). y: int, str, sequence, np.array, pd.Series or None, optional (default=None) Does nothing. Implemented for continuity of the API. Returns: X: pd.DataFrame Feature set with the newly generated features.","title":"Methods"},{"location":"API/feature_engineering/feature_generator/#example","text":"from atom import ATOMClassifier atom = ATOMClassifier(X, y) atom.feature_generation(strategy='genetic', n_features=3, generations=30, population=400) or from atom.feature_engineering import FeatureGenerator feature_generator = FeatureGenerator(strategy='genetic', n_features=3, generations=30, population=400) feature_generator.fit(X_train, y_train) X = feature_generator.transform(X)","title":"Example"},{"location":"API/feature_engineering/feature_selector/","text":"FeatureSelector class atom.feature_engineering. FeatureSelector (strategy=None, solver=None, n_features=None, max_frac_repeated=1., max_correlation=1., n_jobs=1, verbose=0, logger=None, random_state=None, **kwargs) [source] Remove features according to the selected strategy. Ties between features with equal scores will be broken in an unspecified way. Additionally, removes features with too low variance and finds pairs of collinear features based on the Pearson correlation coefficient. For each pair above the specified limit (in terms of absolute value), it removes one of the two. This class can be accessed from atom through the feature_selection method. Read more in the user guide . Parameters: strategy: string or None, optional (default=None) Feature selection strategy to use. Choose from: None: Do not perform any feature selection algorithm. 'univariate': Select best features according to a univariate F-test. 'PCA': Perform principal component analysis. 
'SFM': Select best features according to a model. 'RFE': Perform recursive feature elimination. 'RFECV': Perform RFE with cross-validated selection. solver: string, callable or None, optional (default=None) Solver or model to use for the feature selection strategy. See the sklearn documentation for an extended description of the choices. Select None for the default option per strategy (not applicable for SFM, RFE and RFECV). for 'univariate', choose from: 'f_classif' 'f_regression' 'mutual_info_classif' 'mutual_info_regression' 'chi2' Any function taking two arrays (X, y), and returning arrays (scores, p-values). See the sklearn documentation . for 'PCA', choose from: 'auto' (default) 'full' 'arpack' 'randomized' for 'SFM', 'RFE' and 'RFECV': Estimator with either a feature_importances_ or coef_ attribute after fitting. You can use one of ATOM's pre-defined models . Add _class or _reg after the model's name to specify a classification or regression task, e.g. solver='LGB_reg' (not necessary if called from an atom instance). No default option. n_features: int, float or None, optional (default=None) Number of features to select. Choose from: if None: Select all features. if < 1: Fraction of the total features to select. if >= 1: Number of features to select. If strategy='SFM' and the threshold parameter is not specified, the threshold will be set to -np.inf in order to make this parameter the number of features to select. If strategy='RFECV', it's the minimum number of features to select. max_frac_repeated: float or None, optional (default=1.) Remove features with the same value in at least this fraction of the total rows. The default is to keep all features with non-zero variance, i.e. remove the features that have the same value in all samples. None to skip this step. max_correlation: float or None, optional (default=1.) Minimum value of the Pearson correlation coefficient to identify correlated features. A value of 1 removes one of 2 equal columns.
A dataframe of the removed features and their correlation values can be accessed through the collinear attribute. None to skip this step. n_jobs: int, optional (default=1) Number of cores to use for parallel processing. If >0: Number of cores to use. If -1: Use all available cores. If <-1: Use available_cores - 1 + n_jobs. Beware that using multiple processes on the same machine may cause memory issues for large datasets. verbose: int, optional (default=0) Verbosity level of the class. Possible values are: 0 to not print anything. 1 to print basic information. 2 to print detailed information. logger: bool, str, class or None, optional (default=None) If None: Doesn't save a logging file. If bool: True for logging file with default name. False for no logger. If str: Name of the logging file. 'auto' to create an automatic name. If class: python Logger object. random_state: int or None, optional (default=None) Seed used by the random number generator. If None, the random number generator is the RandomState instance used by numpy.random . **kwargs Any extra keyword argument for the PCA, SFM, RFE or RFECV estimators. See the corresponding sklearn documentation for the available options. Tip Use the plot_feature_importance method to examine how much a specific feature contributes to the final predictions. If the model doesn't have a feature_importances_ attribute, use plot_permutation_importance instead. Warning The RFE and RFECV strategies don't work when the solver is a CatBoost model due to incompatibility of the APIs. Attributes Utility attributes Attributes: collinear: pd.DataFrame Dataframe of the removed collinear features. Columns include: drop_feature: Name of the feature dropped by the method. correlated feature: Name of the correlated feature(s). correlation_value: Pearson correlation coefficient(s) of the feature pairs. feature_importance: list Remaining features ordered by importance. Only if strategy in ['univariate', 'SFM', 'RFE', 'RFECV'].
For RFE and RFECV, the importance is extracted from the external estimator fitted on the reduced set. univariate: class SelectKBest instance used to fit the estimator. Only if strategy='univariate'. scaler: class Scaler instance used to scale the data. Only if strategy='PCA' and the data was not already scaled. pca: class PCA instance used to fit the estimator. Only if strategy='PCA'. sfm: class SelectFromModel instance used to fit the estimator. Only if strategy='SFM'. rfe: class RFE instance used to fit the estimator. Only if strategy='RFE'. rfecv: class RFECV instance used to fit the estimator. Only if strategy='RFECV'. Plot attributes Attributes: style: str Plotting style. See seaborn's documentation . palette: str Color palette. See seaborn's documentation . title_fontsize: int Fontsize for the plot's title. label_fontsize: int Fontsize for labels and legends. tick_fontsize: int Fontsize for the ticks along the plot's axes. Methods fit Fit the class. fit_transform Fit the class and return the transformed data. get_params Get parameters for this estimator. log Write information to the logger and print to stdout. plot_pca Plot the explained variance ratio vs the number of components. plot_components Plot the explained variance ratio per component. plot_rfecv Plot the scores obtained by the estimator on the RFECV. save Save the instance to a pickle file. set_params Set the parameters of this estimator. transform Transform the data. method fit (X, y=None) [source] Fit the class. Note that the univariate, sfm (when model is not fitted), rfe and rfecv strategies all need a target column. Leaving it None will raise an exception. Parameters: X: dict, sequence, np.array or pd.DataFrame Data containing the features, with shape=(n_samples, n_features). y: int, str, sequence, np.array, pd.Series or None, optional (default=None) If None: y is ignored in the transformation. If int: Index of the target column in X. If string: Name of the target column in X. 
Else: Target column with shape=(n_samples,). Returns: self: FeatureSelector Fitted instance of self. method fit_transform (X, y) [source] Fit the FeatureSelector and return the transformed feature set. Note that the univariate, sfm (when model is not fitted), rfe and rfecv strategies need a target column. Leaving it None will raise an exception. Parameters: X: dict, sequence, np.array or pd.DataFrame Data containing the features, with shape=(n_samples, n_features). y: int, str, sequence, np.array, pd.Series or None, optional (default=None) If None: y is ignored in the transformation. If int: Index of the target column in X. If string: Name of the target column in X. Else: Target column with shape=(n_samples,). Returns: X: pd.DataFrame Transformed feature set. method get_params (deep=True) [source] Get parameters for this estimator. Parameters: deep: bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators. Returns: params: dict Dictionary of the parameter names mapped to their values. method log (msg, level=0) [source] Write a message to the logger and print it to stdout. Parameters: msg: str Message to write to the logger and print to stdout. level: int, optional (default=0) Minimum verbosity level in order to print the message. method plot_pca (title=None, figsize=(10, 6), filename=None, display=True) [source] Plot the explained variance ratio vs the number of components. See plot_pca for a description of the parameters. method plot_components (show=None, title=None, figsize=None, filename=None, display=True) [source] Plot the explained variance ratio per components. See plot_components for a description of the parameters. method plot_rfecv (title=None, figsize=(10, 6), filename=None, display=True) [source] Plot the scores obtained by the estimator fitted on every subset of the data. See plot_rfecv for a description of the parameters. 
method save (filename=None) [source] Save the instance to a pickle file. Parameters: filename: str or None, optional (default=None) Name to save the file with. None to save with default name. method set_params (**params) [source] Set the parameters of this estimator. Parameters: **params: dict Estimator parameters. Returns: self: FeatureSelector Estimator instance. method transform (X, y=None) [source] Transform the feature set. Parameters: X: dict, sequence, np.array or pd.DataFrame Data containing the features, with shape=(n_samples, n_features). y: int, str, sequence, np.array, pd.Series or None, optional (default=None) Does nothing. Implemented for continuity of the API. Returns: X: pd.DataFrame Transformed feature set. Example from atom import ATOMClassifier atom = ATOMClassifier(X, y) atom.feature_selection(strategy='pca', n_features=12, whiten=True, max_correlation=0.96) atom.plot_pca(filename='pca', figsize=(8, 5)) or from atom.feature_engineering import FeatureSelector feature_selector = FeatureSelector(strategy='pca', n_features=12, whiten=True, max_correlation=0.96) feature_selector.fit(X_train, y_train) X = feature_selector.transform(X, y) feature_selector.plot_pca(filename='pca', figsize=(8, 5))","title":"FeatureSelector"},{"location":"API/feature_engineering/feature_selector/#featureselector","text":"class atom.feature_engineering. FeatureSelector (strategy=None, solver=None, n_features=None, max_frac_repeated=1., max_correlation=1., n_jobs=1, verbose=0, logger=None, random_state=None, **kwargs) [source] Remove features according to the selected strategy. Ties between features with equal scores will be broken in an unspecified way. Additionally, removes features with too low variance and finds pairs of collinear features based on the Pearson correlation coefficient. For each pair above the specified limit (in terms of absolute value), it removes one of the two. This class can be accessed from atom through the feature_selection method.
Read more in the user guide . Parameters: strategy: string or None, optional (default=None) Feature selection strategy to use. Choose from: None: Do not perform any feature selection algorithm. 'univariate': Select best features according to a univariate F-test. 'PCA': Perform principal component analysis. 'SFM': Select best features according to a model. 'RFE': Perform recursive feature elimination. 'RFECV': Perform RFE with cross-validated selection. solver: string, callable or None, optional (default=None) Solver or model to use for the feature selection strategy. See the sklearn documentation for an extended description of the choices. Select None for the default option per strategy (not applicable for SFM, RFE and RFECV). for 'univariate', choose from: 'f_classif' 'f_regression' 'mutual_info_classif' 'mutual_info_regression' 'chi2' Any function taking two arrays (X, y), and returning arrays (scores, p-values). See the sklearn documentation . for 'PCA', choose from: 'auto' (default) 'full' 'arpack' 'randomized' for 'SFM', 'RFE' and 'RFECV': Estimator with either a feature_importances_ or coef_ attribute after fitting. You can use one of ATOM's pre-defined models . Add _class or _reg after the model's name to specify a classification or regression task, e.g. solver='LGB_reg' (not necessary if called from an atom instance). No default option. n_features: int, float or None, optional (default=None) Number of features to select. Choose from: if None: Select all features. if < 1: Fraction of the total features to select. if >= 1: Number of features to select. If strategy='SFM' and the threshold parameter is not specified, the threshold will be set to -np.inf in order to make this parameter the number of features to select. If strategy='RFECV', it's the minimum number of features to select. max_frac_repeated: float or None, optional (default=1.) Remove features with the same value in at least this fraction of the total rows.
The default is to keep all features with non-zero variance, i.e. remove the features that have the same value in all samples. None to skip this step. max_correlation: float or None, optional (default=1.) Minimum value of the Pearson correlation coefficient to identify correlated features. A value of 1 removes one of 2 equal columns. A dataframe of the removed features and their correlation values can be accessed through the collinear attribute. None to skip this step. n_jobs: int, optional (default=1) Number of cores to use for parallel processing. If >0: Number of cores to use. If -1: Use all available cores. If <-1: Use available_cores - 1 + n_jobs. Beware that using multiple processes on the same machine may cause memory issues for large datasets. verbose: int, optional (default=0) Verbosity level of the class. Possible values are: 0 to not print anything. 1 to print basic information. 2 to print detailed information. logger: bool, str, class or None, optional (default=None) If None: Doesn't save a logging file. If bool: True for logging file with default name. False for no logger. If str: Name of the logging file. 'auto' to create an automatic name. If class: python Logger object. random_state: int or None, optional (default=None) Seed used by the random number generator. If None, the random number generator is the RandomState instance used by numpy.random . **kwargs Any extra keyword argument for the PCA, SFM, RFE or RFECV estimators. See the corresponding sklearn documentation for the available options. Tip Use the plot_feature_importance method to examine how much a specific feature contributes to the final predictions. If the model doesn't have a feature_importances_ attribute, use plot_permutation_importance instead.
Warning The RFE and RFECV strategies don't work when the solver is a CatBoost model due to incompatibility of the APIs.","title":"FeatureSelector"},{"location":"API/feature_engineering/feature_selector/#attributes","text":"","title":"Attributes"},{"location":"API/feature_engineering/feature_selector/#utility-attributes","text":"Attributes: collinear: pd.DataFrame Dataframe of the removed collinear features. Columns include: drop_feature: Name of the feature dropped by the method. correlated feature: Name of the correlated feature(s). correlation_value: Pearson correlation coefficient(s) of the feature pairs. feature_importance: list Remaining features ordered by importance. Only if strategy in ['univariate', 'SFM', 'RFE', 'RFECV']. For RFE and RFECV, the importance is extracted from the external estimator fitted on the reduced set. univariate: class SelectKBest instance used to fit the estimator. Only if strategy='univariate'. scaler: class Scaler instance used to scale the data. Only if strategy='PCA' and the data was not already scaled. pca: class PCA instance used to fit the estimator. Only if strategy='PCA'. sfm: class SelectFromModel instance used to fit the estimator. Only if strategy='SFM'. rfe: class RFE instance used to fit the estimator. Only if strategy='RFE'. rfecv: class RFECV instance used to fit the estimator. Only if strategy='RFECV'.","title":"Utility attributes"},{"location":"API/feature_engineering/feature_selector/#plot-attributes","text":"Attributes: style: str Plotting style. See seaborn's documentation . palette: str Color palette. See seaborn's documentation . title_fontsize: int Fontsize for the plot's title. label_fontsize: int Fontsize for labels and legends. tick_fontsize: int Fontsize for the ticks along the plot's axes.","title":"Plot attributes"},{"location":"API/feature_engineering/feature_selector/#methods","text":"fit Fit the class. fit_transform Fit the class and return the transformed data.
get_params Get parameters for this estimator. log Write information to the logger and print to stdout. plot_pca Plot the explained variance ratio vs the number of components. plot_components Plot the explained variance ratio per component. plot_rfecv Plot the scores obtained by the estimator on the RFECV. save Save the instance to a pickle file. set_params Set the parameters of this estimator. transform Transform the data. method fit (X, y=None) [source] Fit the class. Note that the univariate, sfm (when model is not fitted), rfe and rfecv strategies all need a target column. Leaving it None will raise an exception. Parameters: X: dict, sequence, np.array or pd.DataFrame Data containing the features, with shape=(n_samples, n_features). y: int, str, sequence, np.array, pd.Series or None, optional (default=None) If None: y is ignored in the transformation. If int: Index of the target column in X. If string: Name of the target column in X. Else: Target column with shape=(n_samples,). Returns: self: FeatureSelector Fitted instance of self. method fit_transform (X, y) [source] Fit the FeatureSelector and return the transformed feature set. Note that the univariate, sfm (when model is not fitted), rfe and rfecv strategies need a target column. Leaving it None will raise an exception. Parameters: X: dict, sequence, np.array or pd.DataFrame Data containing the features, with shape=(n_samples, n_features). y: int, str, sequence, np.array, pd.Series or None, optional (default=None) If None: y is ignored in the transformation. If int: Index of the target column in X. If string: Name of the target column in X. Else: Target column with shape=(n_samples,). Returns: X: pd.DataFrame Transformed feature set. method get_params (deep=True) [source] Get parameters for this estimator. Parameters: deep: bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators. 
Returns: params: dict Dictionary of the parameter names mapped to their values. method log (msg, level=0) [source] Write a message to the logger and print it to stdout. Parameters: msg: str Message to write to the logger and print to stdout. level: int, optional (default=0) Minimum verbosity level in order to print the message. method plot_pca (title=None, figsize=(10, 6), filename=None, display=True) [source] Plot the explained variance ratio vs the number of components. See plot_pca for a description of the parameters. method plot_components (show=None, title=None, figsize=None, filename=None, display=True) [source] Plot the explained variance ratio per components. See plot_components for a description of the parameters. method plot_rfecv (title=None, figsize=(10, 6), filename=None, display=True) [source] Plot the scores obtained by the estimator fitted on every subset of the data. See plot_rfecv for a description of the parameters. method save (filename=None) [source] Save the instance to a pickle file. Parameters: filename: str or None, optional (default=None) Name to save the file with. None to save with default name. method set_params (**params) [source] Set the parameters of this estimator. Parameters: **params: dict Estimator parameters. Returns: self: FeatureSelector Estimator instance. method transform (X, y=None) [source] Transform the feature set. Parameters: X: dict, sequence, np.array or pd.DataFrame Data containing the features, with shape=(n_samples, n_features). y: int, str, sequence, np.array, pd.Series or None, optional (default=None) Does nothing. Implemented for continuity of the API. 
Returns: X: pd.DataFrame Transformed feature set.","title":"Methods"},{"location":"API/feature_engineering/feature_selector/#example","text":"from atom import ATOMClassifier atom = ATOMClassifier(X, y) atom.feature_selection(strategy='pca', n_features=12, whiten=True, max_correlation=0.96) atom.plot_pca(filename='pca', figsize=(8, 5)) or from atom.feature_engineering import FeatureSelector feature_selector = FeatureSelector(strategy='pca', n_features=12, whiten=True, max_correlation=0.96) feature_selector.fit(X_train, y_train) X = feature_selector.transform(X, y) feature_selector.plot_pca(filename='pca', figsize=(8, 5))","title":"Example"},{"location":"API/plots/decision_plot/","text":"decision_plot method decision_plot (models=None, index=None, show=None, target=1, title=None, figsize=None, filename=None, display=True, **kwargs) [source] Plot SHAP's decision plot. Visualize model decisions using cumulative SHAP values. Each plotted line explains a single model prediction. If a single prediction is plotted, feature values will be printed in the plot (if supplied). If multiple predictions are plotted together, feature values will not be printed. Plotting too many predictions together will make the plot unintelligible. The explainer will be chosen automatically based on the model's type. Read more about SHAP plots in the user guide . Parameters: models: str, sequence or None, optional (default=None) Name of the models to plot. If None, all models in the pipeline are selected. Note that selecting multiple models will raise an exception. To avoid this, call the plot from a model . index: int, sequence or None, optional (default=None) Indices of the rows in the dataset to plot. If tuple (n, m), select rows n until m. If None, select all rows in the test set. show: int or None, optional (default=None) Number of features (ordered by importance) to show in the plot. None to show all.
target: int or str, optional (default=1) Category to look at in the target class as index or name. Only for multi-class classification tasks. title: str or None, optional (default=None) Plot's title. If None, the default option is used. figsize: tuple or None, optional (default=None) Figure's size, format as (x, y). If None, adapts size to the number of features. filename: str or None, optional (default=None) Name of the file (to save). If None, the figure is not saved. display: bool, optional (default=True) Whether to render the plot. **kwargs Additional keyword arguments for shap's decision_plot. Example from atom import ATOMRegressor atom = ATOMRegressor(X, y) atom.run('RF') atom.decision_plot(index=(120, 140)) atom.decision_plot(index=120)","title":"decision_plot"},{"location":"API/plots/decision_plot/#decision_plot","text":"method decision_plot (models=None, index=None, show=None, target=1, title=None, figsize=None, filename=None, display=True, **kwargs) [source] Plot SHAP's decision plot. Visualize model decisions using cumulative SHAP values. Each plotted line explains a single model prediction. If a single prediction is plotted, feature values will be printed in the plot (if supplied). If multiple predictions are plotted together, feature values will not be printed. Plotting too many predictions together will make the plot unintelligible. The explainer will be chosen automatically based on the model's type. Read more about SHAP plots in the user guide . Parameters: models: str, sequence or None, optional (default=None) Name of the models to plot. If None, all models in the pipeline are selected. Note that selecting multiple models will raise an exception. To avoid this, call the plot from a model . index: int, sequence or None, optional (default=None) Indices of the rows in the dataset to plot. If tuple (n, m), select rows n until m. If None, select all rows in the test set. 
show: int or None, optional (default=None) Number of features (ordered by importance) to show in the plot. None to show all. target: int or str, optional (default=1) Category to look at in the target class as index or name. Only for multi-class classification tasks. title: str or None, optional (default=None) Plot's title. If None, the default option is used. figsize: tuple or None, optional (default=None) Figure's size, format as (x, y). If None, adapts size to the number of features. filename: str or None, optional (default=None) Name of the file (to save). If None, the figure is not saved. display: bool, optional (default=True) Whether to render the plot. **kwargs Additional keyword arguments for shap's decision_plot.","title":"decision_plot"},{"location":"API/plots/decision_plot/#example","text":"from atom import ATOMRegressor atom = ATOMRegressor(X, y) atom.run('RF') atom.decision_plot(index=(120, 140)) atom.decision_plot(index=120)","title":"Example"},{"location":"API/plots/dependence_plot/","text":"dependence_plot method dependence_plot (models=None, index='rank(1)', target=1, title=None, figsize=(10, 6), filename=None, display=True, **kwargs) [source] Plot SHAP's dependence plot. Plots the value of the feature on the x-axis and the SHAP value of the same feature on the y-axis. This shows how the model depends on the given feature, and is like a richer extension of the classical partial dependence plots. Vertical dispersion of the data points represents interaction effects. Grey ticks along the y-axis are data points where the feature's value was NaN. The explainer will be chosen automatically based on the model's type. Read more about SHAP plots in the user guide . Parameters: models: str, sequence or None, optional (default=None) Name of the models to plot. If None, all models in the pipeline are selected. Note that selecting multiple models will raise an exception. To avoid this, call the plot from a model . 
index: int, sequence or None, optional (default='rank(1)') If this is an int, it is the index of the feature to plot. If this is a string it is either the name of the feature to plot, or it can have the form 'rank(int)' to specify the feature with that rank (ordered by mean absolute SHAP value over all the samples). target: int or str, optional (default=1) Category to look at in the target class as index or name. Only for multi-class classification tasks. title: str or None, optional (default=None) Plot's title. If None, the default option is used. figsize: tuple, optional (default=(10, 6)) Figure's size, format as (x, y). filename: str or None, optional (default=None) Name of the file (to save). If None, the figure is not saved. display: bool, optional (default=True) Whether to render the plot. **kwargs Additional keyword arguments for shap's dependence_plot. Example from atom import ATOMRegressor atom = ATOMRegressor(X, y) atom.run('RF') atom.dependence_plot(index='rank(3)')","title":"dependence_plot"},{"location":"API/plots/dependence_plot/#dependence_plot","text":"method dependence_plot (models=None, index='rank(1)', target=1, title=None, figsize=(10, 6), filename=None, display=True, **kwargs) [source] Plot SHAP's dependence plot. Plots the value of the feature on the x-axis and the SHAP value of the same feature on the y-axis. This shows how the model depends on the given feature, and is like a richer extension of the classical partial dependence plots. Vertical dispersion of the data points represents interaction effects. Grey ticks along the y-axis are data points where the feature's value was NaN. The explainer will be chosen automatically based on the model's type. Read more about SHAP plots in the user guide . Parameters: models: str, sequence or None, optional (default=None) Name of the models to plot. If None, all models in the pipeline are selected. Note that selecting multiple models will raise an exception. To avoid this, call the plot from a model . 
index: int, sequence or None, optional (default='rank(1)') If this is an int, it is the index of the feature to plot. If this is a string it is either the name of the feature to plot, or it can have the form 'rank(int)' to specify the feature with that rank (ordered by mean absolute SHAP value over all the samples). target: int or str, optional (default=1) Category to look at in the target class as index or name. Only for multi-class classification tasks. title: str or None, optional (default=None) Plot's title. If None, the default option is used. figsize: tuple, optional (default=(10, 6)) Figure's size, format as (x, y). filename: str or None, optional (default=None) Name of the file (to save). If None, the figure is not saved. display: bool, optional (default=True) Whether to render the plot. **kwargs Additional keyword arguments for shap's dependence_plot.","title":"dependence_plot"},{"location":"API/plots/dependence_plot/#example","text":"from atom import ATOMRegressor atom = ATOMRegressor(X, y) atom.run('RF') atom.dependence_plot(index='rank(3)')","title":"Example"},{"location":"API/plots/force_plot/","text":"force_plot method force_plot (models=None, index=None, target=1, title=None, figsize=(14, 6), filename=None, display=True, **kwargs) [source] Plot SHAP's force plot. Visualize the given SHAP values with an additive force layout. The explainer will be chosen automatically based on the model's type. Note that by default this plot will render using javascript. For a regular figure use matplotlib=True (this option is only available when only 1 row is selected through the index parameter). Read more about SHAP plots in the user guide . Parameters: models: str, sequence or None, optional (default=None) Name of the models to plot. If None, all models in the pipeline are selected. Note that selecting multiple models will raise an exception. To avoid this, call the plot from a model . 
index: int, sequence or None, optional (default=None) Indices of the rows in the dataset to plot. If tuple (n, m), select rows n until m. If None, select all rows in the test set. target: int or str, optional (default=1) Category to look at in the target class as index or name. Only for multi-class classification tasks. title: str or None, optional (default=None) Plot's title. If None, the default option is used. figsize: tuple, optional (default=(14, 6)) Figure's size, format as (x, y). filename: str or None, optional (default=None) Name of the file (to save). If matplotlib=False, the figure will be saved as an html file. If None, the figure is not saved. display: bool, optional (default=True) Whether to render the plot. **kwargs Additional keyword arguments for shap's force_plot. Example from atom import ATOMClassifier atom = ATOMClassifier(X, y) atom.run('lr') atom.force_plot(index=atom.X_test.index[0], matplotlib=True, filename='force_plot')","title":"force_plot"},{"location":"API/plots/force_plot/#force_plot","text":"method force_plot (models=None, index=None, target=1, title=None, figsize=(14, 6), filename=None, display=True, **kwargs) [source] Plot SHAP's force plot. Visualize the given SHAP values with an additive force layout. The explainer will be chosen automatically based on the model's type. Note that by default this plot will render using javascript. For a regular figure use matplotlib=True (this option is only available when only 1 row is selected through the index parameter). Read more about SHAP plots in the user guide . Parameters: models: str, sequence or None, optional (default=None) Name of the models to plot. If None, all models in the pipeline are selected. Note that selecting multiple models will raise an exception. To avoid this, call the plot from a model . index: int, sequence or None, optional (default=None) Indices of the rows in the dataset to plot. If tuple (n, m), select rows n until m. If None, select all rows in the test set. 
target: int or str, optional (default=1) Category to look at in the target class as index or name. Only for multi-class classification tasks. title: str or None, optional (default=None) Plot's title. If None, the default option is used. figsize: tuple, optional (default=(14, 6)) Figure's size, format as (x, y). filename: str or None, optional (default=None) Name of the file (to save). If matplotlib=False, the figure will be saved as an html file. If None, the figure is not saved. display: bool, optional (default=True) Whether to render the plot. **kwargs Additional keyword arguments for shap's force_plot.","title":"force_plot"},{"location":"API/plots/force_plot/#example","text":"from atom import ATOMClassifier atom = ATOMClassifier(X, y) atom.run('lr') atom.force_plot(index=atom.X_test.index[0], matplotlib=True, filename='force_plot')","title":"Example"},{"location":"API/plots/plot_bagging/","text":"plot_bagging method plot_bagging (models=None, metric=0, title=None, figsize=None, filename=None, display=True) [source] Plot a boxplot of the bagging's results. Only available for models fitted using bagging . Parameters: models: str, sequence or None, optional (default=None) Name of the models to plot. If None, all models in the pipeline that used bagging are selected. metric: int or str, optional (default=0) Index or name of the metric to plot. Only for multi-metric runs. title: str or None, optional (default=None) Plot's title. If None, the default option is used. figsize: tuple, optional (default=None) Figure's size, format as (x, y). If None, adapts size to the number of models. filename: str or None, optional (default=None) Name of the file (to save). If None, the figure is not saved. display: bool, optional (default=True) Whether to render the plot.
Example from atom import ATOMClassifier atom = ATOMClassifier(X, y) atom.run(['LR', 'Tree', 'LGB', 'MLP'], metric='accuracy', bagging=5) atom.plot_bagging()","title":"plot_bagging"},{"location":"API/plots/plot_bagging/#plot_bagging","text":"method plot_bagging (models=None, metric=0, title=None, figsize=None, filename=None, display=True) [source] Plot a boxplot of the bagging's results. Only available for models fitted using bagging . Parameters: models: str, sequence or None, optional (default=None) Name of the models to plot. If None, all models in the pipeline that used bagging are selected. metric: int or str, optional (default=0) Index or name of the metric to plot. Only for multi-metric runs. title: str or None, optional (default=None) Plot's title. If None, the default option is used. figsize: tuple, optional (default=None) Figure's size, format as (x, y). If None, adapts size to the number of models. filename: str or None, optional (default=None) Name of the file (to save). If None, the figure is not saved. display: bool, optional (default=True) Whether to render the plot.","title":"plot_bagging"},{"location":"API/plots/plot_bagging/#example","text":"from atom import ATOMClassifier atom = ATOMClassifier(X, y) atom.run(['LR', 'Tree', 'LGB', 'MLP'], metric='accuracy', bagging=5) atom.plot_bagging()","title":"Example"},{"location":"API/plots/plot_bo/","text":"plot_bo method plot_bo (models=None, metric=0, title=None, figsize=(10, 8), filename=None, display=True) [source] Plot the bayesian optimization scoring. Only for models that ran the hyperparameter optimization. This is the same plot as the one produced by bo_params={'plot_bo': True} while running the optimization. Creates a canvas with two plots: the first plot shows the score of every trial and the second shows the distance between the last consecutive steps. Parameters: models: str, sequence or None, optional (default=None) Name of the models to plot.
If None, all models in the pipeline that used bayesian optimization are selected. metric: int or str, optional (default=0) Index or name of the metric to plot. Only for multi-metric runs. title: str or None, optional (default=None) Plot's title. If None, the default option is used. figsize: tuple, optional (default=(10, 8)) Figure's size, format as (x, y). filename: str or None, optional (default=None) Name of the file (to save). If None, the figure is not saved. display: bool, optional (default=True) Whether to render the plot. Example from atom import ATOMClassifier atom = ATOMClassifier(X, y) atom.run(['LDA', 'LGB'], metric='f1', n_calls=24, n_initial_points=10) atom.plot_bo()","title":"plot_bo"},{"location":"API/plots/plot_bo/#plot_bo","text":"method plot_bo (models=None, metric=0, title=None, figsize=(10, 8), filename=None, display=True) [source] Plot the bayesian optimization scoring. Only for models that ran the hyperparameter optimization. This is the same plot as the one produced by bo_params={'plot_bo': True} while running the optimization. Creates a canvas with two plots: the first plot shows the score of every trial and the second shows the distance between the last consecutive steps. Parameters: models: str, sequence or None, optional (default=None) Name of the models to plot. If None, all models in the pipeline that used bayesian optimization are selected. metric: int or str, optional (default=0) Index or name of the metric to plot. Only for multi-metric runs. title: str or None, optional (default=None) Plot's title. If None, the default option is used. figsize: tuple, optional (default=(10, 8)) Figure's size, format as (x, y). filename: str or None, optional (default=None) Name of the file (to save). If None, the figure is not saved. 
display: bool, optional (default=True) Whether to render the plot.","title":"plot_bo"},{"location":"API/plots/plot_bo/#example","text":"from atom import ATOMClassifier atom = ATOMClassifier(X, y) atom.run(['LDA', 'LGB'], metric='f1', n_calls=24, n_initial_points=10) atom.plot_bo()","title":"Example"},{"location":"API/plots/plot_calibration/","text":"plot_calibration method plot_calibration (models=None, n_bins=10, title=None, figsize=(10, 10), filename=None, display=True) [source] Plot the calibration curve for a binary classifier. Well calibrated classifiers are probabilistic classifiers for which the output of the predict_proba method can be directly interpreted as a confidence level. For instance a well calibrated (binary) classifier should classify the samples such that among the samples to which it gave a predict_proba value close to 0.8, approx. 80% actually belong to the positive class. Read more in sklearn's documentation . This figure shows two plots: the calibration curve, where the x-axis represents the average predicted probability in each bin and the y-axis is the fraction of positives, i.e. the proportion of samples whose class is the positive class (in each bin); and a distribution of all predicted probabilities of the classifier. Parameters: models: str, sequence or None, optional (default=None) Name of the models to plot. If None, all models in the pipeline are selected. n_bins: int, optional (default=10) Number of bins for the calibration calculation and the histogram. Minimum of 5 required. title: str or None, optional (default=None) Plot's title. If None, the default option is used. figsize: tuple, optional (default=(10, 10)) Figure's size, format as (x, y). filename: str or None, optional (default=None) Name of the file (to save). If None, the figure is not saved. display: bool, optional (default=True) Whether to render the plot. 
Example from atom import ATOMClassifier atom = ATOMClassifier(X) atom.run(['GNB', 'LR', 'LGB'], metric='average_precision') atom.plot_calibration()","title":"plot_calibration"},{"location":"API/plots/plot_calibration/#plot_calibration","text":"method plot_calibration (models=None, n_bins=10, title=None, figsize=(10, 10), filename=None, display=True) [source] Plot the calibration curve for a binary classifier. Well calibrated classifiers are probabilistic classifiers for which the output of the predict_proba method can be directly interpreted as a confidence level. For instance a well calibrated (binary) classifier should classify the samples such that among the samples to which it gave a predict_proba value close to 0.8, approx. 80% actually belong to the positive class. Read more in sklearn's documentation . This figure shows two plots: the calibration curve, where the x-axis represents the average predicted probability in each bin and the y-axis is the fraction of positives, i.e. the proportion of samples whose class is the positive class (in each bin); and a distribution of all predicted probabilities of the classifier. Parameters: models: str, sequence or None, optional (default=None) Name of the models to plot. If None, all models in the pipeline are selected. n_bins: int, optional (default=10) Number of bins for the calibration calculation and the histogram. Minimum of 5 required. title: str or None, optional (default=None) Plot's title. If None, the default option is used. figsize: tuple, optional (default=(10, 10)) Figure's size, format as (x, y). filename: str or None, optional (default=None) Name of the file (to save). If None, the figure is not saved. 
display: bool, optional (default=True) Whether to render the plot.","title":"plot_calibration"},{"location":"API/plots/plot_calibration/#example","text":"from atom import ATOMClassifier atom = ATOMClassifier(X) atom.run(['GNB', 'LR', 'LGB'], metric='average_precision') atom.plot_calibration()","title":"Example"},{"location":"API/plots/plot_components/","text":"plot_components method plot_components (show=None, title=None, figsize=None, filename=None, display=True) [source] Plot the explained variance ratio per components. Only available if PCA was applied on the data. Parameters: show: int or None, optional (default=None) Number of components to show. If None, the number of components in the data are plotted. title: str or None, optional (default=None) Plot's title. If None, the default option is used. figsize: tuple or None, optional (default=None) Figure's size, format as (x, y). If None, adapts size to show parameter. filename: str or None, optional (default=None) Name of the file (to save). If None, the figure is not saved. display: bool, optional (default=True) Whether to render the plot. Example from atom import ATOMClassifier atom = ATOMClassifier(X, y) atom.feature_selection(strategy='PCA', n_features=11) atom.plot_components()","title":"plot_components"},{"location":"API/plots/plot_components/#plot_components","text":"method plot_components (show=None, title=None, figsize=None, filename=None, display=True) [source] Plot the explained variance ratio per components. Only available if PCA was applied on the data. Parameters: show: int or None, optional (default=None) Number of components to show. If None, the number of components in the data are plotted. title: str or None, optional (default=None) Plot's title. If None, the default option is used. figsize: tuple or None, optional (default=None) Figure's size, format as (x, y). If None, adapts size to show parameter. filename: str or None, optional (default=None) Name of the file (to save). 
If None, the figure is not saved. display: bool, optional (default=True) Whether to render the plot.","title":"plot_components"},{"location":"API/plots/plot_components/#example","text":"from atom import ATOMClassifier atom = ATOMClassifier(X, y) atom.feature_selection(strategy='PCA', n_features=11) atom.plot_components()","title":"Example"},{"location":"API/plots/plot_confusion_matrix/","text":"plot_confusion_matrix method plot_confusion_matrix (models=None, dataset='test', normalize=False, title=None, figsize=None, filename=None, display=True) [source] Plot a model's confusion matrix. Only for classification tasks. For 1 model: plot the confusion matrix in a heatmap. For multiple models: compare TP, FP, FN and TN in a barplot (not implemented for multiclass classification tasks). Parameters: models: str, sequence or None, optional (default=None) Name of the models to plot. If None, all models in the pipeline are selected. dataset: str, optional (default='test') Data set on which to calculate the confusion matrix. Options are 'train' or 'test'. normalize: bool, optional (default=False) Whether to normalize the matrix. Only for the heatmap plot. title: str or None, optional (default=None) Plot's title. If None, the default option is used. figsize: tuple, optional (default=None) Figure's size, format as (x, y). If None, adapts size to plot type. filename: str or None, optional (default=None) Name of the file (to save). If None, the figure is not saved. display: bool, optional (default=True) Whether to render the plot. Example from atom import ATOMClassifier atom = ATOMClassifier(X, y) atom.run(['Tree', 'Bag']) atom.Tree.plot_confusion_matrix(normalize=True) atom.plot_confusion_matrix()","title":"plot_confusion_matrix"},{"location":"API/plots/plot_confusion_matrix/#plot_confusion_matrix","text":"method plot_confusion_matrix (models=None, dataset='test', normalize=False, title=None, figsize=None, filename=None, display=True) [source] Plot a model's confusion matrix. 
Only for classification tasks. For 1 model: plot the confusion matrix in a heatmap. For multiple models: compare TP, FP, FN and TN in a barplot (not implemented for multiclass classification tasks). Parameters: models: str, sequence or None, optional (default=None) Name of the models to plot. If None, all models in the pipeline are selected. dataset: str, optional (default='test') Data set on which to calculate the confusion matrix. Options are 'train' or 'test'. normalize: bool, optional (default=False) Whether to normalize the matrix. Only for the heatmap plot. title: str or None, optional (default=None) Plot's title. If None, the default option is used. figsize: tuple, optional (default=None) Figure's size, format as (x, y). If None, adapts size to plot type. filename: str or None, optional (default=None) Name of the file (to save). If None, the figure is not saved. display: bool, optional (default=True) Whether to render the plot.","title":"plot_confusion_matrix"},{"location":"API/plots/plot_confusion_matrix/#example","text":"from atom import ATOMClassifier atom = ATOMClassifier(X, y) atom.run(['Tree', 'Bag']) atom.Tree.plot_confusion_matrix(normalize=True) atom.plot_confusion_matrix()","title":"Example"},{"location":"API/plots/plot_correlation/","text":"plot_correlation method plot_correlation (method='pearson', title=None, figsize=(8, 8), filename=None, display=True) [source] Plot the data's correlation matrix. Ignores non-numeric columns. Parameters: method: str, optional (default='pearson') Method of correlation. Choose from 'pearson', 'kendall' or 'spearman'. title: str or None, optional (default=None) Plot's title. If None, the default option is used. figsize: tuple, optional (default=(8, 8)) Figure's size, format as (x, y). filename: str or None, optional (default=None) Name of the file (to save). If None, the figure is not saved. display: bool, optional (default=True) Whether to render the plot. 
Example from atom import ATOMClassifier atom = ATOMClassifier(X, y='RainTomorrow') atom.plot_correlation()","title":"plot_correlation"},{"location":"API/plots/plot_correlation/#plot_correlation","text":"method plot_correlation (method='pearson', title=None, figsize=(8, 8), filename=None, display=True) [source] Plot the data's correlation matrix. Ignores non-numeric columns. Parameters: method: str, optional (default='pearson') Method of correlation. Choose from 'pearson', 'kendall' or 'spearman'. title: str or None, optional (default=None) Plot's title. If None, the default option is used. figsize: tuple, optional (default=(8, 8)) Figure's size, format as (x, y). filename: str or None, optional (default=None) Name of the file (to save). If None, the figure is not saved. display: bool, optional (default=True) Whether to render the plot.","title":"plot_correlation"},{"location":"API/plots/plot_correlation/#example","text":"from atom import ATOMClassifier atom = ATOMClassifier(X, y='RainTomorrow') atom.plot_correlation()","title":"Example"},{"location":"API/plots/plot_errors/","text":"plot_errors method plot_errors (models=None, dataset='test', title=None, figsize=(10, 6), filename=None, display=True) [source] Plot a model's prediction errors, i.e. the actual targets from a set against the predicted values generated by the regressor. A linear fit is made on the data. The gray, intersected line shows the identity line. This plot can be useful to detect noise or heteroscedasticity along a range of the target domain. Only for regression tasks. Parameters: models: str, sequence or None, optional (default=None) Name of the models to plot. If None, all models in the pipeline are selected. dataset: str, optional (default='test') Data set on which to calculate the errors. Options are 'train', 'test' or 'both'. title: str or None, optional (default=None) Plot's title. If None, the default option is used.
figsize: tuple, optional (default=(10, 6)) Figure's size, format as (x, y). filename: str or None, optional (default=None) Name of the file (to save). If None, the figure is not saved. display: bool, optional (default=True) Whether to render the plot. Example from atom import ATOMRegressor atom = ATOMRegressor(X, y) atom.run(['OLS', 'LGB'], metric='MAE') atom.plot_errors()","title":"plot_errors"},{"location":"API/plots/plot_errors/#plot_errors","text":"method plot_errors (models=None, dataset='test', title=None, figsize=(10, 6), filename=None, display=True) [source] Plot a model's prediction errors, i.e. the actual targets from a set against the predicted values generated by the regressor. A linear fit is made on the data. The gray, intersected line shows the identity line. This plot can be useful to detect noise or heteroscedasticity along a range of the target domain. Only for regression tasks. Parameters: models: str, sequence or None, optional (default=None) Name of the models to plot. If None, all models in the pipeline are selected. dataset: str, optional (default='test') Data set on which to calculate the errors. Options are 'train', 'test' or 'both'. title: str or None, optional (default=None) Plot's title. If None, the default option is used. figsize: tuple, optional (default=(10, 6)) Figure's size, format as (x, y). filename: str or None, optional (default=None) Name of the file (to save). If None, the figure is not saved. display: bool, optional (default=True) Whether to render the plot.","title":"plot_errors"},{"location":"API/plots/plot_errors/#example","text":"from atom import ATOMRegressor atom = ATOMRegressor(X, y) atom.run(['OLS', 'LGB'], metric='MAE') atom.plot_errors()","title":"Example"},{"location":"API/plots/plot_evals/","text":"plot_evals method plot_evals (models=None, dataset='both', title=None, figsize=(10, 6), filename=None, display=True) [source] Plot evaluation curves for the train and test set.
Only for models that allow in-training evaluation (XGB, LGB, CatB). The metric is provided by the model's package and is different for every model and every task. For this reason, the method only allows plotting one model at a time. Parameters: models: str, sequence or None, optional (default=None) Name of the model to plot. If None, all models in the pipeline are selected. Note that leaving the default option could raise an exception if there are multiple models in the pipeline. To avoid this, call the plot from a model , e.g. atom.lgb.plot_evals() . dataset: str, optional (default='both') Data set on which to calculate the evaluation curves. Options are 'train', 'test' or 'both'. title: str or None, optional (default=None) Plot's title. If None, the default option is used. figsize: tuple, optional (default=(10, 6)) Figure's size, format as (x, y). filename: str or None, optional (default=None) Name of the file (to save). If None, the figure is not saved. display: bool, optional (default=True) Whether to render the plot. Example from atom import ATOMRegressor atom = ATOMRegressor(X, y) atom.run(['Bag', 'LGB']) atom.lgb.plot_evals()","title":"plot_evals"},{"location":"API/plots/plot_evals/#plot_evals","text":"method plot_evals (models=None, dataset='both', title=None, figsize=(10, 6), filename=None, display=True) [source] Plot evaluation curves for the train and test set. Only for models that allow in-training evaluation (XGB, LGB, CatB). The metric is provided by the model's package and is different for every model and every task. For this reason, the method only allows plotting one model at a time. Parameters: models: str, sequence or None, optional (default=None) Name of the model to plot. If None, all models in the pipeline are selected. Note that leaving the default option could raise an exception if there are multiple models in the pipeline. To avoid this, call the plot from a model , e.g. atom.lgb.plot_evals() . 
dataset: str, optional (default='both') Data set on which to calculate the evaluation curves. Options are 'train', 'test' or 'both'. title: str or None, optional (default=None) Plot's title. If None, the default option is used. figsize: tuple, optional (default=(10, 6)) Figure's size, format as (x, y). filename: str or None, optional (default=None) Name of the file (to save). If None, the figure is not saved. display: bool, optional (default=True) Whether to render the plot.","title":"plot_evals"},{"location":"API/plots/plot_evals/#example","text":"from atom import ATOMRegressor atom = ATOMRegressor(X, y) atom.run(['Bag', 'LGB']) atom.lgb.plot_evals()","title":"Example"},{"location":"API/plots/plot_feature_importance/","text":"plot_feature_importance method plot_feature_importance (models=None, show=None, title=None, figsize=None, filename=None, display=True) [source] Plot a tree-based model's feature importance. The importances are normalized in order to be able to compare them between models. The feature_importance attribute is updated with the extracted importance ranking. Parameters: models: str, sequence or None, optional (default=None) Name of the models to plot. If None, all the models in the pipeline are selected. show: int, optional (default=None) Number of best features to show in the plot. None to show all. title: str or None, optional (default=None) Plot's title. If None, the default option is used. figsize: tuple or None, optional (default=None) Figure's size, format as (x, y). If None, adapts size to show parameter. filename: str or None, optional (default=None) Name of the file (to save). If None, the figure is not saved. display: bool, optional (default=True) Whether to render the plot. 
Example from atom import ATOMClassifier atom = ATOMClassifier(X, y) atom.run(['LR', 'RF'], metric='recall_weighted') atom.RF.plot_feature_importance(show=11, filename='random_forest_importance.png')","title":"plot_feature_importance"},{"location":"API/plots/plot_feature_importance/#plot_feature_importance","text":"method plot_feature_importance (models=None, show=None, title=None, figsize=None, filename=None, display=True) [source] Plot a tree-based model's feature importance. The importances are normalized in order to be able to compare them between models. The feature_importance attribute is updated with the extracted importance ranking. Parameters: models: str, sequence or None, optional (default=None) Name of the models to plot. If None, all the models in the pipeline are selected. show: int, optional (default=None) Number of best features to show in the plot. None to show all. title: str or None, optional (default=None) Plot's title. If None, the default option is used. figsize: tuple or None, optional (default=None) Figure's size, format as (x, y). If None, adapts size to show parameter. filename: str or None, optional (default=None) Name of the file (to save). If None, the figure is not saved. display: bool, optional (default=True) Whether to render the plot.","title":"plot_feature_importance"},{"location":"API/plots/plot_feature_importance/#example","text":"from atom import ATOMClassifier atom = ATOMClassifier(X, y) atom.run(['LR', 'RF'], metric='recall_weighted') atom.RF.plot_feature_importance(show=11, filename='random_forest_importance.png')","title":"Example"},{"location":"API/plots/plot_gains/","text":"plot_gains method plot_gains (models=None, dataset='test', title=None, figsize=(10, 6), filename=None, display=True) [source] Plot the cumulative gains curve. Only for binary classification tasks. Parameters: models: str, sequence or None, optional (default=None) Name of the models to plot. If None, all models in the pipeline are selected. 
dataset: str, optional (default='test') Data set on which to calculate the gains curve. Options are 'train', 'test' or 'both'. title: str or None, optional (default=None) Plot's title. If None, the default option is used. figsize: tuple, optional (default=(10, 6)) Figure's size, format as (x, y). filename: str or None, optional (default=None) Name of the file (to save). If None, the figure is not saved. display: bool, optional (default=True) Whether to render the plot. Example from atom import ATOMClassifier atom = ATOMClassifier(X, y) atom.run(['GNB', 'RF', 'LGB'], metric='roc_auc') atom.plot_gains(filename='cumulative_gains_curve.png')","title":"plot_gains"},{"location":"API/plots/plot_gains/#plot_gains","text":"method plot_gains (models=None, dataset='test', title=None, figsize=(10, 6), filename=None, display=True) [source] Plot the cumulative gains curve. Only for binary classification tasks. Parameters: models: str, sequence or None, optional (default=None) Name of the models to plot. If None, all models in the pipeline are selected. dataset: str, optional (default='test') Data set on which to calculate the gains curve. Options are 'train', 'test' or 'both'. title: str or None, optional (default=None) Plot's title. If None, the default option is used. figsize: tuple, optional (default=(10, 6)) Figure's size, format as (x, y). filename: str or None, optional (default=None) Name of the file (to save). If None, the figure is not saved. 
display: bool, optional (default=True) Whether to render the plot.","title":"plot_gains"},{"location":"API/plots/plot_gains/#example","text":"from atom import ATOMClassifier atom = ATOMClassifier(X, y) atom.run(['GNB', 'RF', 'LGB'], metric='roc_auc') atom.plot_gains(filename='cumulative_gains_curve.png')","title":"Example"},{"location":"API/plots/plot_learning_curve/","text":"plot_learning_curve method plot_learning_curve (models=None, metric=0, title=None, figsize=(10, 6), filename=None, display=True) [source] Plot the model's learning curve: score vs number of training samples. Only available if the models were fitted using train sizing . Parameters: models: str, sequence or None, optional (default=None) Name of the models to plot. If None, all models in the pipeline are selected. metric: int or str, optional (default=0) Index or name of the metric to plot. Only for multi-metric runs. title: str or None, optional (default=None) Plot's title. If None, the default option is used. figsize: tuple, optional (default=(10, 6)) Figure's size, format as (x, y). filename: str or None, optional (default=None) Name of the file (to save). If None, the figure is not saved. display: bool, optional (default=True) Whether to render the plot. Example import numpy as np from atom import ATOMClassifier atom = ATOMClassifier(X, y) atom.train_sizing(['GNB', 'LDA'], metric='accuracy', train_sizes=np.linspace(0.1, 1.0, 9), bagging=5) atom.plot_learning_curve()","title":"plot_learning_curve"},{"location":"API/plots/plot_learning_curve/#plot_learning_curve","text":"method plot_learning_curve (models=None, metric=0, title=None, figsize=(10, 6), filename=None, display=True) [source] Plot the model's learning curve: score vs number of training samples. Only available if the models were fitted using train sizing . Parameters: models: str, sequence or None, optional (default=None) Name of the models to plot. If None, all models in the pipeline are selected. 
metric: int or str, optional (default=0) Index or name of the metric to plot. Only for multi-metric runs. title: str or None, optional (default=None) Plot's title. If None, the default option is used. figsize: tuple, optional (default=(10, 6)) Figure's size, format as (x, y). filename: str or None, optional (default=None) Name of the file (to save). If None, the figure is not saved. display: bool, optional (default=True) Whether to render the plot.","title":"plot_learning_curve"},{"location":"API/plots/plot_learning_curve/#example","text":"import numpy as np from atom import ATOMClassifier atom = ATOMClassifier(X, y) atom.train_sizing(['GNB', 'LDA'], metric='accuracy', train_sizes=np.linspace(0.1, 1.0, 9), bagging=5) atom.plot_learning_curve()","title":"Example"},{"location":"API/plots/plot_lift/","text":"plot_lift method plot_lift (models=None, dataset='test', title=None, figsize=(10, 6), filename=None, display=True) [source] Plot the lift curve. Only for binary classification. Parameters: models: str, sequence or None, optional (default=None) Name of the models to plot. If None, all models in the pipeline are selected. dataset: str, optional (default='test') Data set on which to calculate the lift curve. Options are 'train', 'test' or 'both'. title: str or None, optional (default=None) Plot's title. If None, the default option is used. figsize: tuple, optional (default=(10, 6)) Figure's size, format as (x, y). filename: str or None, optional (default=None) Name of the file (to save). If None, the figure is not saved. display: bool, optional (default=True) Whether to render the plot. Example from atom import ATOMClassifier atom = ATOMClassifier(X, y) atom.run(['GNB', 'RF', 'LGB'], metric='roc_auc') atom.plot_lift(filename='lift_curve.png')","title":"plot_lift"},{"location":"API/plots/plot_lift/#plot_lift","text":"method plot_lift (models=None, dataset='test', title=None, figsize=(10, 6), filename=None, display=True) [source] Plot the lift curve. 
Only for binary classification. Parameters: models: str, sequence or None, optional (default=None) Name of the models to plot. If None, all models in the pipeline are selected. dataset: str, optional (default='test') Data set on which to calculate the lift curve. Options are 'train', 'test' or 'both'. title: str or None, optional (default=None) Plot's title. If None, the default option is used. figsize: tuple, optional (default=(10, 6)) Figure's size, format as (x, y). filename: str or None, optional (default=None) Name of the file (to save). If None, the figure is not saved. display: bool, optional (default=True) Whether to render the plot.","title":"plot_lift"},{"location":"API/plots/plot_lift/#example","text":"from atom import ATOMClassifier atom = ATOMClassifier(X, y) atom.run(['GNB', 'RF', 'LGB'], metric='roc_auc') atom.plot_lift(filename='lift_curve.png')","title":"Example"},{"location":"API/plots/plot_partial_dependence/","text":"plot_partial_dependence method plot_partial_dependence (models=None, features=None, target=None, title=None, figsize=(10, 6), filename=None, display=True) [source] Plot the partial dependence of features. The partial dependence of a feature (or a set of features) corresponds to the average response of the model for each possible value of the feature. Two-way partial dependence plots are plotted as contour plots (only allowed for single model plots). The deciles of the feature values will be shown with tick marks on the x-axes for one-way plots, and on both axes for two-way plots. Parameters: models: str, sequence or None, optional (default=None) Name of the models to plot. If None, all the models in the pipeline are selected. features: int, str, sequence or None, optional (default=None) Features or feature pairs (name or index) to get the partial dependence from. Maximum of 3 allowed. 
If None, it uses the top 3 features if feature_importance is defined (see plot_feature_importance or plot_permutation_importance ), else it uses the first 3 features in the dataset. target: int or str, optional (default=1) Category to look at in the target class as index or name. Only for multi-class classification tasks. title: str or None, optional (default=None) Plot's title. If None, the default option is used. figsize: tuple, optional (default=(10, 6)) Figure's size, format as (x, y). filename: str or None, optional (default=None) Name of the file (to save). If None, the figure is not saved. display: bool, optional (default=True) Whether to render the plot. Example from atom import ATOMClassifier atom = ATOMClassifier(X, y) atom.feature_selection(strategy='PCA', n_features=6) atom.run(['Tree', 'Bag'], metric='precision') atom.Tree.plot_partial_dependence(features=[0, 1, (1, 3)]) atom.plot_partial_dependence()","title":"plot_partial_dependence"},{"location":"API/plots/plot_partial_dependence/#plot_partial_dependence","text":"method plot_partial_dependence (models=None, features=None, target=None, title=None, figsize=(10, 6), filename=None, display=True) [source] Plot the partial dependence of features. The partial dependence of a feature (or a set of features) corresponds to the average response of the model for each possible value of the feature. Two-way partial dependence plots are plotted as contour plots (only allowed for single model plots). The deciles of the feature values will be shown with tick marks on the x-axes for one-way plots, and on both axes for two-way plots. Parameters: models: str, sequence or None, optional (default=None) Name of the models to plot. If None, all the models in the pipeline are selected. features: int, str, sequence or None, optional (default=None) Features or feature pairs (name or index) to get the partial dependence from. Maximum of 3 allowed. 
If None, it uses the top 3 features if feature_importance is defined (see plot_feature_importance or plot_permutation_importance ), else it uses the first 3 features in the dataset. target: int or str, optional (default=1) Category to look at in the target class as index or name. Only for multi-class classification tasks. title: str or None, optional (default=None) Plot's title. If None, the default option is used. figsize: tuple, optional (default=(10, 6)) Figure's size, format as (x, y). filename: str or None, optional (default=None) Name of the file (to save). If None, the figure is not saved. display: bool, optional (default=True) Whether to render the plot.","title":"plot_partial_dependence"},{"location":"API/plots/plot_partial_dependence/#example","text":"from atom import ATOMClassifier atom = ATOMClassifier(X, y) atom.feature_selection(strategy='PCA', n_features=6) atom.run(['Tree', 'Bag'], metric='precision') atom.Tree.plot_partial_dependence(features=[0, 1, (1, 3)]) atom.plot_partial_dependence()","title":"Example"},{"location":"API/plots/plot_pca/","text":"plot_pca method plot_pca (title=None, figsize=(10, 6), filename=None, display=True) [source] Plot the explained variance ratio vs the number of components. Only available if PCA was applied on the data. Parameters: title: str or None, optional (default=None) Plot's title. If None, the default option is used. figsize: tuple, optional (default=(10, 6)) Figure's size, format as (x, y). filename: str or None, optional (default=None) Name of the file (to save). If None, the figure is not saved. display: bool, optional (default=True) Whether to render the plot. 
Example from atom import ATOMClassifier atom = ATOMClassifier(X, y) atom.feature_selection(strategy='PCA', n_features=11) atom.plot_pca()","title":"plot_pca"},{"location":"API/plots/plot_pca/#plot_pca","text":"method plot_pca (title=None, figsize=(10, 6), filename=None, display=True) [source] Plot the explained variance ratio vs the number of components. Only available if PCA was applied on the data. Parameters: title: str or None, optional (default=None) Plot's title. If None, the default option is used. figsize: tuple, optional (default=(10, 6)) Figure's size, format as (x, y). filename: str or None, optional (default=None) Name of the file (to save). If None, the figure is not saved. display: bool, optional (default=True) Whether to render the plot.","title":"plot_pca"},{"location":"API/plots/plot_pca/#example","text":"from atom import ATOMClassifier atom = ATOMClassifier(X, y) atom.feature_selection(strategy='PCA', n_features=11) atom.plot_pca()","title":"Example"},{"location":"API/plots/plot_permutation_importance/","text":"plot_permutation_importance method plot_permutation_importance (models=None, show=None, n_repeats=10, title=None, figsize=None, filename=None, display=True) [source] Plot the feature permutation importance of models. Calculating all permutations can be time consuming, especially if n_repeats is high. They are stored under the attribute permutations . This means that if a plot is repeated for the same model with the same n_repeats , it will be considerably faster. The feature_importance attribute is updated with the extracted importance ranking. Parameters: models: str, sequence or None, optional (default=None) Name of the models to plot. If None, all models in the pipeline are selected. show: int, optional (default=None) Number of best features to show in the plot. None to show all. n_repeats: int, optional (default=10) Number of times to permute each feature. title: str or None, optional (default=None) Plot's title. 
If None, the default option is used. figsize: tuple or None, optional (default=None) Figure's size, format as (x, y). If None, adapts size to show parameter. filename: str or None, optional (default=None) Name of the file (to save). If None, the figure is not saved. display: bool, optional (default=True) Whether to render the plot. Example from atom import ATOMClassifier atom = ATOMClassifier(X, y) atom.run(['LR', 'LDA'], metric='average_precision') atom.LDA.plot_permutation_importance(show=10, n_repeats=7)","title":"plot_permutation_importance"},{"location":"API/plots/plot_permutation_importance/#plot_permutation_importance","text":"method plot_permutation_importance (models=None, show=None, n_repeats=10, title=None, figsize=None, filename=None, display=True) [source] Plot the feature permutation importance of models. Calculating all permutations can be time consuming, especially if n_repeats is high. They are stored under the attribute permutations . This means that if a plot is repeated for the same model with the same n_repeats , it will be considerably faster. The feature_importance attribute is updated with the extracted importance ranking. Parameters: models: str, sequence or None, optional (default=None) Name of the models to plot. If None, all models in the pipeline are selected. show: int, optional (default=None) Number of best features to show in the plot. None to show all. n_repeats: int, optional (default=10) Number of times to permute each feature. title: str or None, optional (default=None) Plot's title. If None, the default option is used. figsize: tuple or None, optional (default=None) Figure's size, format as (x, y). If None, adapts size to show parameter. filename: str or None, optional (default=None) Name of the file (to save). If None, the figure is not saved. 
display: bool, optional (default=True) Whether to render the plot.","title":"plot_permutation_importance"},{"location":"API/plots/plot_permutation_importance/#example","text":"from atom import ATOMClassifier atom = ATOMClassifier(X, y) atom.run(['LR', 'LDA'], metric='average_precision') atom.LDA.plot_permutation_importance(show=10, n_repeats=7)","title":"Example"},{"location":"API/plots/plot_pipeline/","text":"plot_pipeline method plot_pipeline (show_params=True, title=None, figsize=None, filename=None, display=True) [source] Plot a diagram of every estimator in atom 's pipeline. Parameters: show_params: bool, optional (default=True) Whether to show the parameters used for every estimator. title: str or None, optional (default=None) Plot's title. If None, the default option is used. figsize: tuple or None, optional (default=None) Figure's size, format as (x, y). If None, adapts size to the length of the pipeline. filename: str or None, optional (default=None) Name of the file (to save). If None, the figure is not saved. display: bool, optional (default=True) Whether to render the plot. Example from atom import ATOMClassifier atom = ATOMClassifier(X, y) atom.impute(strat_num='median', strat_cat='drop', min_frac_rows=0.8) atom.encode(strategy='LeaveOneOut', max_onehot=8, frac_to_other=0.02) atom.outliers(strategy='drop', max_sigma=4, include_target=False) atom.feature_selection(strategy='PCA', n_features=10, max_frac_repeated=1., max_correlation=0.7) atom.run(['GBM', 'LGB'], metric='recall_weighted', n_calls=(10, 20), n_initial_points=(5, 12), bo_params={'base_estimator': 'RF', 'cv': 1, 'max_time': 1000}, bagging=4) atom.plot_pipeline()","title":"plot_pipeline"},{"location":"API/plots/plot_pipeline/#plot_pipeline","text":"method plot_pipeline (show_params=True, title=None, figsize=None, filename=None, display=True) [source] Plot a diagram of every estimator in atom 's pipeline. 
Parameters: show_params: bool, optional (default=True) Whether to show the parameters used for every estimator. title: str or None, optional (default=None) Plot's title. If None, the default option is used. figsize: tuple or None, optional (default=None) Figure's size, format as (x, y). If None, adapts size to the length of the pipeline. filename: str or None, optional (default=None) Name of the file (to save). If None, the figure is not saved. display: bool, optional (default=True) Whether to render the plot.","title":"plot_pipeline"},{"location":"API/plots/plot_pipeline/#example","text":"from atom import ATOMClassifier atom = ATOMClassifier(X, y) atom.impute(strat_num='median', strat_cat='drop', min_frac_rows=0.8) atom.encode(strategy='LeaveOneOut', max_onehot=8, frac_to_other=0.02) atom.outliers(strategy='drop', max_sigma=4, include_target=False) atom.feature_selection(strategy='PCA', n_features=10, max_frac_repeated=1., max_correlation=0.7) atom.run(['GBM', 'LGB'], metric='recall_weighted', n_calls=(10, 20), n_initial_points=(5, 12), bo_params={'base_estimator': 'RF', 'cv': 1, 'max_time': 1000}, bagging=4) atom.plot_pipeline()","title":"Example"},{"location":"API/plots/plot_prc/","text":"plot_prc method plot_prc (models=None, dataset='test', title=None, figsize=(10, 6), filename=None, display=True) [source] Plot the precision-recall curve. The legend shows the average precision (AP) score. Only for binary classification tasks. Parameters: models: str, sequence or None, optional (default=None) Name of the models to plot. If None, all models in the pipeline are selected. dataset: str, optional (default='test') Data set on which to calculate the metric. Options are 'train', 'test' or 'both'. title: str or None, optional (default=None) Plot's title. If None, the default option is used. figsize: tuple, optional (default=(10, 6)) Figure's size, format as (x, y). filename: str or None, optional (default=None) Name of the file (to save). 
If None, the figure is not saved. display: bool, optional (default=True) Whether to render the plot. Example from atom import ATOMClassifier atom = ATOMClassifier(X, y) atom.run(['LR', 'RF', 'LGB'], metric='average_precision') atom.plot_prc()","title":"plot_prc"},{"location":"API/plots/plot_prc/#plot_prc","text":"method plot_prc (models=None, dataset='test', title=None, figsize=(10, 6), filename=None, display=True) [source] Plot the precision-recall curve. The legend shows the average precision (AP) score. Only for binary classification tasks. Parameters: models: str, sequence or None, optional (default=None) Name of the models to plot. If None, all models in the pipeline are selected. dataset: str, optional (default='test') Data set on which to calculate the metric. Options are 'train', 'test' or 'both'. title: str or None, optional (default=None) Plot's title. If None, the default option is used. figsize: tuple, optional (default=(10, 6)) Figure's size, format as (x, y). filename: str or None, optional (default=None) Name of the file (to save). If None, the figure is not saved. display: bool, optional (default=True) Whether to render the plot.","title":"plot_prc"},{"location":"API/plots/plot_prc/#example","text":"from atom import ATOMClassifier atom = ATOMClassifier(X, y) atom.run(['LR', 'RF', 'LGB'], metric='average_precision') atom.plot_prc()","title":"Example"},{"location":"API/plots/plot_probabilities/","text":"plot_probabilities method plot_probabilities (models=None, dataset='test', target=1, title=None, figsize=(10, 6), filename=None, display=True) [source] Plot the probability distribution of the categories in the target column. Only for classification tasks. Parameters: models: str, sequence or None, optional (default=None) Name of the models to plot. If None, all models in the pipeline are selected. dataset: str, optional (default='test') Data set on which to calculate the metric. Options are 'train', 'test' or 'both'. 
target: int or str, optional (default=1) Probability of being that category in the target column as index or name. Only for multiclass classification tasks. title: str or None, optional (default=None) Plot's title. If None, the default option is used. figsize: tuple, optional (default=(10, 6)) Figure's size, format as (x, y). filename: str or None, optional (default=None) Name of the file (to save). If None, the figure is not saved. display: bool, optional (default=True) Whether to render the plot. Example from atom import ATOMClassifier atom = ATOMClassifier(X, y='RainTomorrow') atom.run('rf') atom.plot_probabilities(target='Yes', filename='probabilities_category_yes')","title":"plot_probabilities"},{"location":"API/plots/plot_probabilities/#plot_probabilities","text":"method plot_probabilities (models=None, dataset='test', target=1, title=None, figsize=(10, 6), filename=None, display=True) [source] Plot the probability distribution of the categories in the target column. Only for classification tasks. Parameters: models: str, sequence or None, optional (default=None) Name of the models to plot. If None, all models in the pipeline are selected. dataset: str, optional (default='test') Data set on which to calculate the metric. Options are 'train', 'test' or 'both'. target: int or str, optional (default=1) Probability of being that category in the target column as index or name. Only for multiclass classification tasks. title: str or None, optional (default=None) Plot's title. If None, the default option is used. figsize: tuple, optional (default=(10, 6)) Figure's size, format as (x, y). filename: str or None, optional (default=None) Name of the file (to save). If None, the figure is not saved. 
display: bool, optional (default=True) Whether to render the plot.","title":"plot_probabilities"},{"location":"API/plots/plot_probabilities/#example","text":"from atom import ATOMClassifier atom = ATOMClassifier(X, y='RainTomorrow') atom.run('rf') atom.plot_probabilities(target='Yes', filename='probabilities_category_yes')","title":"Example"},{"location":"API/plots/plot_residuals/","text":"plot_residuals method plot_residuals (models=None, dataset='test', title=None, figsize=(10, 6), filename=None, display=True) [source] The plot shows the residuals (difference between the predicted and the true value) on the vertical axis and the independent variable on the horizontal axis. The gray, intersected line shows the identity line. This plot can be useful to analyze the variance of the error of the regressor. If the points are randomly dispersed around the horizontal axis, a linear regression model is appropriate for the data; otherwise, a non-linear model is more appropriate. Only for regression tasks. Parameters: models: str, sequence or None, optional (default=None) Name of the models to plot. If None, all models in the pipeline are selected. dataset: str, optional (default='test') Data set on which to calculate the metric. Options are 'train', 'test' or 'both'. title: str or None, optional (default=None) Plot's title. If None, the default option is used. figsize: tuple, optional (default=(10, 6)) Figure's size, format as (x, y). filename: str or None, optional (default=None) Name of the file (to save). If None, the figure is not saved. display: bool, optional (default=True) Whether to render the plot. 
Example from atom import ATOMRegressor atom = ATOMRegressor(X, y) atom.run(['OLS', 'LGB'], metric='MAE') atom.plot_residuals()","title":"plot_residuals"},{"location":"API/plots/plot_residuals/#plot_residuals","text":"method plot_residuals (models=None, dataset='test', title=None, figsize=(10, 6), filename=None, display=True) [source] The plot shows the residuals (difference between the predicted and the true value) on the vertical axis and the independent variable on the horizontal axis. The gray, intersected line shows the identity line. This plot can be useful to analyze the variance of the error of the regressor. If the points are randomly dispersed around the horizontal axis, a linear regression model is appropriate for the data; otherwise, a non-linear model is more appropriate. Only for regression tasks. Parameters: models: str, sequence or None, optional (default=None) Name of the models to plot. If None, all models in the pipeline are selected. dataset: str, optional (default='test') Data set on which to calculate the metric. Options are 'train', 'test' or 'both'. title: str or None, optional (default=None) Plot's title. If None, the default option is used. figsize: tuple, optional (default=(10, 6)) Figure's size, format as (x, y). filename: str or None, optional (default=None) Name of the file (to save). If None, the figure is not saved. display: bool, optional (default=True) Whether to render the plot.","title":"plot_residuals"},{"location":"API/plots/plot_residuals/#example","text":"from atom import ATOMRegressor atom = ATOMRegressor(X, y) atom.run(['OLS', 'LGB'], metric='MAE') atom.plot_residuals()","title":"Example"},{"location":"API/plots/plot_rfecv/","text":"plot_rfecv method plot_rfecv (title=None, figsize=(10, 6), filename=None, display=True) [source] Plot the RFECV results, i.e. the scores obtained by the estimator fitted on every subset of the dataset. Only available if RFECV was applied on the data. 
Parameters: title: str or None, optional (default=None) Plot's title. If None, the default option is used. figsize: tuple, optional (default=(10, 6)) Figure's size, format as (x, y). filename: str or None, optional (default=None) Name of the file (to save). If None, the figure is not saved. display: bool, optional (default=True) Whether to render the plot. Example from atom import ATOMClassifier atom = ATOMClassifier(X, y) atom.feature_selection(strategy='RFECV', solver='LGB', scoring='precision') atom.plot_rfecv()","title":"plot_rfecv"},{"location":"API/plots/plot_rfecv/#plot_rfecv","text":"method plot_rfecv (title=None, figsize=(10, 6), filename=None, display=True) [source] Plot the RFECV results, i.e. the scores obtained by the estimator fitted on every subset of the dataset. Only available if RFECV was applied on the data. Parameters: title: str or None, optional (default=None) Plot's title. If None, the default option is used. figsize: tuple, optional (default=(10, 6)) Figure's size, format as (x, y). filename: str or None, optional (default=None) Name of the file (to save). If None, the figure is not saved. display: bool, optional (default=True) Whether to render the plot.","title":"plot_rfecv"},{"location":"API/plots/plot_rfecv/#example","text":"from atom import ATOMClassifier atom = ATOMClassifier(X, y) atom.feature_selection(strategy='RFECV', solver='LGB', scoring='precision') atom.plot_rfecv()","title":"Example"},{"location":"API/plots/plot_roc/","text":"plot_roc method plot_roc (models=None, dataset='test', title=None, figsize=(10, 6), filename=None, display=True) [source] Plot the Receiver Operating Characteristics curve. The legend shows the Area Under the ROC Curve (AUC) score. Only for binary classification tasks. Parameters: models: str, sequence or None, optional (default=None) Name of the models to plot. If None, all models in the pipeline are selected. dataset: str, optional (default='test') Data set on which to calculate the metric. 
Options are 'train', 'test' or 'both'. title: str or None, optional (default=None) Plot's title. If None, the default option is used. figsize: tuple, optional (default=(10, 6)) Figure's size, format as (x, y). filename: str or None, optional (default=None) Name of the file (to save). If None, the figure is not saved. display: bool, optional (default=True) Whether to render the plot. Example from atom import ATOMClassifier atom = ATOMClassifier(X, y) atom.run(['LR', 'RF', 'LGB'], metric='roc_auc') atom.plot_roc(filename='roc_curve.png')","title":"plot_roc"},{"location":"API/plots/plot_roc/#plot_roc","text":"method plot_roc (models=None, dataset='test', title=None, figsize=(10, 6), filename=None, display=True) [source] Plot the Receiver Operating Characteristics curve. The legend shows the Area Under the ROC Curve (AUC) score. Only for binary classification tasks. Parameters: models: str, sequence or None, optional (default=None) Name of the models to plot. If None, all models in the pipeline are selected. dataset: str, optional (default='test') Data set on which to calculate the metric. Options are 'train', 'test' or 'both'. title: str or None, optional (default=None) Plot's title. If None, the default option is used. figsize: tuple, optional (default=(10, 6)) Figure's size, format as (x, y). filename: str or None, optional (default=None) Name of the file (to save). If None, the figure is not saved. display: bool, optional (default=True) Whether to render the plot.","title":"plot_roc"},{"location":"API/plots/plot_roc/#example","text":"from atom import ATOMClassifier atom = ATOMClassifier(X, y) atom.run(['LR', 'RF', 'LGB'], metric='roc_auc') atom.plot_roc(filename='roc_curve.png')","title":"Example"},{"location":"API/plots/plot_successive_halving/","text":"plot_successive_halving method plot_successive_halving (models=None, metric=0, title=None, figsize=(10, 6), filename=None, display=True) [source] Plot of the models' scores per iteration of the successive halving. 
Only available if the models were fitted using successive halving . Parameters: models: str, sequence or None, optional (default=None) Name of the models to plot. If None, all the models in the pipeline are selected. metric: int or str, optional (default=0) Index or name of the metric to plot. Only for multi-metric runs. title: str or None, optional (default=None) Plot's title. If None, the default option is used. figsize: tuple, optional (default=(10, 6)) Figure's size, format as (x, y). filename: str or None, optional (default=None) Name of the file (to save). If None, the figure is not saved. display: bool, optional (default=True) Whether to render the plot. Example from atom import ATOMRegressor atom = ATOMRegressor(X, y) atom.successive_halving(['tree', 'bag', 'adab', 'et', 'rf', 'gbm', 'xgb', 'lgb'], metric='mse') atom.plot_successive_halving()","title":"plot_successive_halving"},{"location":"API/plots/plot_successive_halving/#plot_successive_halving","text":"method plot_successive_halving (models=None, metric=0, title=None, figsize=(10, 6), filename=None, display=True) [source] Plot of the models' scores per iteration of the successive halving. Only available if the models were fitted using successive halving . Parameters: models: str, sequence or None, optional (default=None) Name of the models to plot. If None, all the models in the pipeline are selected. metric: int or str, optional (default=0) Index or name of the metric to plot. Only for multi-metric runs. title: str or None, optional (default=None) Plot's title. If None, the default option is used. figsize: tuple, optional (default=(10, 6)) Figure's size, format as (x, y). filename: str or None, optional (default=None) Name of the file (to save). If None, the figure is not saved. 
display: bool, optional (default=True) Whether to render the plot.","title":"plot_successive_halving"},{"location":"API/plots/plot_successive_halving/#example","text":"from atom import ATOMRegressor atom = ATOMRegressor(X, y) atom.successive_halving(['tree', 'bag', 'adab', 'et', 'rf', 'gbm', 'xgb', 'lgb'], metric='mse') atom.plot_successive_halving()","title":"Example"},{"location":"API/plots/plot_threshold/","text":"plot_threshold method plot_threshold (models=None, metric=None, dataset='test', steps=100, title=None, figsize=(10, 6), filename=None, display=True) [source] Plot a metric's performance against threshold values. Only for binary classification tasks. Parameters: models: str, sequence or None, optional (default=None) Name of the models to plot. If None, all models in the pipeline are selected. metric: str, callable, sequence or None, optional (default=None) Metric(s) to plot. These can be one of sklearn's pre-defined scorers, a metric function or a sklearn scorer object (see the user guide ). If None, the metric used to run the pipeline is used. dataset: str, optional (default='test') Data set on which to calculate the metric. Options are 'train', 'test' or 'both'. steps: int, optional (default=100) Number of thresholds measured. title: str or None, optional (default=None) Plot's title. If None, the default option is used. figsize: tuple, optional (default=(10, 6)) Figure's size, format as (x, y). filename: str or None, optional (default=None) Name of the file (to save). If None, the figure is not saved. display: bool, optional (default=True) Whether to render the plot. 
Example from atom import ATOMClassifier from sklearn.metrics import recall_score atom = ATOMClassifier(X, y) atom.run('LGB') atom.plot_threshold(metric=['accuracy', 'f1', recall_score])","title":"plot_threshold"},{"location":"API/plots/plot_threshold/#plot_threshold","text":"method plot_threshold (models=None, metric=None, dataset='test', steps=100, title=None, figsize=(10, 6), filename=None, display=True) [source] Plot a metric's performance against threshold values. Only for binary classification tasks. Parameters: models: str, sequence or None, optional (default=None) Name of the models to plot. If None, all models in the pipeline are selected. metric: str, callable, sequence or None, optional (default=None) Metric(s) to plot. These can be one of sklearn's pre-defined scorers, a metric function or a sklearn scorer object (see the user guide ). If None, the metric used to run the pipeline is used. dataset: str, optional (default='test') Data set on which to calculate the metric. Options are 'train', 'test' or 'both'. steps: int, optional (default=100) Number of thresholds measured. title: str or None, optional (default=None) Plot's title. If None, the default option is used. figsize: tuple, optional (default=(10, 6)) Figure's size, format as (x, y). filename: str or None, optional (default=None) Name of the file (to save). If None, the figure is not saved. display: bool, optional (default=True) Whether to render the plot.","title":"plot_threshold"},{"location":"API/plots/plot_threshold/#example","text":"from atom import ATOMClassifier from sklearn.metrics import recall_score atom = ATOMClassifier(X, y) atom.run('LGB') atom.plot_threshold(metric=['accuracy', 'f1', recall_score])","title":"Example"},{"location":"API/plots/summary_plot/","text":"summary_plot method summary_plot (models=None, show=None, target=1, title=None, figsize=None, filename=None, display=True, **kwargs) [source] Plot SHAP's summary plot. 
Create a SHAP beeswarm plot, colored by feature values when they are provided. The explainer will be chosen automatically based on the model's type. Read more about SHAP plots in the user guide . Parameters: models: str, sequence or None, optional (default=None) Name of the models to plot. If None, all models in the pipeline are selected. Note that selecting multiple models will raise an exception. To avoid this, call the plot from a model . show: int or None, optional (default=None) Number of features to show in the plot. None to show all. target: int or str, optional (default=1) Category to look at in the target class as index or name. Only for multi-class classification tasks. title: str or None, optional (default=None) Plot's title. If None, the default option is used. figsize: tuple or None, optional (default=None) Figure's size, format as (x, y). If None, adapts size to show parameter. filename: str or None, optional (default=None) Name of the file (to save). If None, the figure is not saved. display: bool, optional (default=True) Whether to render the plot. **kwargs Additional keyword arguments for shap's summary_plot. Example from atom import ATOMRegressor atom = ATOMRegressor(X, y) atom.run('RF') atom.summary_plot(show=11)","title":"summary_plot"},{"location":"API/plots/summary_plot/#summary_plot","text":"method summary_plot (models=None, show=None, target=1, title=None, figsize=None, filename=None, display=True, **kwargs) [source] Plot SHAP's summary plot. Create a SHAP beeswarm plot, colored by feature values when they are provided. The explainer will be chosen automatically based on the model's type. Read more about SHAP plots in the user guide . Parameters: models: str, sequence or None, optional (default=None) Name of the models to plot. If None, all models in the pipeline are selected. Note that selecting multiple models will raise an exception. To avoid this, call the plot from a model . 
show: int or None, optional (default=None) Number of features to show in the plot. None to show all. target: int or str, optional (default=1) Category to look at in the target class as index or name. Only for multi-class classification tasks. title: str or None, optional (default=None) Plot's title. If None, the default option is used. figsize: tuple or None, optional (default=None) Figure's size, format as (x, y). If None, adapts size to show parameter. filename: str or None, optional (default=None) Name of the file (to save). If None, the figure is not saved. display: bool, optional (default=True) Whether to render the plot. **kwargs Additional keyword arguments for shap's summary_plot.","title":"summary_plot"},{"location":"API/plots/summary_plot/#example","text":"from atom import ATOMRegressor atom = ATOMRegressor(X, y) atom.run('RF') atom.summary_plot(show=11)","title":"Example"},{"location":"API/predicting/decision_function/","text":"decision_function method decision_function (X, verbose=None, **kwargs) [source] Transform the data and evaluate the decision function on new data. If called from a training instance, it will use the best model in the pipeline (under the winner attribute). If called from a model , it will use that model. The estimator must have a decision_function method. Parameters: X: dict, sequence, np.array or pd.DataFrame Data containing the features, with shape=(n_samples, n_features). verbose: int or None, optional (default=None) Verbosity level of the output. If None, it uses ATOM's verbosity. Possible values are: 0 to not print anything. 1 to print basic information. 2 to print detailed information. **kwargs Same keyword arguments as the transform method to include/exclude transformers from the transformations. 
Example from atom import ATOMClassifier atom = ATOMClassifier(X, y) atom.run('kSVM', metric='accuracy') # Evaluate the decision function on new data predictions = atom.ksvm.decision_function(X_new)","title":"decision_function"},{"location":"API/predicting/decision_function/#decision_function","text":"method decision_function (X, verbose=None, **kwargs) [source] Transform the data and evaluate the decision function on new data. If called from a training instance, it will use the best model in the pipeline (under the winner attribute). If called from a model , it will use that model. The estimator must have a decision_function method. Parameters: X: dict, sequence, np.array or pd.DataFrame Data containing the features, with shape=(n_samples, n_features). verbose: int or None, optional (default=None) Verbosity level of the output. If None, it uses ATOM's verbosity. Possible values are: 0 to not print anything. 1 to print basic information. 2 to print detailed information. **kwargs Same keyword arguments as the transform method to include/exclude transformers from the transformations.","title":"decision_function"},{"location":"API/predicting/decision_function/#example","text":"from atom import ATOMClassifier atom = ATOMClassifier(X, y) atom.run('kSVM', metric='accuracy') # Evaluate the decision function on new data predictions = atom.ksvm.decision_function(X_new)","title":"Example"},{"location":"API/predicting/predict/","text":"predict method predict (X, verbose=None, **kwargs) [source] Transform the data and make predictions on new data. If called from a training instance, it will use the best model in the pipeline (under the winner attribute). If called from a model , it will use that model. The estimator must have a predict method. Parameters: X: dict, sequence, np.array or pd.DataFrame Data containing the features, with shape=(n_samples, n_features). verbose: int or None, optional (default=None) Verbosity level of the output. If None, it uses ATOM's verbosity. 
Possible values are: 0 to not print anything. 1 to print basic information. 2 to print detailed information. **kwargs Same keyword arguments as the transform method to include/exclude transformers from the transformations. Example from atom import ATOMClassifier atom = ATOMClassifier(X, y) atom.run(['Tree', 'AdaB'], metric='AP', n_calls=10) # Make predictions on new data predictions = atom.adab.predict(X_new)","title":"predict"},{"location":"API/predicting/predict/#predict","text":"method predict (X, verbose=None, **kwargs) [source] Transform the data and make predictions on new data. If called from a training instance, it will use the best model in the pipeline (under the winner attribute). If called from a model , it will use that model. The estimator must have a predict method. Parameters: X: dict, sequence, np.array or pd.DataFrame Data containing the features, with shape=(n_samples, n_features). verbose: int or None, optional (default=None) Verbosity level of the output. If None, it uses ATOM's verbosity. Possible values are: 0 to not print anything. 1 to print basic information. 2 to print detailed information. **kwargs Same keyword arguments as the transform method to include/exclude transformers from the transformations.","title":"predict"},{"location":"API/predicting/predict/#example","text":"from atom import ATOMClassifier atom = ATOMClassifier(X, y) atom.run(['Tree', 'AdaB'], metric='AP', n_calls=10) # Make predictions on new data predictions = atom.adab.predict(X_new)","title":"Example"},{"location":"API/predicting/predict_log_proba/","text":"predict_log_proba method predict_log_proba (X, verbose=None, **kwargs) [source] Transform the data and make logarithmic probability predictions on new data. If called from a training instance, it will use the best model in the pipeline (under the winner attribute). If called from a model , it will use that model. The estimator must have a predict_proba method. 
Parameters: X: dict, sequence, np.array or pd.DataFrame Data containing the features, with shape=(n_samples, n_features). verbose: int or None, optional (default=None) Verbosity level of the output. If None, it uses ATOM's verbosity. Possible values are: 0 to not print anything. 1 to print basic information. 2 to print detailed information. **kwargs Same keyword arguments as the transform method to include/exclude transformers from the transformations. Example from atom import ATOMClassifier atom = ATOMClassifier(X, y) atom.run(['Tree', 'AdaB'], metric='AP', n_calls=10) # Make predictions on new data predictions = atom.adab.predict_log_proba(X_new)","title":"predict_log_proba"},{"location":"API/predicting/predict_log_proba/#predict_log_proba","text":"method predict_log_proba (X, verbose=None, **kwargs) [source] Transform the data and make logarithmic probability predictions on new data. If called from a training instance, it will use the best model in the pipeline (under the winner attribute). If called from a model , it will use that model. The estimator must have a predict_proba method. Parameters: X: dict, sequence, np.array or pd.DataFrame Data containing the features, with shape=(n_samples, n_features). verbose: int or None, optional (default=None) Verbosity level of the output. If None, it uses ATOM's verbosity. Possible values are: 0 to not print anything. 1 to print basic information. 2 to print detailed information. 
**kwargs Same keyword arguments as the transform method to include/exclude transformers from the transformations.","title":"predict_log_proba"},{"location":"API/predicting/predict_log_proba/#example","text":"from atom import ATOMClassifier atom = ATOMClassifier(X, y) atom.run(['Tree', 'AdaB'], metric='AP', n_calls=10) # Make predictions on new data predictions = atom.adab.predict_log_proba(X_new)","title":"Example"},{"location":"API/predicting/predict_proba/","text":"predict_proba method predict_proba (X, verbose=None, **kwargs) [source] Transform the data and make probabilistic predictions on new data. If called from a training instance, it will use the best model in the pipeline (under the winner attribute). If called from a model , it will use that model. The estimator must have a predict_proba method. Parameters: X: dict, sequence, np.array or pd.DataFrame Data containing the features, with shape=(n_samples, n_features). verbose: int or None, optional (default=None) Verbosity level of the output. If None, it uses ATOM's verbosity. Possible values are: 0 to not print anything. 1 to print basic information. 2 to print detailed information. **kwargs Same keyword arguments as the transform method to include/exclude transformers from the transformations. Example from atom import ATOMClassifier atom = ATOMClassifier(X, y) atom.run(['Tree', 'AdaB'], metric='AP', n_calls=10) # Make predictions on new data predictions = atom.adab.predict_proba(X_new)","title":"predict_proba"},{"location":"API/predicting/predict_proba/#predict_proba","text":"method predict_proba (X, verbose=None, **kwargs) [source] Transform the data and make probabilistic predictions on new data. If called from a training instance, it will use the best model in the pipeline (under the winner attribute). If called from a model , it will use that model. The estimator must have a predict_proba method. 
Parameters: X: dict, sequence, np.array or pd.DataFrame Data containing the features, with shape=(n_samples, n_features). verbose: int or None, optional (default=None) Verbosity level of the output. If None, it uses ATOM's verbosity. Possible values are: 0 to not print anything. 1 to print basic information. 2 to print detailed information. **kwargs Same keyword arguments as the transform method to include/exclude transformers from the transformations.","title":"predict_proba"},{"location":"API/predicting/predict_proba/#example","text":"from atom import ATOMClassifier atom = ATOMClassifier(X, y) atom.run(['Tree', 'AdaB'], metric='AP', n_calls=10) # Make predictions on new data predictions = atom.adab.predict_proba(X_new)","title":"Example"},{"location":"API/predicting/score/","text":"score method score (X, y, verbose=None, **kwargs) [source] Transform the data and return the model's score on new data. The score is a default evaluation criterion for the problem the estimator is designed to solve, defined by the estimator's package. If called from a training instance, it will use the best model in the pipeline (under the winner attribute). If called from a model , it will use that model. The estimator must have a score method. Parameters: X: dict, sequence, np.array or pd.DataFrame Data containing the features, with shape=(n_samples, n_features). y: int, str, sequence, np.array or pd.Series If int: Position of the target column in X. If string: Name of the target column in X. Else: Target column with shape=(n_samples,). verbose: int or None, optional (default=None) Verbosity level of the output. If None, it uses ATOM's verbosity. Possible values are: 0 to not print anything. 1 to print basic information. 2 to print detailed information. **kwargs Same keyword arguments as the transform method to include/exclude transformers from the transformations. Note The returned metric is determined by each estimator's score method pre-defined by its respective package. 
See its corresponding documentation for further details. Example from atom import ATOMClassifier atom = ATOMClassifier(X, y) atom.run(['MNB', 'KNN', 'kSVM'], metric='precision') # Get the mean accuracy on new data predictions = atom.kSVM.score(X_new, y_new)","title":"score"},{"location":"API/predicting/score/#score","text":"method score (X, y, verbose=None, **kwargs) [source] Transform the data and return the model's score on new data. The score is a default evaluation criterion for the problem the estimator is designed to solve, defined by the estimator's package. If called from a training instance, it will use the best model in the pipeline (under the winner attribute). If called from a model , it will use that model. The estimator must have a score method. Parameters: X: dict, sequence, np.array or pd.DataFrame Data containing the features, with shape=(n_samples, n_features). y: int, str, sequence, np.array or pd.Series If int: Position of the target column in X. If string: Name of the target column in X. Else: Target column with shape=(n_samples,). verbose: int or None, optional (default=None) Verbosity level of the output. If None, it uses ATOM's verbosity. Possible values are: 0 to not print anything. 1 to print basic information. 2 to print detailed information. **kwargs Same keyword arguments as the transform method to include/exclude transformers from the transformations. Note The returned metric is determined by each estimator's score method pre-defined by its respective package. 
See its corresponding documentation for further details.","title":"score"},{"location":"API/predicting/score/#example","text":"from atom import ATOMClassifier atom = ATOMClassifier(X, y) atom.run(['MNB', 'KNN', 'kSVM'], metric='precision') # Get the mean accuracy on new data predictions = atom.kSVM.score(X_new, y_new)","title":"Example"},{"location":"API/predicting/transform/","text":"transform method transform (X, y=None, verbose=None, **kwargs) [source] Transform new data through all the pre-processing steps in the pipeline. By default, all transformers are included except outliers and balance since they should only be applied on the training set. Can only be called from atom . Parameters: X: dict, sequence, np.array or pd.DataFrame Features to transform, with shape=(n_samples, n_features). y: int, str, sequence, np.array, pd.Series or None, optional (default=None) If None: y is ignored in the transformers. If int: Position of the target column in X. If string: Name of the target column in X. Else: Target column with shape=(n_samples,). verbose: int or None, optional (default=None) Verbosity level of the output. If None, it uses ATOM's verbosity. Possible values are: 0 to not print anything. 1 to print basic information. 2 to print detailed information. **kwargs Additional keyword arguments to customize which transformers to apply. You can either select them including their index in the pipeline parameter, e.g. pipeline=[0, 1, 4] or include/exclude them individually using their methods, e.g. impute=True or feature_selection=False . Note When using the pipeline parameter to include/exclude transformers, remember that the first transformer (index 0) in atom 's pipeline is always the StandardCleaner called during initialization. 
Example from atom import ATOMClassifier atom = ATOMClassifier(X, y) atom.impute(strat_num='knn', strat_cat='drop') atom.outliers(strategy='min_max', max_sigma=2) atom.feature_generation(strategy='gfg', n_features=3, generations=10, population=1000) # Apply only the StandardCleaner and Imputer on new data X_transformed = atom.transform(X_new, pipeline=[0, 1])","title":"transform"},{"location":"API/predicting/transform/#transform","text":"method transform (X, y=None, verbose=None, **kwargs) [source] Transform new data through all the pre-processing steps in the pipeline. By default, all transformers are included except outliers and balance since they should only be applied on the training set. Can only be called from atom . Parameters: X: dict, sequence, np.array or pd.DataFrame Features to transform, with shape=(n_samples, n_features). y: int, str, sequence, np.array, pd.Series or None, optional (default=None) If None: y is ignored in the transformers. If int: Position of the target column in X. If string: Name of the target column in X. Else: Target column with shape=(n_samples,). verbose: int or None, optional (default=None) Verbosity level of the output. If None, it uses ATOM's verbosity. Possible values are: 0 to not print anything. 1 to print basic information. 2 to print detailed information. **kwargs Additional keyword arguments to customize which transformers to apply. You can either select them including their index in the pipeline parameter, e.g. pipeline=[0, 1, 4] or include/exclude them individually using their methods, e.g. impute=True or feature_selection=False . 
Note When using the pipeline parameter to include/exclude transformers, remember that the first transformer (index 0) in atom 's pipeline is always the StandardCleaner called during initialization.","title":"transform"},{"location":"API/predicting/transform/#example","text":"from atom import ATOMClassifier atom = ATOMClassifier(X, y) atom.impute(strat_num='knn', strat_cat='drop') atom.outliers(strategy='min_max', max_sigma=2) atom.feature_generation(strategy='gfg', n_features=3, generations=10, population=1000) # Apply only the StandardCleaner and Imputer on new data X_transformed = atom.transform(X_new, pipeline=[0, 1])","title":"Example"},{"location":"API/training/successivehalvingclassifier/","text":"SuccessiveHalvingClassifier class atom.training. SuccessiveHalvingClassifier (models, metric=None, greater_is_better=True, needs_proba=False, needs_threshold=False, skip_iter=0, n_calls=10, n_random_points=5, bo_params={}, bagging=None, n_jobs=1, verbose=0, logger=None, random_state=None) [source] Fit and evaluate the models in a successive halving fashion. The pipeline applies the following steps per iteration: The optimal hyperparameters are selected using a bayesian optimization algorithm. The model is fitted on the complete training set using the best combinations of hyperparameters found. Using a bagging algorithm, various scores on the test set are calculated. Just like atom , you can predict , plot and call any model from the SuccessiveHalvingClassifier instance. Read more in the user guide . Parameters: models: str or sequence Models to fit on the data. Use the predefined acronyms to select the models. 
Possible values are (case insensitive): 'GP' for Gaussian Process (no hyperparameter tuning) 'GNB' for Gaussian Naive Bayes (no hyperparameter tuning) 'MNB' for Multinomial Naive Bayes 'BNB' for Bernoulli Naive Bayes 'Ridge' for Ridge Linear Classification 'LR' for Logistic Regression 'LDA' for Linear Discriminant Analysis 'QDA' for Quadratic Discriminant Analysis 'KNN' for K-Nearest Neighbors 'Tree' for a single Decision Tree 'Bag' for Bagging (uses a decision tree as base estimator) 'ET' for Extra-Trees 'RF' for Random Forest 'AdaB' for AdaBoost (uses a decision tree as base estimator) 'GBM' for Gradient Boosting Machine 'XGB' for XGBoost (only available if package is installed) 'LGB' for LightGBM (only available if package is installed) 'CatB' for CatBoost (only available if package is installed) 'lSVM' for Linear Support Vector Machine (uses a one-vs-rest strategy for multiclass classification) 'kSVM' for Kernel Support Vector Machine (uses a one-vs-one strategy for multiclass classification) 'PA' for Passive Aggressive 'SGD' for Stochastic Gradient Descent 'MLP' for Multilayer Perceptron (can have between one and three hidden layers) metric: str, callable or sequence, optional (default=None) Metric(s) on which the pipeline fits the models. Choose from any of sklearn's predefined scorers , use a score (or loss) function with signature metric(y, y_pred, **kwargs) or use a scorer object. If multiple metrics are selected, only the first will be used to optimize the BO. If None, a default metric is selected: 'f1' for binary classification 'f1_weighted' for multiclass classification 'r2' for regression greater_is_better: bool or sequence, optional (default=True) Whether the metric is a score function or a loss function, i.e. if True, a higher score is better and if False, lower is better. Will be ignored if the metric is a string or a scorer. If sequence, the n-th value will apply to the n-th metric in the pipeline. 
needs_proba: bool or sequence, optional (default=False) Whether the metric function requires probability estimates out of a classifier. If True, make sure that every model in the pipeline has a predict_proba method. Will be ignored if the metric is a string or a scorer. If sequence, the n-th value will apply to the n-th metric in the pipeline. needs_threshold: bool or sequence, optional (default=False) Whether the metric function takes a continuous decision certainty. This only works for binary classification using estimators that have either a decision_function or predict_proba method. Will be ignored if the metric is a string or a scorer. If sequence, the n-th value will apply to the n-th metric in the pipeline. skip_iter: int, optional (default=0) Skip last skip_iter iterations of the successive halving. n_calls: int or sequence, optional (default=0) Maximum number of iterations of the BO (including n_initial_points ). If 0, skip the BO and fit the model on its default parameters. If sequence, the n-th value will apply to the n-th model in the pipeline. n_initial_points: int or sequence, optional (default=5) Initial number of random tests of the BO before fitting the surrogate function. If equal to n_calls , the optimizer will technically be performing a random search. If sequence, the n-th value will apply to the n-th model in the pipeline. bo_params: dict, optional (default={}) Dictionary of extra keyword arguments for the BO. These can include: base_estimator: str, optional (default='GP') Base estimator to use in the BO. Choose from: 'GP' for Gaussian Process 'RF' for Random Forest 'ET' for Extra-Trees 'GBRT' for Gradient Boosted Regression Trees max_time: int, optional (default=np.inf) Stop the optimization after max_time seconds. delta_x: int or float, optional (default=0) Stop the optimization when |x1 - x2| < delta_x . delta_y: int or float, optional (default=0) Stop the optimization if the 5 minima are within delta_y . 
cv: int, optional (default=5) Number of folds for the cross-validation. If 1, the training set will be randomly split in a subtrain and validation set. early stopping: int, float or None, optional (default=None) Training will stop if the model didn't improve in last early_stopping rounds. If <1, fraction of rounds from the total. If None, no early stopping is performed. Only available for models that allow in-training evaluation. callback: callable or list of callables, optional (default=None) Callbacks for the BO. dimensions: dict, array or None, optional (default=None) Custom hyperparameter space for the bayesian optimization. Can be an array (only if there is 1 model in the pipeline) or a dictionary with the model's name as key. If None, ATOM's predefined dimensions are used. plot_bo: bool, optional (default=False) Whether to plot the BO's progress as it runs. Creates a canvas with two plots: the first plot shows the score of every trial and the second shows the distance between the last consecutive steps. Don't forget to call %matplotlib at the start of the cell if you are using an interactive notebook! Additional keyword argument for skopt's optimizer. bagging: int or None, optional (default=None) Number of data sets (bootstrapped from the training set) to use in the bagging algorithm. If None or 0, no bagging is performed. If sequence, the n-th value will apply to the n-th model in the pipeline. n_jobs: int, optional (default=1) Number of cores to use for parallel processing. If >0: Number of cores to use. If -1: Use all available cores. If <-1: Use available_cores - 1 + n_jobs. Beware that using multiple processes on the same machine may cause memory issues for large datasets. verbose: int, optional (default=0) Verbosity level of the class. Possible values are: 0 to not print anything. 1 to print basic information. 2 to print detailed information. logger: bool, str, class or None, optional (default=None) If None: Doesn't save a logging file. 
If bool: True for logging file with default name. False for no logger. If str: Name of the logging file. 'auto' to create an automatic name. If class: python Logger object. random_state: int or None, optional (default=None) Seed used by the random number generator. If None, the random number generator is the RandomState instance used by numpy.random . Attributes Data attributes The instance's dataset can be accessed through multiple attributes, e.g. calling trainer.train will return the training set. The data can also be changed through these attributes, e.g. trainer.test = trainer.test.drop(0) will drop the first row from the test set. Doing this will automatically update the other data attributes. Attributes: dataset: pd.DataFrame Complete dataset in the pipeline. train: pd.DataFrame Training set. test: pd.DataFrame Test set. X: pd.DataFrame Feature set. y: pd.Series Target column. X_train: pd.DataFrame Training features. y_train: pd.Series Training target. X_test: pd.DataFrame Test features. y_test: pd.Series Test target. shape: tuple Dataset's shape in the form (rows x columns). columns: list List of columns in the dataset. target: str Name of the target column. categories: list Sorted list of the unique categories in the target column. n_categories: int Number of unique categories in the target column. Utility attributes Attributes: models: list List of models in the pipeline. metric: str or list Metric(s) used to fit the models. errors: dict Dictionary of the encountered exceptions (if any). winner: model Model subclass that performed best on the test set. results: pd.DataFrame Dataframe of the training results with the model acronyms as index. Columns can include: name: Name of the model. metric_bo: Best score achieved during the BO. time_bo: Time spent on the BO. metric_train: Metric score on the training set. metric_test: Metric score on the test set. time_fit: Time spent fitting and evaluating. mean_bagging: Mean score of the bagging's results. 
std_bagging: Standard deviation score of the bagging's results. time_bagging: Time spent on the bagging algorithm. time: Total time spent on the whole run. Plot attributes Attributes: style: str Plotting style. See seaborn's documentation . palette: str Color palette. See seaborn's documentation . title_fontsize: int Fontsize for the plot's title. label_fontsize: int Fontsize for labels and legends. tick_fontsize: int Fontsize for the ticks along the plot's axes. Methods calibrate Calibrate the winning model. clear Remove a model from the pipeline. get_params Get parameters for this estimator. log Save information to the logger and print to stdout. run Fit and evaluate the models. save Save the instance to a pickle file. scoring Print the scoring of the models for a specific metric. set_params Set the parameters of this estimator. method calibrate (**kwargs) [source] Applies probability calibration on the winning model. The calibration is done with the CalibratedClassifierCV class from sklearn. The model will be trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the estimator attribute. After calibrating, all prediction attributes of the winning model will reset. Parameters: **kwargs Additional keyword arguments for the CalibratedClassifierCV instance. Using cv='prefit' will use the trained model and fit the calibrator on the test set. Note that doing this will result in data leakage in the test set. Use this only if you have another, independent set for testing. method clear (models='all') [source] Removes all traces of a model in the pipeline (except for the errors attribute). If all models in the pipeline are removed, the metric is reset. Use this method to remove unwanted models from the pipeline or to clear memory before saving the instance. Parameters: models: str or sequence, optional (default='all') Model(s) to clear from the pipeline. If 'all', clear all models. 
method get_params (deep=True) [source] Get parameters for this estimator. Parameters: deep: bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators. Returns: params: dict Dictionary of the parameter names mapped to their values. method log (msg, level=0) [source] Write a message to the logger and print it to stdout. Parameters: msg: str Message to write to the logger and print to stdout. level: int, optional (default=0) Minimum verbosity level in order to print the message. method run (*arrays) [source] Fit and evaluate the models. Parameters: *arrays: array-like Either a train and test set or X_train, X_test, y_train, y_test. method save (filename=None, save_data=True) [source] Save the instance to a pickle file. Remember that the class contains the complete dataset as property, so the file can become large for big datasets! To avoid this, use save_data=False . Parameters: filename: str or None, optional (default=None) Name to save the file with. If None or 'auto', use default name (SuccessiveHalvingClassifier). save_data: bool, optional (default=True) Whether to save the data as an attribute of the instance. If False, remember to update the data immediately after loading the pickle using the dataset's @setter . method scoring (metric=None, dataset='test') [source] Print the scoring of the models for a specific metric. If a model shows a XXX , it means the metric failed for that specific model. This can happen if either the metric is unavailable for the task or if the model does not have a predict_proba method while the metric requires it. Parameters: metric: str or None, optional (default=None) Name of the metric to calculate. Choose from any of sklearn's SCORERS or one of the following custom metrics: 'cm' or 'confusion_matrix' for an array of the confusion matrix. 'tn' for true negatives. 'fp' for false positives. 'fn' for false negatives. 'lift' for the lift metric. 
'fpr' for the false positive rate. 'tpr' for true positive rate. 'sup' for the support metric. If None, returns the final results for the model (ignores the dataset parameter). dataset: str, optional (default='test') Data set on which to calculate the metric. Options are 'train' or 'test'. method set_params (**params) [source] Set the parameters of this estimator. Parameters: **params: dict Estimator parameters. Returns: self: SuccessiveHalvingClassifier Estimator instance. Example from atom.training import SuccessiveHalvingClassifier # Run the pipeline trainer = SuccessiveHalvingClassifier(['Tree', 'Bag', 'RF', 'ET'], n_calls=5, n_initial_points=3) trainer.run(train, test) # Analyze the results trainer.plot_successive_halving()","title":"SuccessiveHalvingClassifier"},{"location":"API/training/successivehalvingclassifier/#successivehalvingclassifier","text":"class atom.training. SuccessiveHalvingClassifier (models, metric=None, greater_is_better=True, needs_proba=False, needs_threshold=False, skip_iter=0, n_calls=10, n_random_points=5, bo_params={}, bagging=None, n_jobs=1, verbose=0, logger=None, random_state=None) [source] Fit and evaluate the models in a successive halving fashion. The pipeline applies the following steps per iteration: The optimal hyperparameters are selected using a bayesian optimization algorithm. The model is fitted on the complete training set using the best combinations of hyperparameters found. Using a bagging algorithm, various scores on the test set are calculated. Just like atom , you can predict , plot and call any model from the SuccessiveHalvingClassifier instance. Read more in the user guide . Parameters: models: str or sequence Models to fit on the data. Use the predefined acronyms to select the models. 
Possible values are (case insensitive): 'GP' for Gaussian Process (no hyperparameter tuning) 'GNB' for Gaussian Naive Bayes (no hyperparameter tuning) 'MNB' for Multinomial Naive Bayes 'BNB' for Bernoulli Naive Bayes 'Ridge' for Ridge Linear Classification 'LR' for Logistic Regression 'LDA' for Linear Discriminant Analysis 'QDA' for Quadratic Discriminant Analysis 'KNN' for K-Nearest Neighbors 'Tree' for a single Decision Tree 'Bag' for Bagging (uses a decision tree as base estimator) 'ET' for Extra-Trees 'RF' for Random Forest 'AdaB' for AdaBoost (uses a decision tree as base estimator) 'GBM' for Gradient Boosting Machine 'XGB' for XGBoost (only available if package is installed) 'LGB' for LightGBM (only available if package is installed) 'CatB' for CatBoost (only available if package is installed) 'lSVM' for Linear Support Vector Machine (uses a one-vs-rest strategy for multiclass classification) 'kSVM' for Kernel Support Vector Machine (uses a one-vs-one strategy for multiclass classification) 'PA' for Passive Aggressive 'SGD' for Stochastic Gradient Descent 'MLP' for Multilayer Perceptron (can have between one and three hidden layers) metric: str, callable or sequence, optional (default=None) Metric(s) on which the pipeline fits the models. Choose from any of sklearn's predefined scorers , use a score (or loss) function with signature metric(y, y_pred, **kwargs) or use a scorer object. If multiple metrics are selected, only the first will be used to optimize the BO. If None, a default metric is selected: 'f1' for binary classification 'f1_weighted' for multiclass classification 'r2' for regression greater_is_better: bool or sequence, optional (default=True) Whether the metric is a score function or a loss function, i.e. if True, a higher score is better and if False, lower is better. Will be ignored if the metric is a string or a scorer. If sequence, the n-th value will apply to the n-th metric in the pipeline. 
needs_proba: bool or sequence, optional (default=False) Whether the metric function requires probability estimates out of a classifier. If True, make sure that every model in the pipeline has a predict_proba method. Will be ignored if the metric is a string or a scorer. If sequence, the n-th value will apply to the n-th metric in the pipeline. needs_threshold: bool or sequence, optional (default=False) Whether the metric function takes a continuous decision certainty. This only works for binary classification using estimators that have either a decision_function or predict_proba method. Will be ignored if the metric is a string or a scorer. If sequence, the n-th value will apply to the n-th metric in the pipeline. skip_iter: int, optional (default=0) Skip last skip_iter iterations of the successive halving. n_calls: int or sequence, optional (default=0) Maximum number of iterations of the BO (including n_initial_points ). If 0, skip the BO and fit the model on its default parameters. If sequence, the n-th value will apply to the n-th model in the pipeline. n_initial_points: int or sequence, optional (default=5) Initial number of random tests of the BO before fitting the surrogate function. If equal to n_calls , the optimizer will technically be performing a random search. If sequence, the n-th value will apply to the n-th model in the pipeline. bo_params: dict, optional (default={}) Dictionary of extra keyword arguments for the BO. These can include: base_estimator: str, optional (default='GP') Base estimator to use in the BO. Choose from: 'GP' for Gaussian Process 'RF' for Random Forest 'ET' for Extra-Trees 'GBRT' for Gradient Boosted Regression Trees max_time: int, optional (default=np.inf) Stop the optimization after max_time seconds. delta_x: int or float, optional (default=0) Stop the optimization when |x1 - x2| < delta_x . delta_y: int or float, optional (default=0) Stop the optimization if the 5 minima are within delta_y . 
cv: int, optional (default=5) Number of folds for the cross-validation. If 1, the training set will be randomly split in a subtrain and validation set. early stopping: int, float or None, optional (default=None) Training will stop if the model didn't improve in last early_stopping rounds. If <1, fraction of rounds from the total. If None, no early stopping is performed. Only available for models that allow in-training evaluation. callback: callable or list of callables, optional (default=None) Callbacks for the BO. dimensions: dict, array or None, optional (default=None) Custom hyperparameter space for the bayesian optimization. Can be an array (only if there is 1 model in the pipeline) or a dictionary with the model's name as key. If None, ATOM's predefined dimensions are used. plot_bo: bool, optional (default=False) Whether to plot the BO's progress as it runs. Creates a canvas with two plots: the first plot shows the score of every trial and the second shows the distance between the last consecutive steps. Don't forget to call %matplotlib at the start of the cell if you are using an interactive notebook! Additional keyword argument for skopt's optimizer. bagging: int or None, optional (default=None) Number of data sets (bootstrapped from the training set) to use in the bagging algorithm. If None or 0, no bagging is performed. If sequence, the n-th value will apply to the n-th model in the pipeline. n_jobs: int, optional (default=1) Number of cores to use for parallel processing. If >0: Number of cores to use. If -1: Use all available cores. If <-1: Use available_cores - 1 + n_jobs. Beware that using multiple processes on the same machine may cause memory issues for large datasets. verbose: int, optional (default=0) Verbosity level of the class. Possible values are: 0 to not print anything. 1 to print basic information. 2 to print detailed information. logger: bool, str, class or None, optional (default=None) If None: Doesn't save a logging file. 
If bool: True for logging file with default name. False for no logger. If str: Name of the logging file. 'auto' to create an automatic name. If class: python Logger object. random_state: int or None, optional (default=None) Seed used by the random number generator. If None, the random number generator is the RandomState instance used by numpy.random .","title":"SuccessiveHalvingClassifier"},{"location":"API/training/successivehalvingclassifier/#attributes","text":"","title":"Attributes"},{"location":"API/training/successivehalvingclassifier/#data-attributes","text":"The instance's dataset can be accessed through multiple attributes, e.g. calling trainer.train will return the training set. The data can also be changed through these attributes, e.g. trainer.test = trainer.test.drop(0) will drop the first row from the test set. Doing this will automatically update the other data attributes. Attributes: dataset: pd.DataFrame Complete dataset in the pipeline. train: pd.DataFrame Training set. test: pd.DataFrame Test set. X: pd.DataFrame Feature set. y: pd.Series Target column. X_train: pd.DataFrame Training features. y_train: pd.Series Training target. X_test: pd.DataFrame Test features. y_test: pd.Series Test target. shape: tuple Dataset's shape in the form (rows x columns). columns: list List of columns in the dataset. target: str Name of the target column. categories: list Sorted list of the unique categories in the target column. n_categories: int Number of unique categories in the target column.","title":"Data attributes"},{"location":"API/training/successivehalvingclassifier/#utility-attributes","text":"Attributes: models: list List of models in the pipeline. metric: str or list Metric(s) used to fit the models. errors: dict Dictionary of the encountered exceptions (if any). winner: model Model subclass that performed best on the test set. results: pd.DataFrame Dataframe of the training results with the model acronyms as index. 
Columns can include: name: Name of the model. metric_bo: Best score achieved during the BO. time_bo: Time spent on the BO. metric_train: Metric score on the training set. metric_test: Metric score on the test set. time_fit: Time spent fitting and evaluating. mean_bagging: Mean score of the bagging's results. std_bagging: Standard deviation score of the bagging's results. time_bagging: Time spent on the bagging algorithm. time: Total time spent on the whole run.","title":"Utility attributes"},{"location":"API/training/successivehalvingclassifier/#plot-attributes","text":"Attributes: style: str Plotting style. See seaborn's documentation . palette: str Color palette. See seaborn's documentation . title_fontsize: int Fontsize for the plot's title. label_fontsize: int Fontsize for labels and legends. tick_fontsize: int Fontsize for the ticks along the plot's axes.","title":"Plot attributes"},{"location":"API/training/successivehalvingclassifier/#methods","text":"calibrate Calibrate the winning model. clear Remove a model from the pipeline. get_params Get parameters for this estimator. log Save information to the logger and print to stdout. run Fit and evaluate the models. save Save the instance to a pickle file. scoring Print the scoring of the models for a specific metric. set_params Set the parameters of this estimator. method calibrate (**kwargs) [source] Applies probability calibration on the winning model. The calibration is done with the CalibratedClassifierCV class from sklearn. The model will be trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the estimator attribute. After calibrating, all prediction attributes of the winning model will reset. Parameters: **kwargs Additional keyword arguments for the CalibratedClassifierCV instance. Using cv='prefit' will use the trained model and fit the calibrator on the test set. 
Note that doing this will result in data leakage in the test set. Use this only if you have another, independent set for testing. method clear (models='all') [source] Removes all traces of a model in the pipeline (except for the errors attribute). If all models in the pipeline are removed, the metric is reset. Use this method to remove unwanted models from the pipeline or to clear memory before saving the instance. Parameters: models: str or sequence, optional (default='all') Model(s) to clear from the pipeline. If 'all', clear all models. method get_params (deep=True) [source] Get parameters for this estimator. Parameters: deep: bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators. Returns: params: dict Dictionary of the parameter names mapped to their values. method log (msg, level=0) [source] Write a message to the logger and print it to stdout. Parameters: msg: str Message to write to the logger and print to stdout. level: int, optional (default=0) Minimum verbosity level in order to print the message. method run (*arrays) [source] Fit and evaluate the models. Parameters: *arrays: array-like Either a train and test set or X_train, X_test, y_train, y_test. method save (filename=None, save_data=True) [source] Save the instance to a pickle file. Remember that the class contains the complete dataset as property, so the file can become large for big datasets! To avoid this, use save_data=False . Parameters: filename: str or None, optional (default=None) Name to save the file with. If None or 'auto', use default name (SuccessiveHalvingClassifier). save_data: bool, optional (default=True) Whether to save the data as an attribute of the instance. If False, remember to update the data immediately after loading the pickle using the dataset's @setter . method scoring (metric=None, dataset='test') [source] Print the scoring of the models for a specific metric. 
If a model shows a XXX , it means the metric failed for that specific model. This can happen if either the metric is unavailable for the task or if the model does not have a predict_proba method while the metric requires it. Parameters: metric: str or None, optional (default=None) Name of the metric to calculate. Choose from any of sklearn's SCORERS or one of the following custom metrics: 'cm' or 'confusion_matrix' for an array of the confusion matrix. 'tn' for true negatives. 'fp' for false positives. 'fn' for false negatives. 'lift' for the lift metric. 'fpr' for the false positive rate. 'tpr' for true positive rate. 'sup' for the support metric. If None, returns the final results for the model (ignores the dataset parameter). dataset: str, optional (default='test') Data set on which to calculate the metric. Options are 'train' or 'test'. method set_params (**params) [source] Set the parameters of this estimator. Parameters: **params: dict Estimator parameters. Returns: self: SuccessiveHalvingClassifier Estimator instance.","title":"Methods"},{"location":"API/training/successivehalvingclassifier/#example","text":"from atom.training import SuccessiveHalvingClassifier # Run the pipeline trainer = SuccessiveHalvingClassifier(['Tree', 'Bag', 'RF', 'ET'], n_calls=5, n_initial_points=3) trainer.run(train, test) # Analyze the results trainer.plot_successive_halving()","title":"Example"},{"location":"API/training/successivehalvingregressor/","text":"SuccessiveHalvingRegressor class atom.training. SuccessiveHalvingRegressor (models, metric=None, greater_is_better=True, needs_proba=False, needs_threshold=False, skip_iter=0, n_calls=10, n_random_points=5, bo_params={}, bagging=None, n_jobs=1, verbose=0, logger=None, random_state=None) [source] Fit and evaluate the models in a successive halving fashion. The pipeline applies the following steps per iteration: The optimal hyperparameters are selected using a bayesian optimization algorithm. 
The model is fitted on the complete training set using the best combinations of hyperparameters found. Using a bagging algorithm, various scores on the test set are calculated. Just like atom , you can predict , plot and call any model from the SuccessiveHalvingRegressor instance. Read more in the user guide . Parameters: models: str or sequence List of models to fit on the data. Use the predefined acronyms to select the models. Possible values are (case insensitive): 'GP' for Gaussian Process (no hyperparameter tuning) 'OLS' for Ordinary Least Squares (no hyperparameter tuning) 'Ridge' for Ridge Linear Regression 'Lasso' for Lasso Linear Regression 'EN' for ElasticNet Linear Regression 'BR' for Bayesian Regression (uses ridge regularization) 'KNN' for K-Nearest Neighbors 'Tree' for a single Decision Tree 'Bag' for Bagging (uses a decision tree as base estimator) 'ET' for Extra-Trees 'RF' for Random Forest 'AdaB' for AdaBoost (uses a decision tree as base estimator) 'GBM' for Gradient Boosting Machine 'XGB' for XGBoost (only available if package is installed) 'LGB' for LightGBM (only available if package is installed) 'CatB' for CatBoost (only available if package is installed) 'lSVM' for Linear Support Vector Machine (uses a one-vs-rest strategy for multiclass classification) 'kSVM' for Kernel Support Vector Machine (uses a one-vs-one strategy for multiclass classification) 'PA' for Passive Aggressive 'SGD' for Stochastic Gradient Descent 'MLP' for Multilayer Perceptron (can have between one and three hidden layers) metric: str, callable or sequence, optional (default=None) Metric(s) on which the pipeline fits the models. Choose from any of sklearn's predefined scorers , use a score (or loss) function with signature metric(y, y_pred, **kwargs) or use a scorer object. If multiple metrics are selected, only the first will be used to optimize the BO. 
If None, a default metric is selected: 'f1' for binary classification 'f1_weighted' for multiclass classification 'r2' for regression greater_is_better: bool or sequence, optional (default=True) Whether the metric is a score function or a loss function, i.e. if True, a higher score is better and if False, lower is better. Will be ignored if the metric is a string or a scorer. If sequence, the n-th value will apply to the n-th metric in the pipeline. needs_proba: bool or sequence, optional (default=False) Whether the metric function requires probability estimates out of a classifier. If True, make sure that every model in the pipeline has a predict_proba method. Will be ignored if the metric is a string or a scorer. If sequence, the n-th value will apply to the n-th metric in the pipeline. needs_threshold: bool or sequence, optional (default=False) Whether the metric function takes a continuous decision certainty. This only works for binary classification using estimators that have either a decision_function or predict_proba method. Will be ignored if the metric is a string or a scorer. If sequence, the n-th value will apply to the n-th metric in the pipeline. skip_iter: int, optional (default=0) Skip last skip_iter iterations of the successive halving. n_calls: int or sequence, optional (default=0) Maximum number of iterations of the BO (including n_initial_points ). If 0, skip the BO and fit the model on its default parameters. If sequence, the n-th value will apply to the n-th model in the pipeline. n_initial_points: int or sequence, optional (default=5) Initial number of random tests of the BO before fitting the surrogate function. If equal to n_calls , the optimizer will technically be performing a random search. If sequence, the n-th value will apply to the n-th model in the pipeline. bo_params: dict, optional (default={}) Dictionary of extra keyword arguments for the BO. 
These can include: base_estimator: str, optional (default='GP') Base estimator to use in the BO. Choose from: 'GP' for Gaussian Process 'RF' for Random Forest 'ET' for Extra-Trees 'GBRT' for Gradient Boosted Regression Trees max_time: int, optional (default=np.inf) Stop the optimization after max_time seconds. delta_x: int or float, optional (default=0) Stop the optimization when |x1 - x2| < delta_x . delta_y: int or float, optional (default=0) Stop the optimization if the 5 minima are within delta_y . cv: int, optional (default=5) Number of folds for the cross-validation. If 1, the training set will be randomly split in a subtrain and validation set. early stopping: int, float or None, optional (default=None) Training will stop if the model didn't improve in last early_stopping rounds. If <1, fraction of rounds from the total. If None, no early stopping is performed. Only available for models that allow in-training evaluation. callback: callable or list of callables, optional (default=None) Callbacks for the BO. dimensions: dict, array or None, optional (default=None) Custom hyperparameter space for the bayesian optimization. Can be an array (only if there is 1 model in the pipeline) or a dictionary with the model's name as key. If None, ATOM's predefined dimensions are used. plot_bo: bool, optional (default=False) Whether to plot the BO's progress as it runs. Creates a canvas with two plots: the first plot shows the score of every trial and the second shows the distance between the last consecutive steps. Don't forget to call %matplotlib at the start of the cell if you are using an interactive notebook! Additional keyword argument for skopt's optimizer. bagging: int or None, optional (default=None) Number of data sets (bootstrapped from the training set) to use in the bagging algorithm. If None or 0, no bagging is performed. If sequence, the n-th value will apply to the n-th model in the pipeline. 
n_jobs: int, optional (default=1) Number of cores to use for parallel processing. If >0: Number of cores to use. If -1: Use all available cores. If <-1: Use available_cores - 1 + n_jobs. Beware that using multiple processes on the same machine may cause memory issues for large datasets. verbose: int, optional (default=0) Verbosity level of the class. Possible values are: 0 to not print anything. 1 to print basic information. 2 to print detailed information. logger: bool, str, class or None, optional (default=None) If None: Doesn't save a logging file. If bool: True for logging file with default name. False for no logger. If str: Name of the logging file. 'auto' to create an automatic name. If class: python Logger object. random_state: int or None, optional (default=None) Seed used by the random number generator. If None, the random number generator is the RandomState instance used by numpy.random . Attributes Data attributes The instance's dataset can be accessed through multiple attributes, e.g. calling trainer.train will return the training set. The data can also be changed through these attributes, e.g. trainer.test = trainer.test.drop(0) will drop the first row from the test set. Doing this will automatically update the other data attributes. Attributes: dataset: pd.DataFrame Complete dataset in the pipeline. train: pd.DataFrame Training set. test: pd.DataFrame Test set. X: pd.DataFrame Feature set. y: pd.Series Target column. X_train: pd.DataFrame Training features. y_train: pd.Series Training target. X_test: pd.DataFrame Test features. y_test: pd.Series Test target. shape: tuple Dataset's shape in the form (rows x columns). columns: list List of columns in the dataset. target: str Name of the target column. Utility attributes Attributes: models: list List of models in the pipeline. metric: str or list Metric(s) used to fit the models. errors: dict Dictionary of the encountered exceptions (if any). 
winner: model Model subclass that performed best on the test set. results: pd.DataFrame Dataframe of the training results with the model acronyms as index. Columns can include: name: Name of the model. metric_bo: Best score achieved during the BO. time_bo: Time spent on the BO. metric_train: Metric score on the training set. metric_test: Metric score on the test set. time_fit: Time spent fitting and evaluating. mean_bagging: Mean score of the bagging's results. std_bagging: Standard deviation score of the bagging's results. time_bagging: Time spent on the bagging algorithm. time: Total time spent on the whole run. Plot attributes Attributes: style: str Plotting style. See seaborn's documentation . palette: str Color palette. See seaborn's documentation . title_fontsize: int Fontsize for the plot's title. label_fontsize: int Fontsize for labels and legends. tick_fontsize: int Fontsize for the ticks along the plot's axes. Methods clear Remove a model from the pipeline. get_params Get parameters for this estimator. log Save information to the logger and print to stdout. run Fit and evaluate the models. save Save the instance to a pickle file. scoring Print the scoring of the models for a specific metric. set_params Set the parameters of this estimator. method clear (models='all') [source] Removes all traces of a model in the pipeline (except for the errors attribute). If all models in the pipeline are removed, the metric is reset. Use this method to remove unwanted models from the pipeline or to clear memory before saving the instance. Parameters: models: str or sequence, optional (default='all') Model(s) to clear from the pipeline. If 'all', clear all models. method get_params (deep=True) [source] Get parameters for this estimator. Parameters: deep: bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators. Returns: params: dict Dictionary of the parameter names mapped to their values. 
method log (msg, level=0) [source] Write a message to the logger and print it to stdout. Parameters: msg: str Message to write to the logger and print to stdout. level: int, optional (default=0) Minimum verbosity level in order to print the message. method run (*arrays) [source] Fit and evaluate the models. Parameters: *arrays: array-like Either a train and test set or X_train, X_test, y_train, y_test. method save (filename=None, save_data=True) [source] Save the instance to a pickle file. Remember that the class contains the complete dataset as property, so the file can become large for big datasets! To avoid this, use save_data=False . Parameters: filename: str or None, optional (default=None) Name to save the file with. If None or 'auto', use the name of the class. save_data: bool, optional (default=True) Whether to save the data as an attribute of the instance. If False, remember to update the data immediately after loading the pickle using the dataset's @setter . method scoring (metric=None, dataset='test') [source] Print the scoring of the models for a specific metric. If a model shows a XXX , it means the metric failed for that specific model. This can happen if either the metric is unavailable for the task or if the model does not have a predict_proba method while the metric requires it. Parameters: metric: str or None, optional (default=None) Name of the metric to calculate. Choose from any of sklearn's SCORERS . If None, returns the final results for the model (ignores the dataset parameter). dataset: str, optional (default='test') Data set on which to calculate the metric. Options are 'train' or 'test'. method set_params (**params) [source] Set the parameters of this estimator. Parameters: **params: dict Estimator parameters. Returns: self: TrainerRegressor Estimator instance. 
Example from atom.training import SuccessiveHalvingRegressor # Run the pipeline trainer = SuccessiveHalvingRegressor(['Tree', 'Bag', 'RF', 'ET'], n_calls=5, n_initial_points=3) trainer.run(train, test) # Analyze the results trainer.plot_successive_halving()","title":"SuccessiveHalvingClassifier"},{"location":"API/training/successivehalvingregressor/#successivehalvingregressor","text":"class atom.training. SuccessiveHalvingRegressor (models, metric=None, greater_is_better=True, needs_proba=False, needs_threshold=False, skip_iter=0, n_calls=10, n_random_points=5, bo_params={}, bagging=None, n_jobs=1, verbose=0, logger=None, random_state=None) [source] Fit and evaluate the models in a successive halving fashion. The pipeline applies the following steps per iteration: The optimal hyperparameters are selected using a bayesian optimization algorithm. The model is fitted on the complete training set using the best combinations of hyperparameters found. Using a bagging algorithm, various scores on the test set are calculated. Just like atom , you can predict , plot and call any model from the SuccessiveHalvingRegressor instance. Read more in the user guide . Parameters: models: str or sequence List of models to fit on the data. Use the predefined acronyms to select the models. 
Possible values are (case insensitive): 'GP' for Gaussian Process (no hyperparameter tuning) 'OLS' for Ordinary Least Squares (no hyperparameter tuning) 'Ridge' for Ridge Linear Regression 'Lasso' for Lasso Linear Regression 'EN' for ElasticNet Linear Regression 'BR' for Bayesian Regression (uses ridge regularization) 'KNN' for K-Nearest Neighbors 'Tree' for a single Decision Tree 'Bag' for Bagging (uses a decision tree as base estimator) 'ET' for Extra-Trees 'RF' for Random Forest 'AdaB' for AdaBoost (uses a decision tree as base estimator) 'GBM' for Gradient Boosting Machine 'XGB' for XGBoost (only available if package is installed) 'LGB' for LightGBM (only available if package is installed) 'CatB' for CatBoost (only available if package is installed) 'lSVM' for Linear Support Vector Machine (uses a one-vs-rest strategy for multiclass classification) 'kSVM' for Kernel Support Vector Machine (uses a one-vs-one strategy for multiclass classification) 'PA' for Passive Aggressive 'SGD' for Stochastic Gradient Descent 'MLP' for Multilayer Perceptron (can have between one and three hidden layers) metric: str, callable or sequence, optional (default=None) Metric(s) on which the pipeline fits the models. Choose from any of sklearn's predefined scorers , use a score (or loss) function with signature metric(y, y_pred, **kwargs) or use a scorer object. If multiple metrics are selected, only the first will be used to optimize the BO. If None, a default metric is selected: 'f1' for binary classification 'f1_weighted' for multiclass classification 'r2' for regression greater_is_better: bool or sequence, optional (default=True) Whether the metric is a score function or a loss function, i.e. if True, a higher score is better and if False, lower is better. Will be ignored if the metric is a string or a scorer. If sequence, the n-th value will apply to the n-th metric in the pipeline. 
needs_proba: bool or sequence, optional (default=False) Whether the metric function requires probability estimates out of a classifier. If True, make sure that every model in the pipeline has a predict_proba method. Will be ignored if the metric is a string or a scorer. If sequence, the n-th value will apply to the n-th metric in the pipeline. needs_threshold: bool or sequence, optional (default=False) Whether the metric function takes a continuous decision certainty. This only works for binary classification using estimators that have either a decision_function or predict_proba method. Will be ignored if the metric is a string or a scorer. If sequence, the n-th value will apply to the n-th metric in the pipeline. skip_iter: int, optional (default=0) Skip last skip_iter iterations of the successive halving. n_calls: int or sequence, optional (default=0) Maximum number of iterations of the BO (including n_initial_points ). If 0, skip the BO and fit the model on its default parameters. If sequence, the n-th value will apply to the n-th model in the pipeline. n_initial_points: int or sequence, optional (default=5) Initial number of random tests of the BO before fitting the surrogate function. If equal to n_calls , the optimizer will technically be performing a random search. If sequence, the n-th value will apply to the n-th model in the pipeline. bo_params: dict, optional (default={}) Dictionary of extra keyword arguments for the BO. These can include: base_estimator: str, optional (default='GP') Base estimator to use in the BO. Choose from: 'GP' for Gaussian Process 'RF' for Random Forest 'ET' for Extra-Trees 'GBRT' for Gradient Boosted Regression Trees max_time: int, optional (default=np.inf) Stop the optimization after max_time seconds. delta_x: int or float, optional (default=0) Stop the optimization when |x1 - x2| < delta_x . delta_y: int or float, optional (default=0) Stop the optimization if the 5 minima are within delta_y . 
cv: int, optional (default=5) Number of folds for the cross-validation. If 1, the training set will be randomly split in a subtrain and validation set. early stopping: int, float or None, optional (default=None) Training will stop if the model didn't improve in last early_stopping rounds. If <1, fraction of rounds from the total. If None, no early stopping is performed. Only available for models that allow in-training evaluation. callback: callable or list of callables, optional (default=None) Callbacks for the BO. dimensions: dict, array or None, optional (default=None) Custom hyperparameter space for the bayesian optimization. Can be an array (only if there is 1 model in the pipeline) or a dictionary with the model's name as key. If None, ATOM's predefined dimensions are used. plot_bo: bool, optional (default=False) Whether to plot the BO's progress as it runs. Creates a canvas with two plots: the first plot shows the score of every trial and the second shows the distance between the last consecutive steps. Don't forget to call %matplotlib at the start of the cell if you are using an interactive notebook! Additional keyword argument for skopt's optimizer. bagging: int or None, optional (default=None) Number of data sets (bootstrapped from the training set) to use in the bagging algorithm. If None or 0, no bagging is performed. If sequence, the n-th value will apply to the n-th model in the pipeline. n_jobs: int, optional (default=1) Number of cores to use for parallel processing. If >0: Number of cores to use. If -1: Use all available cores. If <-1: Use available_cores - 1 + n_jobs. Beware that using multiple processes on the same machine may cause memory issues for large datasets. verbose: int, optional (default=0) Verbosity level of the class. Possible values are: 0 to not print anything. 1 to print basic information. 2 to print detailed information. logger: bool, str, class or None, optional (default=None) If None: Doesn't save a logging file. 
If bool: True for logging file with default name. False for no logger. If str: Name of the logging file. 'auto' to create an automatic name. If class: python Logger object. random_state: int or None, optional (default=None) Seed used by the random number generator. If None, the random number generator is the RandomState instance used by numpy.random .","title":"SuccessiveHalvingRegressor"},{"location":"API/training/successivehalvingregressor/#attributes","text":"","title":"Attributes"},{"location":"API/training/successivehalvingregressor/#data-attributes","text":"The instance's dataset can be accessed through multiple attributes, e.g. calling trainer.train will return the training set. The data can also be changed through these attributes, e.g. trainer.test = trainer.test.drop(0) will drop the first row from the test set. Doing this will automatically update the other data attributes. Attributes: dataset: pd.DataFrame Complete dataset in the pipeline. train: pd.DataFrame Training set. test: pd.DataFrame Test set. X: pd.DataFrame Feature set. y: pd.Series Target column. X_train: pd.DataFrame Training features. y_train: pd.Series Training target. X_test: pd.DataFrame Test features. y_test: pd.Series Test target. shape: tuple Dataset's shape in the form (rows x columns). columns: list List of columns in the dataset. target: str Name of the target column.","title":"Data attributes"},{"location":"API/training/successivehalvingregressor/#utility-attributes","text":"Attributes: models: list List of models in the pipeline. metric: str or list Metric(s) used to fit the models. errors: dict Dictionary of the encountered exceptions (if any). winner: model Model subclass that performed best on the test set. results: pd.DataFrame Dataframe of the training results with the model acronyms as index. Columns can include: name: Name of the model. metric_bo: Best score achieved during the BO. time_bo: Time spent on the BO. metric_train: Metric score on the training set. 
metric_test: Metric score on the test set. time_fit: Time spent fitting and evaluating. mean_bagging: Mean score of the bagging's results. std_bagging: Standard deviation score of the bagging's results. time_bagging: Time spent on the bagging algorithm. time: Total time spent on the whole run.","title":"Utility attributes"},{"location":"API/training/successivehalvingregressor/#plot-attributes","text":"Attributes: style: str Plotting style. See seaborn's documentation . palette: str Color palette. See seaborn's documentation . title_fontsize: int Fontsize for the plot's title. label_fontsize: int Fontsize for labels and legends. tick_fontsize: int Fontsize for the ticks along the plot's axes.","title":"Plot attributes"},{"location":"API/training/successivehalvingregressor/#methods","text":"clear Remove a model from the pipeline. get_params Get parameters for this estimator. log Save information to the logger and print to stdout. run Fit and evaluate the models. save Save the instance to a pickle file. scoring Print the scoring of the models for a specific metric. set_params Set the parameters of this estimator. method clear (models='all') [source] Removes all traces of a model in the pipeline (except for the errors attribute). If all models in the pipeline are removed, the metric is reset. Use this method to remove unwanted models from the pipeline or to clear memory before saving the instance. Parameters: models: str or sequence, optional (default='all') Model(s) to clear from the pipeline. If 'all', clear all models. method get_params (deep=True) [source] Get parameters for this estimator. Parameters: deep: bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators. Returns: params: dict Dictionary of the parameter names mapped to their values. method log (msg, level=0) [source] Write a message to the logger and print it to stdout. Parameters: msg: str Message to write to the logger and print to stdout. 
level: int, optional (default=0) Minimum verbosity level in order to print the message. method run (*arrays) [source] Fit and evaluate the models. Parameters: *arrays: array-like Either a train and test set or X_train, X_test, y_train, y_test. method save (filename=None, save_data=True) [source] Save the instance to a pickle file. Remember that the class contains the complete dataset as property, so the file can become large for big datasets! To avoid this, use save_data=False . Parameters: filename: str or None, optional (default=None) Name to save the file with. If None or 'auto', use the name of the class. save_data: bool, optional (default=True) Whether to save the data as an attribute of the instance. If False, remember to update the data immediately after loading the pickle using the dataset's @setter . method scoring (metric=None, dataset='test') [source] Print the scoring of the models for a specific metric. If a model shows a XXX , it means the metric failed for that specific model. This can happen if either the metric is unavailable for the task or if the model does not have a predict_proba method while the metric requires it. Parameters: metric: str or None, optional (default=None) Name of the metric to calculate. Choose from any of sklearn's SCORERS . If None, returns the final results for the model (ignores the dataset parameter). dataset: str, optional (default='test') Data set on which to calculate the metric. Options are 'train' or 'test'. method set_params (**params) [source] Set the parameters of this estimator. Parameters: **params: dict Estimator parameters. 
Returns: self: SuccessiveHalvingRegressor Estimator instance.","title":"Methods"},{"location":"API/training/successivehalvingregressor/#example","text":"from atom.training import SuccessiveHalvingRegressor # Run the pipeline trainer = SuccessiveHalvingRegressor(['Tree', 'Bag', 'RF', 'ET'], n_calls=5, n_initial_points=3) trainer.run(train, test) # Analyze the results trainer.plot_successive_halving()","title":"Example"},{"location":"API/training/trainerclassifier/","text":"TrainerClassifier class atom.training. TrainerClassifier (models, metric=None, greater_is_better=True, needs_proba=False, needs_threshold=False, n_calls=10, n_initial_points=5, bo_params={}, bagging=None, n_jobs=1, verbose=0, logger=None, random_state=None) [source] Fits and evaluates the models on the data in the pipeline. The following steps are applied: The optimal hyperparameters are selected using a bayesian optimization algorithm. The model is fitted on the training set using the best combinations of hyperparameters found. Using a bagging algorithm, various scores on the test set are calculated. Just like atom , you can predict , plot and call any model from the TrainerClassifier instance. Read more in the user guide . Parameters: models: str or sequence Models to fit on the data. Use the predefined acronyms to select the models.
Possible values are (case insensitive): 'GP' for Gaussian Process (no hyperparameter tuning) 'GNB' for Gaussian Naive Bayes (no hyperparameter tuning) 'MNB' for Multinomial Naive Bayes 'BNB' for Bernoulli Naive Bayes 'Ridge' for Ridge Linear Classification 'LR' for Logistic Regression 'LDA' for Linear Discriminant Analysis 'QDA' for Quadratic Discriminant Analysis 'KNN' for K-Nearest Neighbors 'Tree' for a single Decision Tree 'Bag' for Bagging (uses a decision tree as base estimator) 'ET' for Extra-Trees 'RF' for Random Forest 'AdaB' for AdaBoost (uses a decision tree as base estimator) 'GBM' for Gradient Boosting Machine 'XGB' for XGBoost (only available if package is installed) 'LGB' for LightGBM (only available if package is installed) 'CatB' for CatBoost (only available if package is installed) 'lSVM' for Linear Support Vector Machine (uses a one-vs-rest strategy for multiclass classification) 'kSVM' for Kernel Support Vector Machine (uses a one-vs-one strategy for multiclass classification) 'PA' for Passive Aggressive 'SGD' for Stochastic Gradient Descent 'MLP' for Multilayer Perceptron (can have between one and three hidden layers) metric: str, callable or sequence, optional (default=None) Metric(s) on which the pipeline fits the models. Choose from any of sklearn's predefined scorers , use a score (or loss) function with signature metric(y, y_pred, **kwargs) or use a scorer object. If multiple metrics are selected, only the first will be used to optimize the BO. If None, a default metric is selected: 'f1' for binary classification 'f1_weighted' for multiclass classification 'r2' for regression greater_is_better: bool or sequence, optional (default=True) Whether the metric is a score function or a loss function, i.e. if True, a higher score is better and if False, lower is better. Will be ignored if the metric is a string or a scorer. If sequence, the n-th value will apply to the n-th metric in the pipeline. 
needs_proba: bool or sequence, optional (default=False) Whether the metric function requires probability estimates out of a classifier. If True, make sure that every model in the pipeline has a predict_proba method. Will be ignored if the metric is a string or a scorer. If sequence, the n-th value will apply to the n-th metric in the pipeline. needs_threshold: bool or sequence, optional (default=False) Whether the metric function takes a continuous decision certainty. This only works for binary classification using estimators that have either a decision_function or predict_proba method. Will be ignored if the metric is a string or a scorer. If sequence, the n-th value will apply to the n-th metric in the pipeline. n_calls: int or sequence, optional (default=0) Maximum number of iterations of the BO (including n_initial_points ). If 0, skip the BO and fit the model on its default parameters. If sequence, the n-th value will apply to the n-th model in the pipeline. n_initial_points: int or sequence, optional (default=5) Initial number of random tests of the BO before fitting the surrogate function. If equal to n_calls , the optimizer will technically be performing a random search. If sequence, the n-th value will apply to the n-th model in the pipeline. bo_params: dict, optional (default={}) Dictionary of extra keyword arguments for the BO. These can include: base_estimator: str, optional (default='GP') Base estimator to use in the BO. Choose from: 'GP' for Gaussian Process 'RF' for Random Forest 'ET' for Extra-Trees 'GBRT' for Gradient Boosted Regression Trees max_time: int, optional (default=np.inf) Stop the optimization after max_time seconds. delta_x: int or float, optional (default=0) Stop the optimization when |x1 - x2| < delta_x . delta_y: int or float, optional (default=0) Stop the optimization if the 5 minima are within delta_y . cv: int, optional (default=5) Number of folds for the cross-validation. 
If 1, the training set will be randomly split in a subtrain and validation set. early_stopping: int, float or None, optional (default=None) Training will stop if the model didn't improve in the last early_stopping rounds. If <1, fraction of rounds from the total. If None, no early stopping is performed. Only available for models that allow in-training evaluation. callback: callable or list of callables, optional (default=None) Callbacks for the BO. dimensions: dict, array or None, optional (default=None) Custom hyperparameter space for the bayesian optimization. Can be an array (only if there is 1 model in the pipeline) or a dictionary with the model's name as key. If None, ATOM's predefined dimensions are used. plot_bo: bool, optional (default=False) Whether to plot the BO's progress as it runs. Creates a canvas with two plots: the first plot shows the score of every trial and the second shows the distance between the last consecutive steps. Don't forget to call %matplotlib at the start of the cell if you are using an interactive notebook! Additional keyword arguments for skopt's optimizer can also be included. bagging: int, sequence or None, optional (default=None) Number of data sets (bootstrapped from the training set) to use in the bagging algorithm. If None or 0, no bagging is performed. If sequence, the n-th value will apply to the n-th model in the pipeline. n_jobs: int, optional (default=1) Number of cores to use for parallel processing. If >0: Number of cores to use. If -1: Use all available cores. If <-1: Use available_cores - 1 + n_jobs. Beware that using multiple processes on the same machine may cause memory issues for large datasets. verbose: int, optional (default=0) Verbosity level of the class. Possible values are: 0 to not print anything. 1 to print basic information. 2 to print detailed information. logger: bool, str, class or None, optional (default=None) If None: Doesn't save a logging file. If bool: True for logging file with default name. False for no logger.
If str: Name of the logging file. 'auto' to create an automatic name. If class: python Logger object. random_state: int or None, optional (default=None) Seed used by the random number generator. If None, the random number generator is the RandomState instance used by numpy.random . Attributes Data attributes The instance's dataset can be accessed through multiple attributes, e.g. calling trainer.train will return the training set. The data can also be changed through these attributes, e.g. trainer.test = trainer.test.drop(0) will drop the first row from the test set. Doing this will automatically update the other data attributes. Attributes: dataset: pd.DataFrame Complete dataset in the pipeline. train: pd.DataFrame Training set. test: pd.DataFrame Test set. X: pd.DataFrame Feature set. y: pd.Series Target column. X_train: pd.DataFrame Training features. y_train: pd.Series Training target. X_test: pd.DataFrame Test features. y_test: pd.Series Test target. shape: tuple Dataset's shape in the form (rows x columns). columns: list List of columns in the dataset. target: str Name of the target column. categories: list Sorted list of the unique categories in the target column. n_categories: int Number of unique categories in the target column. Utility attributes Attributes: models: list List of models in the pipeline. metric: str or list Metric(s) used to fit the models. errors: dict Dictionary of the encountered exceptions (if any). winner: model Model subclass that performed best on the test set. results: pd.DataFrame Dataframe of the training results with the model acronyms as index. Columns can include: name: Name of the model. metric_bo: Best score achieved during the BO. time_bo: Time spent on the BO. metric_train: Metric score on the training set. metric_test: Metric score on the test set. time_fit: Time spent fitting and evaluating. mean_bagging: Mean score of the bagging's results. std_bagging: Standard deviation score of the bagging's results. 
time_bagging: Time spent on the bagging algorithm. time: Total time spent on the whole run. Plot attributes Attributes: style: str Plotting style. See seaborn's documentation . palette: str Color palette. See seaborn's documentation . title_fontsize: int Fontsize for the plot's title. label_fontsize: int Fontsize for labels and legends. tick_fontsize: int Fontsize for the ticks along the plot's axes. Methods calibrate Calibrate the winning model. clear Remove a model from the pipeline. get_params Get parameters for this estimator. log Save information to the logger and print to stdout. run Fit and evaluate the models. save Save the instance to a pickle file. scoring Print the scoring of the models for a specific metric. set_params Set the parameters of this estimator. method calibrate (**kwargs) [source] Applies probability calibration on the winning model. The calibration is done with the CalibratedClassifierCV class from sklearn. The model will be trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the estimator attribute. After calibrating, all prediction attributes of the winning model will reset. Parameters: **kwargs Additional keyword arguments for the CalibratedClassifierCV instance. Using cv='prefit' will use the trained model and fit the calibrator on the test set. Note that doing this will result in data leakage in the test set. Use this only if you have another, independent set for testing. method clear (models='all') [source] Removes all traces of a model in the pipeline (except for the errors attribute). If all models in the pipeline are removed, the metric is reset. Use this method to remove unwanted models from the pipeline or to clear memory before saving the instance. Parameters: models: str or sequence, optional (default='all') Model(s) to clear from the pipeline. If 'all', clear all models. method get_params (deep=True) [source] Get parameters for this estimator. 
Parameters: deep: bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators. Returns: params: dict Dictionary of the parameter names mapped to their values. method log (msg, level=0) [source] Write a message to the logger and print it to stdout. Parameters: msg: str Message to write to the logger and print to stdout. level: int, optional (default=0) Minimum verbosity level in order to print the message. method run (*arrays) [source] Fit and evaluate the models. Parameters: *arrays: array-like Either a train and test set or X_train, X_test, y_train, y_test. method save (filename=None, save_data=True) [source] Save the instance to a pickle file. Remember that the class contains the complete dataset as property, so the file can become large for big datasets! To avoid this, use save_data=False . Parameters: filename: str or None, optional (default=None) Name to save the file with. If None or 'auto', use the name of the class. save_data: bool, optional (default=True) Whether to save the data as an attribute of the instance. If False, remember to update the data immediately after loading the pickle using the dataset's @setter . method scoring (metric=None, dataset='test') [source] Print the scoring of the models for a specific metric. If a model shows a XXX , it means the metric failed for that specific model. This can happen if either the metric is unavailable for the task or if the model does not have a predict_proba method while the metric requires it. Parameters: metric: str or None, optional (default=None) Name of the metric to calculate. Choose from any of sklearn's SCORERS or one of the following custom metrics: 'cm' or 'confusion_matrix' for an array of the confusion matrix. 'tn' for true negatives. 'fp' for false positives. 'fn' for false negatives. 'lift' for the lift metric. 'fpr' for the false positive rate. 'tpr' for true positive rate. 'sup' for the support metric. 
If None, returns the final results for the model (ignores the dataset parameter). dataset: str, optional (default='test') Data set on which to calculate the metric. Options are 'train' or 'test'. method set_params (**params) [source] Set the parameters of this estimator. Parameters: **params: dict Estimator parameters. Returns: self: TrainerClassifier Estimator instance. Example from atom.training import TrainerClassifier # Run the pipeline trainer = TrainerClassifier(['Tree', 'RF'], n_calls=5, n_initial_points=3) trainer.run(train, test) # Analyze the results trainer.scoring('auc') trainer.Tree.plot_bo()","title":"TrainerClassifier"},{"location":"API/training/trainerclassifier/#trainerclassifier","text":"class atom.training. TrainerClassifier (models, metric=None, greater_is_better=True, needs_proba=False, needs_threshold=False, n_calls=10, n_initial_points=5, bo_params={}, bagging=None, n_jobs=1, verbose=0, logger=None, random_state=None) [source] Fits and evaluates the models on the data in the pipeline. The following steps are applied: The optimal hyperparameters are selected using a bayesian optimization algorithm. The model is fitted on the training set using the best combinations of hyperparameters found. Using a bagging algorithm, various scores on the test set are calculated. Just like atom , you can predict , plot and call any model from the TrainerClassifier instance. Read more in the user guide . Parameters: models: str or sequence Models to fit on the data. Use the predefined acronyms to select the models.
Possible values are (case insensitive): 'GP' for Gaussian Process (no hyperparameter tuning) 'GNB' for Gaussian Naive Bayes (no hyperparameter tuning) 'MNB' for Multinomial Naive Bayes 'BNB' for Bernoulli Naive Bayes 'Ridge' for Ridge Linear Classification 'LR' for Logistic Regression 'LDA' for Linear Discriminant Analysis 'QDA' for Quadratic Discriminant Analysis 'KNN' for K-Nearest Neighbors 'Tree' for a single Decision Tree 'Bag' for Bagging (uses a decision tree as base estimator) 'ET' for Extra-Trees 'RF' for Random Forest 'AdaB' for AdaBoost (uses a decision tree as base estimator) 'GBM' for Gradient Boosting Machine 'XGB' for XGBoost (only available if package is installed) 'LGB' for LightGBM (only available if package is installed) 'CatB' for CatBoost (only available if package is installed) 'lSVM' for Linear Support Vector Machine (uses a one-vs-rest strategy for multiclass classification) 'kSVM' for Kernel Support Vector Machine (uses a one-vs-one strategy for multiclass classification) 'PA' for Passive Aggressive 'SGD' for Stochastic Gradient Descent 'MLP' for Multilayer Perceptron (can have between one and three hidden layers) metric: str, callable or sequence, optional (default=None) Metric(s) on which the pipeline fits the models. Choose from any of sklearn's predefined scorers , use a score (or loss) function with signature metric(y, y_pred, **kwargs) or use a scorer object. If multiple metrics are selected, only the first will be used to optimize the BO. If None, a default metric is selected: 'f1' for binary classification 'f1_weighted' for multiclass classification 'r2' for regression greater_is_better: bool or sequence, optional (default=True) Whether the metric is a score function or a loss function, i.e. if True, a higher score is better and if False, lower is better. Will be ignored if the metric is a string or a scorer. If sequence, the n-th value will apply to the n-th metric in the pipeline. 
needs_proba: bool or sequence, optional (default=False) Whether the metric function requires probability estimates out of a classifier. If True, make sure that every model in the pipeline has a predict_proba method. Will be ignored if the metric is a string or a scorer. If sequence, the n-th value will apply to the n-th metric in the pipeline. needs_threshold: bool or sequence, optional (default=False) Whether the metric function takes a continuous decision certainty. This only works for binary classification using estimators that have either a decision_function or predict_proba method. Will be ignored if the metric is a string or a scorer. If sequence, the n-th value will apply to the n-th metric in the pipeline. n_calls: int or sequence, optional (default=0) Maximum number of iterations of the BO (including n_initial_points ). If 0, skip the BO and fit the model on its default parameters. If sequence, the n-th value will apply to the n-th model in the pipeline. n_initial_points: int or sequence, optional (default=5) Initial number of random tests of the BO before fitting the surrogate function. If equal to n_calls , the optimizer will technically be performing a random search. If sequence, the n-th value will apply to the n-th model in the pipeline. bo_params: dict, optional (default={}) Dictionary of extra keyword arguments for the BO. These can include: base_estimator: str, optional (default='GP') Base estimator to use in the BO. Choose from: 'GP' for Gaussian Process 'RF' for Random Forest 'ET' for Extra-Trees 'GBRT' for Gradient Boosted Regression Trees max_time: int, optional (default=np.inf) Stop the optimization after max_time seconds. delta_x: int or float, optional (default=0) Stop the optimization when |x1 - x2| < delta_x . delta_y: int or float, optional (default=0) Stop the optimization if the 5 minima are within delta_y . cv: int, optional (default=5) Number of folds for the cross-validation. 
If 1, the training set will be randomly split in a subtrain and validation set. early_stopping: int, float or None, optional (default=None) Training will stop if the model didn't improve in the last early_stopping rounds. If <1, fraction of rounds from the total. If None, no early stopping is performed. Only available for models that allow in-training evaluation. callback: callable or list of callables, optional (default=None) Callbacks for the BO. dimensions: dict, array or None, optional (default=None) Custom hyperparameter space for the bayesian optimization. Can be an array (only if there is 1 model in the pipeline) or a dictionary with the model's name as key. If None, ATOM's predefined dimensions are used. plot_bo: bool, optional (default=False) Whether to plot the BO's progress as it runs. Creates a canvas with two plots: the first plot shows the score of every trial and the second shows the distance between the last consecutive steps. Don't forget to call %matplotlib at the start of the cell if you are using an interactive notebook! Additional keyword arguments for skopt's optimizer can also be included. bagging: int, sequence or None, optional (default=None) Number of data sets (bootstrapped from the training set) to use in the bagging algorithm. If None or 0, no bagging is performed. If sequence, the n-th value will apply to the n-th model in the pipeline. n_jobs: int, optional (default=1) Number of cores to use for parallel processing. If >0: Number of cores to use. If -1: Use all available cores. If <-1: Use available_cores - 1 + n_jobs. Beware that using multiple processes on the same machine may cause memory issues for large datasets. verbose: int, optional (default=0) Verbosity level of the class. Possible values are: 0 to not print anything. 1 to print basic information. 2 to print detailed information. logger: bool, str, class or None, optional (default=None) If None: Doesn't save a logging file. If bool: True for logging file with default name. False for no logger.
If str: Name of the logging file. 'auto' to create an automatic name. If class: python Logger object. random_state: int or None, optional (default=None) Seed used by the random number generator. If None, the random number generator is the RandomState instance used by numpy.random .","title":"TrainerClassifier"},{"location":"API/training/trainerclassifier/#attributes","text":"","title":"Attributes"},{"location":"API/training/trainerclassifier/#data-attributes","text":"The instance's dataset can be accessed through multiple attributes, e.g. calling trainer.train will return the training set. The data can also be changed through these attributes, e.g. trainer.test = trainer.test.drop(0) will drop the first row from the test set. Doing this will automatically update the other data attributes. Attributes: dataset: pd.DataFrame Complete dataset in the pipeline. train: pd.DataFrame Training set. test: pd.DataFrame Test set. X: pd.DataFrame Feature set. y: pd.Series Target column. X_train: pd.DataFrame Training features. y_train: pd.Series Training target. X_test: pd.DataFrame Test features. y_test: pd.Series Test target. shape: tuple Dataset's shape in the form (rows x columns). columns: list List of columns in the dataset. target: str Name of the target column. categories: list Sorted list of the unique categories in the target column. n_categories: int Number of unique categories in the target column.","title":"Data attributes"},{"location":"API/training/trainerclassifier/#utility-attributes","text":"Attributes: models: list List of models in the pipeline. metric: str or list Metric(s) used to fit the models. errors: dict Dictionary of the encountered exceptions (if any). winner: model Model subclass that performed best on the test set. results: pd.DataFrame Dataframe of the training results with the model acronyms as index. Columns can include: name: Name of the model. metric_bo: Best score achieved during the BO. time_bo: Time spent on the BO. 
metric_train: Metric score on the training set. metric_test: Metric score on the test set. time_fit: Time spent fitting and evaluating. mean_bagging: Mean score of the bagging's results. std_bagging: Standard deviation score of the bagging's results. time_bagging: Time spent on the bagging algorithm. time: Total time spent on the whole run.","title":"Utility attributes"},{"location":"API/training/trainerclassifier/#plot-attributes","text":"Attributes: style: str Plotting style. See seaborn's documentation . palette: str Color palette. See seaborn's documentation . title_fontsize: int Fontsize for the plot's title. label_fontsize: int Fontsize for labels and legends. tick_fontsize: int Fontsize for the ticks along the plot's axes.","title":"Plot attributes"},{"location":"API/training/trainerclassifier/#methods","text":"calibrate Calibrate the winning model. clear Remove a model from the pipeline. get_params Get parameters for this estimator. log Save information to the logger and print to stdout. run Fit and evaluate the models. save Save the instance to a pickle file. scoring Print the scoring of the models for a specific metric. set_params Set the parameters of this estimator. method calibrate (**kwargs) [source] Applies probability calibration on the winning model. The calibration is done with the CalibratedClassifierCV class from sklearn. The model will be trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the estimator attribute. After calibrating, all prediction attributes of the winning model will reset. Parameters: **kwargs Additional keyword arguments for the CalibratedClassifierCV instance. Using cv='prefit' will use the trained model and fit the calibrator on the test set. Note that doing this will result in data leakage in the test set. Use this only if you have another, independent set for testing. 
method clear (models='all') [source] Removes all traces of a model in the pipeline (except for the errors attribute). If all models in the pipeline are removed, the metric is reset. Use this method to remove unwanted models from the pipeline or to clear memory before saving the instance. Parameters: models: str or sequence, optional (default='all') Model(s) to clear from the pipeline. If 'all', clear all models. method get_params (deep=True) [source] Get parameters for this estimator. Parameters: deep: bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators. Returns: params: dict Dictionary of the parameter names mapped to their values. method log (msg, level=0) [source] Write a message to the logger and print it to stdout. Parameters: msg: str Message to write to the logger and print to stdout. level: int, optional (default=0) Minimum verbosity level in order to print the message. method run (*arrays) [source] Fit and evaluate the models. Parameters: *arrays: array-like Either a train and test set or X_train, X_test, y_train, y_test. method save (filename=None, save_data=True) [source] Save the instance to a pickle file. Remember that the class contains the complete dataset as property, so the file can become large for big datasets! To avoid this, use save_data=False . Parameters: filename: str or None, optional (default=None) Name to save the file with. If None or 'auto', use the name of the class. save_data: bool, optional (default=True) Whether to save the data as an attribute of the instance. If False, remember to update the data immediately after loading the pickle using the dataset's @setter . method scoring (metric=None, dataset='test') [source] Print the scoring of the models for a specific metric. If a model shows a XXX , it means the metric failed for that specific model. 
This can happen if either the metric is unavailable for the task or if the model does not have a predict_proba method while the metric requires it. Parameters: metric: str or None, optional (default=None) Name of the metric to calculate. Choose from any of sklearn's SCORERS or one of the following custom metrics: 'cm' or 'confusion_matrix' for an array of the confusion matrix. 'tn' for true negatives. 'fp' for false positives. 'fn' for false negatives. 'lift' for the lift metric. 'fpr' for the false positive rate. 'tpr' for the true positive rate. 'sup' for the support metric. If None, returns the final results for the model (ignores the dataset parameter). dataset: str, optional (default='test') Data set on which to calculate the metric. Options are 'train' or 'test'. method set_params (**params) [source] Set the parameters of this estimator. Parameters: **params: dict Estimator parameters. Returns: self: TrainerClassifier Estimator instance.","title":"Methods"},{"location":"API/training/trainerclassifier/#example","text":"from atom.training import TrainerClassifier # Run the pipeline trainer = TrainerClassifier(['Tree', 'RF'], n_calls=5, n_initial_points=3) trainer.run(train, test) # Analyze the results trainer.scoring('auc') trainer.Tree.plot_bo()","title":"Example"},{"location":"API/training/trainerregressor/","text":"TrainerRegressor class atom.training. TrainerRegressor (models, metric=None, greater_is_better=True, needs_proba=False, needs_threshold=False, n_calls=10, n_initial_points=5, bo_params={}, bagging=None, n_jobs=1, verbose=0, logger=None, random_state=None) [source] Fits and evaluates the models on the data in the pipeline. The following steps are applied: The optimal hyperparameters are selected using a bayesian optimization algorithm. The model is fitted on the training set using the best combinations of hyperparameters found. Using a bagging algorithm, various scores on the test set are calculated.
Just like atom , you can predict , plot and call any model from the TrainerRegressor instance. Read more in the user guide . Parameters: models: str or sequence List of models to fit on the data. Use the predefined acronyms to select the models. Possible values are (case insensitive): 'GP' for Gaussian Process (no hyperparameter tuning) 'OLS' for Ordinary Least Squares (no hyperparameter tuning) 'Ridge' for Ridge Linear Regression 'Lasso' for Lasso Linear Regression 'EN' for ElasticNet Linear Regression 'BR' for Bayesian Regression (uses ridge regularization) 'KNN' for K-Nearest Neighbors 'Tree' for a single Decision Tree 'Bag' for Bagging (uses a decision tree as base estimator) 'ET' for Extra-Trees 'RF' for Random Forest 'AdaB' for AdaBoost (uses a decision tree as base estimator) 'GBM' for Gradient Boosting Machine 'XGB' for XGBoost (only available if package is installed) 'LGB' for LightGBM (only available if package is installed) 'CatB' for CatBoost (only available if package is installed) 'lSVM' for Linear Support Vector Machine (uses a one-vs-rest strategy for multiclass classification) 'kSVM' for Kernel Support Vector Machine (uses a one-vs-one strategy for multiclass classification) 'PA' for Passive Aggressive 'SGD' for Stochastic Gradient Descent 'MLP' for Multilayer Perceptron (can have between one and three hidden layers) metric: str, callable or sequence, optional (default=None) Metric(s) on which the pipeline fits the models. Choose from any of sklearn's predefined scorers , use a score (or loss) function with signature metric(y, y_pred, **kwargs) or use a scorer object. If multiple metrics are selected, only the first will be used to optimize the BO. If None, a default metric is selected: 'f1' for binary classification 'f1_weighted' for multiclass classification 'r2' for regression greater_is_better: bool or sequence, optional (default=True) Whether the metric is a score function or a loss function, i.e. 
if True, a higher score is better and if False, lower is better. Will be ignored if the metric is a string or a scorer. If sequence, the n-th value will apply to the n-th metric in the pipeline. needs_proba: bool or sequence, optional (default=False) Whether the metric function requires probability estimates out of a classifier. If True, make sure that every model in the pipeline has a predict_proba method. Will be ignored if the metric is a string or a scorer. If sequence, the n-th value will apply to the n-th metric in the pipeline. needs_threshold: bool or sequence, optional (default=False) Whether the metric function takes a continuous decision certainty. This only works for binary classification using estimators that have either a decision_function or predict_proba method. Will be ignored if the metric is a string or a scorer. If sequence, the n-th value will apply to the n-th metric in the pipeline. n_calls: int or sequence, optional (default=0) Maximum number of iterations of the BO (including n_initial_points ). If 0, skip the BO and fit the model on its default parameters. If sequence, the n-th value will apply to the n-th model in the pipeline. n_initial_points: int or sequence, optional (default=5) Initial number of random tests of the BO before fitting the surrogate function. If equal to n_calls , the optimizer will technically be performing a random search. If sequence, the n-th value will apply to the n-th model in the pipeline. bo_params: dict, optional (default={}) Dictionary of extra keyword arguments for the BO. These can include: base_estimator: str, optional (default='GP') Base estimator to use in the BO. Choose from: 'GP' for Gaussian Process 'RF' for Random Forest 'ET' for Extra-Trees 'GBRT' for Gradient Boosted Regression Trees max_time: int, optional (default=np.inf) Stop the optimization after max_time seconds. delta_x: int or float, optional (default=0) Stop the optimization when |x1 - x2| < delta_x . 
delta_y: int or float, optional (default=0) Stop the optimization if the 5 minima are within delta_y . cv: int, optional (default=5) Number of folds for the cross-validation. If 1, the training set will be randomly split in a subtrain and validation set. early stopping: int, float or None, optional (default=None) Training will stop if the model didn't improve in last early_stopping rounds. If <1, fraction of rounds from the total. If None, no early stopping is performed. Only available for models that allow in-training evaluation. callback: callable or list of callables, optional (default=None) Callbacks for the BO. dimensions: dict, array or None, optional (default=None) Custom hyperparameter space for the bayesian optimization. Can be an array (only if there is 1 model in the pipeline) or a dictionary with the model's name as key. If None, ATOM's predefined dimensions are used. plot_bo: bool, optional (default=False) Whether to plot the BO's progress as it runs. Creates a canvas with two plots: the first plot shows the score of every trial and the second shows the distance between the last consecutive steps. Don't forget to call %matplotlib at the start of the cell if you are using an interactive notebook! Additional keyword argument for skopt's optimizer. bagging: int or None, optional (default=None) Number of data sets (bootstrapped from the training set) to use in the bagging algorithm. If None or 0, no bagging is performed. If sequence, the n-th value will apply to the n-th model in the pipeline. n_jobs: int, optional (default=1) Number of cores to use for parallel processing. If >0: Number of cores to use. If -1: Use all available cores. If <-1: Use available_cores - 1 + n_jobs. Beware that using multiple processes on the same machine may cause memory issues for large datasets. verbose: int, optional (default=0) Verbosity level of the class. Possible values are: 0 to not print anything. 1 to print basic information. 2 to print detailed information. 
logger: bool, str, class or None, optional (default=None) If None: Doesn't save a logging file. If bool: True for logging file with default name. False for no logger. If str: Name of the logging file. 'auto' to create an automatic name. If class: python Logger object. random_state: int or None, optional (default=None) Seed used by the random number generator. If None, the random number generator is the RandomState instance used by numpy.random . Attributes Data attributes The instance's dataset can be accessed through multiple attributes, e.g. calling trainer.train will return the training set. The data can also be changed through these attributes, e.g. trainer.test = trainer.test.drop(0) will drop the first row from the test set. Doing this will automatically update the other data attributes. Attributes: dataset: pd.DataFrame Complete dataset in the pipeline. train: pd.DataFrame Training set. test: pd.DataFrame Test set. X: pd.DataFrame Feature set. y: pd.Series Target column. X_train: pd.DataFrame Training features. y_train: pd.Series Training target. X_test: pd.DataFrame Test features. y_test: pd.Series Test target. shape: tuple Dataset's shape in the form (rows x columns). columns: list List of columns in the dataset. target: str Name of the target column. Utility attributes Attributes: models: list List of models in the pipeline. metric: str or list Metric(s) used to fit the models. errors: dict Dictionary of the encountered exceptions (if any). winner: model Model subclass that performed best on the test set. results: pd.DataFrame Dataframe of the training results with the model acronyms as index. Columns can include: name: Name of the model. metric_bo: Best score achieved during the BO. time_bo: Time spent on the BO. metric_train: Metric score on the training set. metric_test: Metric score on the test set. time_fit: Time spent fitting and evaluating. mean_bagging: Mean score of the bagging's results. 
std_bagging: Standard deviation score of the bagging's results. time_bagging: Time spent on the bagging algorithm. time: Total time spent on the whole run. Plot attributes Attributes: style: str Plotting style. See seaborn's documentation . palette: str Color palette. See seaborn's documentation . title_fontsize: int Fontsize for the plot's title. label_fontsize: int Fontsize for labels and legends. tick_fontsize: int Fontsize for the ticks along the plot's axes. Methods clear Remove a model from the pipeline. get_params Get parameters for this estimator. log Save information to the logger and print to stdout. run Fit and evaluate the models. save Save the instance to a pickle file. scoring Print the scoring of the models for a specific metric. set_params Set the parameters of this estimator. method clear (models='all') [source] Removes all traces of a model in the pipeline (except for the errors attribute). If all models in the pipeline are removed, the metric is reset. Use this method to remove unwanted models from the pipeline or to clear memory before saving the instance. Parameters: models: str or sequence, optional (default='all') Model(s) to clear from the pipeline. If 'all', clear all models. method get_params (deep=True) [source] Get parameters for this estimator. Parameters: deep: bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators. Returns: params: dict Dictionary of the parameter names mapped to their values. method log (msg, level=0) [source] Write a message to the logger and print it to stdout. Parameters: msg: str Message to write to the logger and print to stdout. level: int, optional (default=0) Minimum verbosity level in order to print the message. method run (*arrays) [source] Fit and evaluate the models. Parameters: *arrays: array-like Either a train and test set or X_train, X_test, y_train, y_test. 
method save (filename=None, save_data=True) [source] Save the instance to a pickle file. Remember that the class contains the complete dataset as property, so the file can become large for big datasets! To avoid this, use save_data=False . Parameters: filename: str or None, optional (default=None) Name to save the file with. If None or 'auto', use the name of the class. save_data: bool, optional (default=True) Whether to save the data as an attribute of the instance. If False, remember to update the data immediately after loading the pickle using the dataset's @setter . method scoring (metric=None, dataset='test') [source] Print the scoring of the models for a specific metric. If a model shows a XXX , it means the metric failed for that specific model. This can happen if either the metric is unavailable for the task or if the model does not have a predict_proba method while the metric requires it. Parameters: metric: str or None, optional (default=None) Name of the metric to calculate. Choose from any of sklearn's SCORERS . If None, returns the final results for the model (ignores the dataset parameter). dataset: str, optional (default='test') Data set on which to calculate the metric. Options are 'train' or 'test'. method set_params (**params) [source] Set the parameters of this estimator. Parameters: **params: dict Estimator parameters. Returns: self: TrainerRegressor Estimator instance. Example from atom.training import TrainerRegressor # Run the pipeline trainer = TrainerRegressor(['OLS', 'BR'], n_calls=5, n_initial_points=3, bagging=5) trainer.run(train, test) # Analyze the results trainer.scoring('mse') trainer.plot_bagging()","title":"TrainerRegressor"},{"location":"API/training/trainerregressor/#trainerregressor","text":"class atom.training. 
TrainerRegressor (models, metric=None, greater_is_better=True, needs_proba=False, needs_threshold=False, n_calls=10, n_random_points=5, bo_params={}, bagging=None, n_jobs=1, verbose=0, logger=None, random_state=None) [source] Fit and evaluates the models to the data in the pipeline. The following steps are applied: The optimal hyperparameters are selected using a bayesian optimization algorithm. The model is fitted on the training set using the best combinations of hyperparameters found. Using a bagging algorithm, various scores on the test set are calculated. Just like atom , you can predict , plot and call any model from the TrainerRegressor instance. Read more in the user guide . Parameters: models: str or sequence List of models to fit on the data. Use the predefined acronyms to select the models. Possible values are (case insensitive): 'GP' for Gaussian Process (no hyperparameter tuning) 'OLS' for Ordinary Least Squares (no hyperparameter tuning) 'Ridge' for Ridge Linear Regression 'Lasso' for Lasso Linear Regression 'EN' for ElasticNet Linear Regression 'BR' for Bayesian Regression (uses ridge regularization) 'KNN' for K-Nearest Neighbors 'Tree' for a single Decision Tree 'Bag' for Bagging (uses a decision tree as base estimator) 'ET' for Extra-Trees 'RF' for Random Forest 'AdaB' for AdaBoost (uses a decision tree as base estimator) 'GBM' for Gradient Boosting Machine 'XGB' for XGBoost (only available if package is installed) 'LGB' for LightGBM (only available if package is installed) 'CatB' for CatBoost (only available if package is installed) 'lSVM' for Linear Support Vector Machine (uses a one-vs-rest strategy for multiclass classification) 'kSVM' for Kernel Support Vector Machine (uses a one-vs-one strategy for multiclass classification) 'PA' for Passive Aggressive 'SGD' for Stochastic Gradient Descent 'MLP' for Multilayer Perceptron (can have between one and three hidden layers) metric: str, callable or sequence, optional (default=None) Metric(s) on 
which the pipeline fits the models. Choose from any of sklearn's predefined scorers , use a score (or loss) function with signature metric(y, y_pred, **kwargs) or use a scorer object. If multiple metrics are selected, only the first will be used to optimize the BO. If None, a default metric is selected: 'f1' for binary classification 'f1_weighted' for multiclass classification 'r2' for regression greater_is_better: bool or sequence, optional (default=True) Whether the metric is a score function or a loss function, i.e. if True, a higher score is better and if False, lower is better. Will be ignored if the metric is a string or a scorer. If sequence, the n-th value will apply to the n-th metric in the pipeline. needs_proba: bool or sequence, optional (default=False) Whether the metric function requires probability estimates out of a classifier. If True, make sure that every model in the pipeline has a predict_proba method. Will be ignored if the metric is a string or a scorer. If sequence, the n-th value will apply to the n-th metric in the pipeline. needs_threshold: bool or sequence, optional (default=False) Whether the metric function takes a continuous decision certainty. This only works for binary classification using estimators that have either a decision_function or predict_proba method. Will be ignored if the metric is a string or a scorer. If sequence, the n-th value will apply to the n-th metric in the pipeline. n_calls: int or sequence, optional (default=0) Maximum number of iterations of the BO (including n_initial_points ). If 0, skip the BO and fit the model on its default parameters. If sequence, the n-th value will apply to the n-th model in the pipeline. n_initial_points: int or sequence, optional (default=5) Initial number of random tests of the BO before fitting the surrogate function. If equal to n_calls , the optimizer will technically be performing a random search. If sequence, the n-th value will apply to the n-th model in the pipeline. 
bo_params: dict, optional (default={}) Dictionary of extra keyword arguments for the BO. These can include: base_estimator: str, optional (default='GP') Base estimator to use in the BO. Choose from: 'GP' for Gaussian Process 'RF' for Random Forest 'ET' for Extra-Trees 'GBRT' for Gradient Boosted Regression Trees max_time: int, optional (default=np.inf) Stop the optimization after max_time seconds. delta_x: int or float, optional (default=0) Stop the optimization when |x1 - x2| < delta_x . delta_y: int or float, optional (default=0) Stop the optimization if the 5 minima are within delta_y . cv: int, optional (default=5) Number of folds for the cross-validation. If 1, the training set will be randomly split in a subtrain and validation set. early stopping: int, float or None, optional (default=None) Training will stop if the model didn't improve in last early_stopping rounds. If <1, fraction of rounds from the total. If None, no early stopping is performed. Only available for models that allow in-training evaluation. callback: callable or list of callables, optional (default=None) Callbacks for the BO. dimensions: dict, array or None, optional (default=None) Custom hyperparameter space for the bayesian optimization. Can be an array (only if there is 1 model in the pipeline) or a dictionary with the model's name as key. If None, ATOM's predefined dimensions are used. plot_bo: bool, optional (default=False) Whether to plot the BO's progress as it runs. Creates a canvas with two plots: the first plot shows the score of every trial and the second shows the distance between the last consecutive steps. Don't forget to call %matplotlib at the start of the cell if you are using an interactive notebook! Additional keyword argument for skopt's optimizer. bagging: int or None, optional (default=None) Number of data sets (bootstrapped from the training set) to use in the bagging algorithm. If None or 0, no bagging is performed. 
If sequence, the n-th value will apply to the n-th model in the pipeline. n_jobs: int, optional (default=1) Number of cores to use for parallel processing. If >0: Number of cores to use. If -1: Use all available cores. If <-1: Use available_cores - 1 + n_jobs. Beware that using multiple processes on the same machine may cause memory issues for large datasets. verbose: int, optional (default=0) Verbosity level of the class. Possible values are: 0 to not print anything. 1 to print basic information. 2 to print detailed information. logger: bool, str, class or None, optional (default=None) If None: Doesn't save a logging file. If bool: True for logging file with default name. False for no logger. If str: Name of the logging file. 'auto' to create an automatic name. If class: python Logger object. random_state: int or None, optional (default=None) Seed used by the random number generator. If None, the random number generator is the RandomState instance used by numpy.random .","title":"TrainerRegressor"},{"location":"API/training/trainerregressor/#attributes","text":"","title":"Attributes"},{"location":"API/training/trainerregressor/#data-attributes","text":"The instance's dataset can be accessed through multiple attributes, e.g. calling trainer.train will return the training set. The data can also be changed through these attributes, e.g. trainer.test = trainer.test.drop(0) will drop the first row from the test set. Doing this will automatically update the other data attributes. Attributes: dataset: pd.DataFrame Complete dataset in the pipeline. train: pd.DataFrame Training set. test: pd.DataFrame Test set. X: pd.DataFrame Feature set. y: pd.Series Target column. X_train: pd.DataFrame Training features. y_train: pd.Series Training target. X_test: pd.DataFrame Test features. y_test: pd.Series Test target. shape: tuple Dataset's shape in the form (rows x columns). columns: list List of columns in the dataset. 
target: str Name of the target column.","title":"Data attributes"},{"location":"API/training/trainerregressor/#utility-attributes","text":"Attributes: models: list List of models in the pipeline. metric: str or list Metric(s) used to fit the models. errors: dict Dictionary of the encountered exceptions (if any). winner: model Model subclass that performed best on the test set. results: pd.DataFrame Dataframe of the training results with the model acronyms as index. Columns can include: name: Name of the model. metric_bo: Best score achieved during the BO. time_bo: Time spent on the BO. metric_train: Metric score on the training set. metric_test: Metric score on the test set. time_fit: Time spent fitting and evaluating. mean_bagging: Mean score of the bagging's results. std_bagging: Standard deviation score of the bagging's results. time_bagging: Time spent on the bagging algorithm. time: Total time spent on the whole run.","title":"Utility attributes"},{"location":"API/training/trainerregressor/#plot-attributes","text":"Attributes: style: str Plotting style. See seaborn's documentation . palette: str Color palette. See seaborn's documentation . title_fontsize: int Fontsize for the plot's title. label_fontsize: int Fontsize for labels and legends. tick_fontsize: int Fontsize for the ticks along the plot's axes.","title":"Plot attributes"},{"location":"API/training/trainerregressor/#methods","text":"clear Remove a model from the pipeline. get_params Get parameters for this estimator. log Save information to the logger and print to stdout. run Fit and evaluate the models. save Save the instance to a pickle file. scoring Print the scoring of the models for a specific metric. set_params Set the parameters of this estimator. method clear (models='all') [source] Removes all traces of a model in the pipeline (except for the errors attribute). If all models in the pipeline are removed, the metric is reset. 
Use this method to remove unwanted models from the pipeline or to clear memory before saving the instance. Parameters: models: str or sequence, optional (default='all') Model(s) to clear from the pipeline. If 'all', clear all models. method get_params (deep=True) [source] Get parameters for this estimator. Parameters: deep: bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators. Returns: params: dict Dictionary of the parameter names mapped to their values. method log (msg, level=0) [source] Write a message to the logger and print it to stdout. Parameters: msg: str Message to write to the logger and print to stdout. level: int, optional (default=0) Minimum verbosity level in order to print the message. method run (*arrays) [source] Fit and evaluate the models. Parameters: *arrays: array-like Either a train and test set or X_train, X_test, y_train, y_test. method save (filename=None, save_data=True) [source] Save the instance to a pickle file. Remember that the class contains the complete dataset as property, so the file can become large for big datasets! To avoid this, use save_data=False . Parameters: filename: str or None, optional (default=None) Name to save the file with. If None or 'auto', use the name of the class. save_data: bool, optional (default=True) Whether to save the data as an attribute of the instance. If False, remember to update the data immediately after loading the pickle using the dataset's @setter . method scoring (metric=None, dataset='test') [source] Print the scoring of the models for a specific metric. If a model shows a XXX , it means the metric failed for that specific model. This can happen if either the metric is unavailable for the task or if the model does not have a predict_proba method while the metric requires it. Parameters: metric: str or None, optional (default=None) Name of the metric to calculate. Choose from any of sklearn's SCORERS . 
If None, returns the final results for the model (ignores the dataset parameter). dataset: str, optional (default='test') Data set on which to calculate the metric. Options are 'train' or 'test'. method set_params (**params) [source] Set the parameters of this estimator. Parameters: **params: dict Estimator parameters. Returns: self: TrainerRegressor Estimator instance.","title":"Methods"},{"location":"API/training/trainerregressor/#example","text":"from atom.training import TrainerRegressor # Run the pipeline trainer = TrainerRegressor(['OLS', 'BR'], n_calls=5, n_initial_points=3, bagging=5) trainer.run(train, test) # Analyze the results trainer.scoring('mse') trainer.plot_bagging()","title":"Example"},{"location":"API/training/trainsizingclassifier/","text":"TrainSizingClassifier class atom.training. TrainSizingClassifier (models, metric=None, greater_is_better=True, needs_proba=False, needs_threshold=False, train_sizes=np.linspace(0.2, 1.0, 5), n_calls=10, n_random_points=5, bo_params={}, bagging=None, n_jobs=1, verbose=0, logger=None, random_state=None) [source] Fit and evaluate the models in a train sizing fashion. The pipeline applies the following steps per iteration: The optimal hyperparameters are selected using a bayesian optimization algorithm. The model is fitted on the training set using the best combinations of hyperparameters found. Using a bagging algorithm, various scores on the test set are calculated. Just like atom , you can predict , plot and call any model from the TrainSizingClassifier instance. Read more in the user guide . Parameters: models: str or sequence Models to fit on the data. Use the predefined acronyms to select the models. 
Possible values are (case insensitive): 'GP' for Gaussian Process (no hyperparameter tuning) 'GNB' for Gaussian Naive Bayes (no hyperparameter tuning) 'MNB' for Multinomial Naive Bayes 'BNB' for Bernoulli Naive Bayes 'Ridge' for Ridge Linear Classification 'LR' for Logistic Regression 'LDA' for Linear Discriminant Analysis 'QDA' for Quadratic Discriminant Analysis 'KNN' for K-Nearest Neighbors 'Tree' for a single Decision Tree 'Bag' for Bagging (uses a decision tree as base estimator) 'ET' for Extra-Trees 'RF' for Random Forest 'AdaB' for AdaBoost (uses a decision tree as base estimator) 'GBM' for Gradient Boosting Machine 'XGB' for XGBoost (only available if package is installed) 'LGB' for LightGBM (only available if package is installed) 'CatB' for CatBoost (only available if package is installed) 'lSVM' for Linear Support Vector Machine (uses a one-vs-rest strategy for multiclass classification) 'kSVM' for Kernel Support Vector Machine (uses a one-vs-one strategy for multiclass classification) 'PA' for Passive Aggressive 'SGD' for Stochastic Gradient Descent 'MLP' for Multilayer Perceptron (can have between one and three hidden layers) metric: str, callable or sequence, optional (default=None) Metric(s) on which the pipeline fits the models. Choose from any of sklearn's predefined scorers , use a score (or loss) function with signature metric(y, y_pred, **kwargs) or use a scorer object. If multiple metrics are selected, only the first will be used to optimize the BO. If None, a default metric is selected: 'f1' for binary classification 'f1_weighted' for multiclass classification 'r2' for regression greater_is_better: bool or sequence, optional (default=True) Whether the metric is a score function or a loss function, i.e. if True, a higher score is better and if False, lower is better. Will be ignored if the metric is a string or a scorer. If sequence, the n-th value will apply to the n-th metric in the pipeline. 
needs_proba: bool or sequence, optional (default=False) Whether the metric function requires probability estimates out of a classifier. If True, make sure that every model in the pipeline has a predict_proba method. Will be ignored if the metric is a string or a scorer. If sequence, the n-th value will apply to the n-th metric in the pipeline. needs_threshold: bool or sequence, optional (default=False) Whether the metric function takes a continuous decision certainty. This only works for binary classification using estimators that have either a decision_function or predict_proba method. Will be ignored if the metric is a string or a scorer. If sequence, the n-th value will apply to the n-th metric in the pipeline. train_sizes: sequence, optional (default=np.linspace(0.2, 1.0, 5)) Relative or absolute numbers of training examples that will be used to generate the learning curve. If the value is <=1, it is interpreted as a fraction of the maximum size of the training set. If the value is > 1, it is interpreted as the total number of samples in the set. n_calls: int or sequence, optional (default=0) Maximum number of iterations of the BO (including n_initial_points ). If 0, skip the BO and fit the model on its default parameters. If sequence, the n-th value will apply to the n-th model in the pipeline. n_initial_points: int or sequence, optional (default=5) Initial number of random tests of the BO before fitting the surrogate function. If equal to n_calls , the optimizer will technically be performing a random search. If sequence, the n-th value will apply to the n-th model in the pipeline. bo_params: dict, optional (default={}) Dictionary of extra keyword arguments for the BO. These can include: base_estimator: str, optional (default='GP') Base estimator to use in the BO. 
Choose from: 'GP' for Gaussian Process 'RF' for Random Forest 'ET' for Extra-Trees 'GBRT' for Gradient Boosted Regression Trees max_time: int, optional (default=np.inf) Stop the optimization after max_time seconds. delta_x: int or float, optional (default=0) Stop the optimization when |x1 - x2| < delta_x . delta_y: int or float, optional (default=0) Stop the optimization if the 5 minima are within delta_y . cv: int, optional (default=5) Number of folds for the cross-validation. If 1, the training set will be randomly split in a subtrain and validation set. early stopping: int, float or None, optional (default=None) Training will stop if the model didn't improve in last early_stopping rounds. If <1, fraction of rounds from the total. If None, no early stopping is performed. Only available for models that allow in-training evaluation. callback: callable or list of callables, optional (default=None) Callbacks for the BO. dimensions: dict, array or None, optional (default=None) Custom hyperparameter space for the bayesian optimization. Can be an array (only if there is 1 model in the pipeline) or a dictionary with the model's name as key. If None, ATOM's predefined dimensions are used. plot_bo: bool, optional (default=False) Whether to plot the BO's progress as it runs. Creates a canvas with two plots: the first plot shows the score of every trial and the second shows the distance between the last consecutive steps. Don't forget to call %matplotlib at the start of the cell if you are using an interactive notebook! Additional keyword argument for skopt's optimizer. bagging: int or None, optional (default=None) Number of data sets (bootstrapped from the training set) to use in the bagging algorithm. If None or 0, no bagging is performed. If sequence, the n-th value will apply to the n-th model in the pipeline. n_jobs: int, optional (default=1) Number of cores to use for parallel processing. If >0: Number of cores to use. If -1: Use all available cores. 
If <-1: Use available_cores - 1 + n_jobs. Beware that using multiple processes on the same machine may cause memory issues for large datasets. verbose: int, optional (default=0) Verbosity level of the class. Possible values are: 0 to not print anything. 1 to print basic information. 2 to print detailed information. logger: bool, str, class or None, optional (default=None) If None: Doesn't save a logging file. If bool: True for logging file with default name. False for no logger. If str: Name of the logging file. 'auto' to create an automatic name. If class: python Logger object. random_state: int or None, optional (default=None) Seed used by the random number generator. If None, the random number generator is the RandomState instance used by numpy.random . Attributes Data attributes The instance's dataset can be accessed through multiple attributes, e.g. calling trainer.train will return the training set. The data can also be changed through these attributes, e.g. trainer.test = trainer.test.drop(0) will drop the first row from the test set. Doing this will automatically update the other data attributes. Attributes: dataset: pd.DataFrame Complete dataset in the pipeline. train: pd.DataFrame Training set. test: pd.DataFrame Test set. X: pd.DataFrame Feature set. y: pd.Series Target column. X_train: pd.DataFrame Training features. y_train: pd.Series Training target. X_test: pd.DataFrame Test features. y_test: pd.Series Test target. shape: tuple Dataset's shape in the form (rows x columns). columns: list List of columns in the dataset. target: str Name of the target column. categories: list Sorted list of the unique categories in the target column. n_categories: int Number of unique categories in the target column. Utility attributes Attributes: models: list List of models in the pipeline. metric: str or list Metric(s) used to fit the models. errors: dict Dictionary of the encountered exceptions (if any). 
winner: model Model subclass that performed best on the test set. results: pd.DataFrame Dataframe of the training results with the model acronyms as index. Columns can include: name: Name of the model. metric_bo: Best score achieved during the BO. time_bo: Time spent on the BO. metric_train: Metric score on the training set. metric_test: Metric score on the test set. time_fit: Time spent fitting and evaluating. mean_bagging: Mean score of the bagging's results. std_bagging: Standard deviation score of the bagging's results. time_bagging: Time spent on the bagging algorithm. time: Total time spent on the whole run. Plot attributes Attributes: style: str Plotting style. See seaborn's documentation . palette: str Color palette. See seaborn's documentation . title_fontsize: int Fontsize for the plot's title. label_fontsize: int Fontsize for labels and legends. tick_fontsize: int Fontsize for the ticks along the plot's axes. Methods calibrate Calibrate the winning model. clear Remove a model from the pipeline. get_params Get parameters for this estimator. log Save information to the logger and print to stdout. run Fit and evaluate the models. save Save the instance to a pickle file. scoring Print the scoring of the models for a specific metric. set_params Set the parameters of this estimator. method calibrate (**kwargs) [source] Applies probability calibration on the winning model. The calibration is done with the CalibratedClassifierCV class from sklearn. The model will be trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the estimator attribute. After calibrating, all prediction attributes of the winning model will reset. Parameters: **kwargs Additional keyword arguments for the CalibratedClassifierCV instance. Using cv='prefit' will use the trained model and fit the calibrator on the test set. Note that doing this will result in data leakage in the test set. 
Use this only if you have another, independent set for testing. method clear (models='all') [source] Removes all traces of a model in the pipeline (except for the errors attribute). If all models in the pipeline are removed, the metric is reset. Use this method to remove unwanted models from the pipeline or to clear memory before saving the instance. Parameters: models: str or sequence, optional (default='all') Model(s) to clear from the pipeline. If 'all', clear all models. method get_params (deep=True) [source] Get parameters for this estimator. Parameters: deep: bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators. Returns: params: dict Dictionary of the parameter names mapped to their values. method log (msg, level=0) [source] Write a message to the logger and print it to stdout. Parameters: msg: str Message to write to the logger and print to stdout. level: int, optional (default=0) Minimum verbosity level in order to print the message. method run (*arrays) [source] Fit and evaluate the models. Parameters: *arrays: array-like Either a train and test set or X_train, X_test, y_train, y_test. method save (filename=None, save_data=True) [source] Save the instance to a pickle file. Remember that the class contains the complete dataset as property, so the file can become large for big datasets! To avoid this, use save_data=False . Parameters: filename: str or None, optional (default=None) Name to save the file with. If None or 'auto', use default name (TrainSizingClassifier). save_data: bool, optional (default=True) Whether to save the data as an attribute of the instance. If False, remember to update the data immediately after loading the pickle using the dataset's @setter . method scoring (metric=None, dataset='test') [source] Print the scoring of the models for a specific metric. If a model shows a XXX , it means the metric failed for that specific model. 
This can happen if either the metric is unavailable for the task or if the model does not have a predict_proba method while the metric requires it. Parameters: metric: str or None, optional (default=None) Name of the metric to calculate. Choose from any of sklearn's SCORERS or one of the following custom metrics: 'cm' or 'confusion_matrix' for an array of the confusion matrix. 'tn' for true negatives. 'fp' for false positives. 'fn' for false negatives. 'lift' for the lift metric. 'fpr' for the false positive rate. 'tpr' for true positive rate. 'sup' for the support metric. If None, returns the final results for the model (ignores the dataset parameter). dataset: str, optional (default='test') Data set on which to calculate the metric. Options are 'train' or 'test'. method set_params (**params) [source] Set the parameters of this estimator. Parameters: **params: dict Estimator parameters. Returns: self: TrainSizingClassifier Estimator instance. Example from atom.training import TrainSizingClassifier # Run the pipeline trainer = TrainSizingClassifier('RF', n_calls=5, n_initial_points=3) trainer.run(train, test) # Analyze the results trainer.plot_learning_curve()","title":"TrainSizingClassifier"},{"location":"API/training/trainsizingclassifier/#trainsizingclassifier","text":"class atom.training. TrainSizingClassifier (models, metric=None, greater_is_better=True, needs_proba=False, needs_threshold=False, train_sizes=np.linspace(0.2, 1.0, 5), n_calls=10, n_random_points=5, bo_params={}, bagging=None, n_jobs=1, verbose=0, logger=None, random_state=None) [source] Fit and evaluate the models in a train sizing fashion. The pipeline applies the following steps per iteration: The optimal hyperparameters are selected using a bayesian optimization algorithm. The model is fitted on the training set using the best combinations of hyperparameters found. Using a bagging algorithm, various scores on the test set are calculated. 
Just like atom , you can predict , plot and call any model from the TrainSizingClassifier instance. Read more in the user guide . Parameters: models: str or sequence Models to fit on the data. Use the predefined acronyms to select the models. Possible values are (case insensitive): 'GP' for Gaussian Process (no hyperparameter tuning) 'GNB' for Gaussian Naive Bayes (no hyperparameter tuning) 'MNB' for Multinomial Naive Bayes 'BNB' for Bernoulli Naive Bayes 'Ridge' for Ridge Linear Classification 'LR' for Logistic Regression 'LDA' for Linear Discriminant Analysis 'QDA' for Quadratic Discriminant Analysis 'KNN' for K-Nearest Neighbors 'Tree' for a single Decision Tree 'Bag' for Bagging (uses a decision tree as base estimator) 'ET' for Extra-Trees 'RF' for Random Forest 'AdaB' for AdaBoost (uses a decision tree as base estimator) 'GBM' for Gradient Boosting Machine 'XGB' for XGBoost (only available if package is installed) 'LGB' for LightGBM (only available if package is installed) 'CatB' for CatBoost (only available if package is installed) 'lSVM' for Linear Support Vector Machine (uses a one-vs-rest strategy for multiclass classification) 'kSVM' for Kernel Support Vector Machine (uses a one-vs-one strategy for multiclass classification) 'PA' for Passive Aggressive 'SGD' for Stochastic Gradient Descent 'MLP' for Multilayer Perceptron (can have between one and three hidden layers) metric: str, callable or sequence, optional (default=None) Metric(s) on which the pipeline fits the models. Choose from any of sklearn's predefined scorers , use a score (or loss) function with signature metric(y, y_pred, **kwargs) or use a scorer object. If multiple metrics are selected, only the first will be used to optimize the BO. If None, a default metric is selected: 'f1' for binary classification 'f1_weighted' for multiclass classification 'r2' for regression greater_is_better: bool or sequence, optional (default=True) Whether the metric is a score function or a loss function, i.e. 
if True, a higher score is better and if False, lower is better. Will be ignored if the metric is a string or a scorer. If sequence, the n-th value will apply to the n-th metric in the pipeline. needs_proba: bool or sequence, optional (default=False) Whether the metric function requires probability estimates out of a classifier. If True, make sure that every model in the pipeline has a predict_proba method. Will be ignored if the metric is a string or a scorer. If sequence, the n-th value will apply to the n-th metric in the pipeline. needs_threshold: bool or sequence, optional (default=False) Whether the metric function takes a continuous decision certainty. This only works for binary classification using estimators that have either a decision_function or predict_proba method. Will be ignored if the metric is a string or a scorer. If sequence, the n-th value will apply to the n-th metric in the pipeline. train_sizes: sequence, optional (default=np.linspace(0.2, 1.0, 5)) Relative or absolute numbers of training examples that will be used to generate the learning curve. If the value is <=1, it is interpreted as a fraction of the maximum size of the training set. If the value is > 1, it is interpreted as the total number of samples in the set. n_calls: int or sequence, optional (default=0) Maximum number of iterations of the BO (including n_initial_points ). If 0, skip the BO and fit the model on its default parameters. If sequence, the n-th value will apply to the n-th model in the pipeline. n_initial_points: int or sequence, optional (default=5) Initial number of random tests of the BO before fitting the surrogate function. If equal to n_calls , the optimizer will technically be performing a random search. If sequence, the n-th value will apply to the n-th model in the pipeline. bo_params: dict, optional (default={}) Dictionary of extra keyword arguments for the BO. These can include: base_estimator: str, optional (default='GP') Base estimator to use in the BO. 
Choose from: 'GP' for Gaussian Process 'RF' for Random Forest 'ET' for Extra-Trees 'GBRT' for Gradient Boosted Regression Trees max_time: int, optional (default=np.inf) Stop the optimization after max_time seconds. delta_x: int or float, optional (default=0) Stop the optimization when |x1 - x2| < delta_x . delta_y: int or float, optional (default=0) Stop the optimization if the 5 minima are within delta_y . cv: int, optional (default=5) Number of folds for the cross-validation. If 1, the training set will be randomly split in a subtrain and validation set. early stopping: int, float or None, optional (default=None) Training will stop if the model didn't improve in last early_stopping rounds. If <1, fraction of rounds from the total. If None, no early stopping is performed. Only available for models that allow in-training evaluation. callback: callable or list of callables, optional (default=None) Callbacks for the BO. dimensions: dict, array or None, optional (default=None) Custom hyperparameter space for the bayesian optimization. Can be an array (only if there is 1 model in the pipeline) or a dictionary with the model's name as key. If None, ATOM's predefined dimensions are used. plot_bo: bool, optional (default=False) Whether to plot the BO's progress as it runs. Creates a canvas with two plots: the first plot shows the score of every trial and the second shows the distance between the last consecutive steps. Don't forget to call %matplotlib at the start of the cell if you are using an interactive notebook! Additional keyword argument for skopt's optimizer. bagging: int or None, optional (default=None) Number of data sets (bootstrapped from the training set) to use in the bagging algorithm. If None or 0, no bagging is performed. If sequence, the n-th value will apply to the n-th model in the pipeline. n_jobs: int, optional (default=1) Number of cores to use for parallel processing. If >0: Number of cores to use. If -1: Use all available cores. 
If <-1: Use available_cores - 1 + n_jobs. Beware that using multiple processes on the same machine may cause memory issues for large datasets. verbose: int, optional (default=0) Verbosity level of the class. Possible values are: 0 to not print anything. 1 to print basic information. 2 to print detailed information. logger: bool, str, class or None, optional (default=None) If None: Doesn't save a logging file. If bool: True for logging file with default name. False for no logger. If str: Name of the logging file. 'auto' to create an automatic name. If class: python Logger object. random_state: int or None, optional (default=None) Seed used by the random number generator. If None, the random number generator is the RandomState instance used by numpy.random .","title":"TrainSizingClassifier"},{"location":"API/training/trainsizingclassifier/#attributes","text":"","title":"Attributes"},{"location":"API/training/trainsizingclassifier/#data-attributes","text":"The instance's dataset can be accessed through multiple attributes, e.g. calling trainer.train will return the training set. The data can also be changed through these attributes, e.g. trainer.test = trainer.test.drop(0) will drop the first row from the test set. Doing this will automatically update the other data attributes. Attributes: dataset: pd.DataFrame Complete dataset in the pipeline. train: pd.DataFrame Training set. test: pd.DataFrame Test set. X: pd.DataFrame Feature set. y: pd.Series Target column. X_train: pd.DataFrame Training features. y_train: pd.Series Training target. X_test: pd.DataFrame Test features. y_test: pd.Series Test target. shape: tuple Dataset's shape in the form (rows x columns). columns: list List of columns in the dataset. target: str Name of the target column. categories: list Sorted list of the unique categories in the target column. 
n_categories: int Number of unique categories in the target column.","title":"Data attributes"},{"location":"API/training/trainsizingclassifier/#utility-attributes","text":"Attributes: models: list List of models in the pipeline. metric: str or list Metric(s) used to fit the models. errors: dict Dictionary of the encountered exceptions (if any). winner: model Model subclass that performed best on the test set. results: pd.DataFrame Dataframe of the training results with the model acronyms as index. Columns can include: name: Name of the model. metric_bo: Best score achieved during the BO. time_bo: Time spent on the BO. metric_train: Metric score on the training set. metric_test: Metric score on the test set. time_fit: Time spent fitting and evaluating. mean_bagging: Mean score of the bagging's results. std_bagging: Standard deviation score of the bagging's results. time_bagging: Time spent on the bagging algorithm. time: Total time spent on the whole run.","title":"Utility attributes"},{"location":"API/training/trainsizingclassifier/#plot-attributes","text":"Attributes: style: str Plotting style. See seaborn's documentation . palette: str Color palette. See seaborn's documentation . title_fontsize: int Fontsize for the plot's title. label_fontsize: int Fontsize for labels and legends. tick_fontsize: int Fontsize for the ticks along the plot's axes.","title":"Plot attributes"},{"location":"API/training/trainsizingclassifier/#methods","text":"calibrate Calibrate the winning model. clear Remove a model from the pipeline. get_params Get parameters for this estimator. log Save information to the logger and print to stdout. run Fit and evaluate the models. save Save the instance to a pickle file. scoring Print the scoring of the models for a specific metric. set_params Set the parameters of this estimator. method calibrate (**kwargs) [source] Applies probability calibration on the winning model. The calibration is done with the CalibratedClassifierCV class from sklearn. 
The model will be trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the estimator attribute. After calibrating, all prediction attributes of the winning model will reset. Parameters: **kwargs Additional keyword arguments for the CalibratedClassifierCV instance. Using cv='prefit' will use the trained model and fit the calibrator on the test set. Note that doing this will result in data leakage in the test set. Use this only if you have another, independent set for testing. method clear (models='all') [source] Removes all traces of a model in the pipeline (except for the errors attribute). If all models in the pipeline are removed, the metric is reset. Use this method to remove unwanted models from the pipeline or to clear memory before saving the instance. Parameters: models: str or sequence, optional (default='all') Model(s) to clear from the pipeline. If 'all', clear all models. method get_params (deep=True) [source] Get parameters for this estimator. Parameters: deep: bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators. Returns: params: dict Dictionary of the parameter names mapped to their values. method log (msg, level=0) [source] Write a message to the logger and print it to stdout. Parameters: msg: str Message to write to the logger and print to stdout. level: int, optional (default=0) Minimum verbosity level in order to print the message. method run (*arrays) [source] Fit and evaluate the models. Parameters: *arrays: array-like Either a train and test set or X_train, X_test, y_train, y_test. method save (filename=None, save_data=True) [source] Save the instance to a pickle file. Remember that the class contains the complete dataset as property, so the file can become large for big datasets! To avoid this, use save_data=False . 
Parameters: filename: str or None, optional (default=None) Name to save the file with. If None or 'auto', use default name (TrainSizingClassifier). save_data: bool, optional (default=True) Whether to save the data as an attribute of the instance. If False, remember to update the data immediately after loading the pickle using the dataset's @setter . method scoring (metric=None, dataset='test') [source] Print the scoring of the models for a specific metric. If a model shows a XXX , it means the metric failed for that specific model. This can happen if either the metric is unavailable for the task or if the model does not have a predict_proba method while the metric requires it. Parameters: metric: str or None, optional (default=None) Name of the metric to calculate. Choose from any of sklearn's SCORERS or one of the following custom metrics: 'cm' or 'confusion_matrix' for an array of the confusion matrix. 'tn' for true negatives. 'fp' for false positives. 'fn' for false negatives. 'lift' for the lift metric. 'fpr' for the false positive rate. 'tpr' for true positive rate. 'sup' for the support metric. If None, returns the final results for the model (ignores the dataset parameter). dataset: str, optional (default='test') Data set on which to calculate the metric. Options are 'train' or 'test'. method set_params (**params) [source] Set the parameters of this estimator. Parameters: **params: dict Estimator parameters. Returns: self: TrainSizingClassifier Estimator instance.","title":"Methods"},{"location":"API/training/trainsizingclassifier/#example","text":"from atom.training import TrainSizingClassifier # Run the pipeline trainer = TrainSizingClassifier('RF', n_calls=5, n_initial_points=3) trainer.run(train, test) # Analyze the results trainer.plot_learning_curve()","title":"Example"},{"location":"API/training/trainsizingregressor/","text":"TrainSizingRegressor class atom.training. 
TrainSizingRegressor (models, metric=None, greater_is_better=True, needs_proba=False, needs_threshold=False, train_sizes=np.linspace(0.2, 1.0, 5), n_calls=10, n_random_points=5, bo_params={}, bagging=None, n_jobs=1, verbose=0, logger=None, random_state=None) [source] Fit and evaluate the models in a train sizing fashion. The pipeline applies the following steps per iteration: The optimal hyperparameters are selected using a bayesian optimization algorithm. The model is fitted on the training set using the best combinations of hyperparameters found. Using a bagging algorithm, various scores on the test set are calculated. Just like atom , you can predict , plot and call any model from the TrainSizingRegressor instance. Read more in the user guide . Parameters: models: str or sequence List of models to fit on the data. Use the predefined acronyms to select the models. Possible values are (case insensitive): 'GP' for Gaussian Process (no hyperparameter tuning) 'OLS' for Ordinary Least Squares (no hyperparameter tuning) 'Ridge' for Ridge Linear Regression 'Lasso' for Lasso Linear Regression 'EN' for ElasticNet Linear Regression 'BR' for Bayesian Regression (uses ridge regularization) 'KNN' for K-Nearest Neighbors 'Tree' for a single Decision Tree 'Bag' for Bagging (uses a decision tree as base estimator) 'ET' for Extra-Trees 'RF' for Random Forest 'AdaB' for AdaBoost (uses a decision tree as base estimator) 'GBM' for Gradient Boosting Machine 'XGB' for XGBoost (only available if package is installed) 'LGB' for LightGBM (only available if package is installed) 'CatB' for CatBoost (only available if package is installed) 'lSVM' for Linear Support Vector Machine (uses a one-vs-rest strategy for multiclass classification) 'kSVM' for Kernel Support Vector Machine (uses a one-vs-one strategy for multiclass classification) 'PA' for Passive Aggressive 'SGD' for Stochastic Gradient Descent 'MLP' for Multilayer Perceptron (can have between one and three hidden layers) metric: 
str, callable or sequence, optional (default=None) Metric(s) on which the pipeline fits the models. Choose from any of sklearn's predefined scorers , use a score (or loss) function with signature metric(y, y_pred, **kwargs) or use a scorer object. If multiple metrics are selected, only the first will be used to optimize the BO. If None, a default metric is selected: 'f1' for binary classification 'f1_weighted' for multiclass classification 'r2' for regression greater_is_better: bool or sequence, optional (default=True) Whether the metric is a score function or a loss function, i.e. if True, a higher score is better and if False, lower is better. Will be ignored if the metric is a string or a scorer. If sequence, the n-th value will apply to the n-th metric in the pipeline. needs_proba: bool or sequence, optional (default=False) Whether the metric function requires probability estimates out of a classifier. If True, make sure that every model in the pipeline has a predict_proba method. Will be ignored if the metric is a string or a scorer. If sequence, the n-th value will apply to the n-th metric in the pipeline. needs_threshold: bool or sequence, optional (default=False) Whether the metric function takes a continuous decision certainty. This only works for binary classification using estimators that have either a decision_function or predict_proba method. Will be ignored if the metric is a string or a scorer. If sequence, the n-th value will apply to the n-th metric in the pipeline. train_sizes: sequence, optional (default=np.linspace(0.2, 1.0, 5)) Relative or absolute numbers of training examples that will be used to generate the learning curve. If the value is <=1, it is interpreted as a fraction of the maximum size of the training set. If the value is > 1, it is interpreted as the total number of samples in the set. n_calls: int or sequence, optional (default=0) Maximum number of iterations of the BO (including n_initial_points ). 
If 0, skip the BO and fit the model on its default parameters. If sequence, the n-th value will apply to the n-th model in the pipeline. n_initial_points: int or sequence, optional (default=5) Initial number of random tests of the BO before fitting the surrogate function. If equal to n_calls , the optimizer will technically be performing a random search. If sequence, the n-th value will apply to the n-th model in the pipeline. bo_params: dict, optional (default={}) Dictionary of extra keyword arguments for the BO. These can include: base_estimator: str, optional (default='GP') Base estimator to use in the BO. Choose from: 'GP' for Gaussian Process 'RF' for Random Forest 'ET' for Extra-Trees 'GBRT' for Gradient Boosted Regression Trees max_time: int, optional (default=np.inf) Stop the optimization after max_time seconds. delta_x: int or float, optional (default=0) Stop the optimization when |x1 - x2| < delta_x . delta_y: int or float, optional (default=0) Stop the optimization if the 5 minima are within delta_y . cv: int, optional (default=5) Number of folds for the cross-validation. If 1, the training set will be randomly split in a subtrain and validation set. early stopping: int, float or None, optional (default=None) Training will stop if the model didn't improve in last early_stopping rounds. If <1, fraction of rounds from the total. If None, no early stopping is performed. Only available for models that allow in-training evaluation. callback: callable or list of callables, optional (default=None) Callbacks for the BO. dimensions: dict, array or None, optional (default=None) Custom hyperparameter space for the bayesian optimization. Can be an array (only if there is 1 model in the pipeline) or a dictionary with the model's name as key. If None, ATOM's predefined dimensions are used. plot_bo: bool, optional (default=False) Whether to plot the BO's progress as it runs. 
Creates a canvas with two plots: the first plot shows the score of every trial and the second shows the distance between the last consecutive steps. Don't forget to call %matplotlib at the start of the cell if you are using an interactive notebook! Additional keyword argument for skopt's optimizer. bagging: int or None, optional (default=None) Number of data sets (bootstrapped from the training set) to use in the bagging algorithm. If None or 0, no bagging is performed. If sequence, the n-th value will apply to the n-th model in the pipeline. n_jobs: int, optional (default=1) Number of cores to use for parallel processing. If >0: Number of cores to use. If -1: Use all available cores. If <-1: Use available_cores - 1 + n_jobs. Beware that using multiple processes on the same machine may cause memory issues for large datasets. verbose: int, optional (default=0) Verbosity level of the class. Possible values are: 0 to not print anything. 1 to print basic information. 2 to print detailed information. logger: bool, str, class or None, optional (default=None) If None: Doesn't save a logging file. If bool: True for logging file with default name. False for no logger. If str: Name of the logging file. 'auto' to create an automatic name. If class: python Logger object. random_state: int or None, optional (default=None) Seed used by the random number generator. If None, the random number generator is the RandomState instance used by numpy.random . Attributes Data attributes The instance's dataset can be accessed through multiple attributes, e.g. calling trainer.train will return the training set. The data can also be changed through these attributes, e.g. trainer.test = trainer.test.drop(0) will drop the first row from the test set. Doing this will automatically update the other data attributes. Attributes: dataset: pd.DataFrame Complete dataset in the pipeline. train: pd.DataFrame Training set. test: pd.DataFrame Test set. X: pd.DataFrame Feature set. 
y: pd.Series Target column. X_train: pd.DataFrame Training features. y_train: pd.Series Training target. X_test: pd.DataFrame Test features. y_test: pd.Series Test target. shape: tuple Dataset's shape in the form (rows x columns). columns: list List of columns in the dataset. target: str Name of the target column. Utility attributes Attributes: models: list List of models in the pipeline. metric: str or list Metric(s) used to fit the models. errors: dict Dictionary of the encountered exceptions (if any). winner: model Model subclass that performed best on the test set. results: pd.DataFrame Dataframe of the training results with the model acronyms as index. Columns can include: name: Name of the model. metric_bo: Best score achieved during the BO. time_bo: Time spent on the BO. metric_train: Metric score on the training set. metric_test: Metric score on the test set. time_fit: Time spent fitting and evaluating. mean_bagging: Mean score of the bagging's results. std_bagging: Standard deviation score of the bagging's results. time_bagging: Time spent on the bagging algorithm. time: Total time spent on the whole run. Plot attributes Attributes: style: str Plotting style. See seaborn's documentation . palette: str Color palette. See seaborn's documentation . title_fontsize: int Fontsize for the plot's title. label_fontsize: int Fontsize for labels and legends. tick_fontsize: int Fontsize for the ticks along the plot's axes. Methods clear Remove a model from the pipeline. get_params Get parameters for this estimator. log Save information to the logger and print to stdout. run Fit and evaluate the models. save Save the instance to a pickle file. scoring Print the scoring of the models for a specific metric. set_params Set the parameters of this estimator. method clear (models='all') [source] Removes all traces of a model in the pipeline (except for the errors attribute). If all models in the pipeline are removed, the metric is reset. 
Use this method to remove unwanted models from the pipeline or to clear memory before saving the instance. Parameters: models: str or sequence, optional (default='all') Model(s) to clear from the pipeline. If 'all', clear all models. method get_params (deep=True) [source] Get parameters for this estimator. Parameters: deep: bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators. Returns: params: dict Dictionary of the parameter names mapped to their values. method log (msg, level=0) [source] Write a message to the logger and print it to stdout. Parameters: msg: str Message to write to the logger and print to stdout. level: int, optional (default=0) Minimum verbosity level in order to print the message. method run (*arrays) [source] Fit and evaluate the models. Parameters: *arrays: array-like Either a train and test set or X_train, X_test, y_train, y_test. method save (filename=None, save_data=True) [source] Save the instance to a pickle file. Remember that the class contains the complete dataset as property, so the file can become large for big datasets! To avoid this, use save_data=False . Parameters: filename: str or None, optional (default=None) Name to save the file with. If None or 'auto', use the name of the class. save_data: bool, optional (default=True) Whether to save the data as an attribute of the instance. If False, remember to update the data immediately after loading the pickle using the dataset's @setter . method scoring (metric=None, dataset='test') [source] Print the scoring of the models for a specific metric. If a model shows a XXX , it means the metric failed for that specific model. This can happen if either the metric is unavailable for the task or if the model does not have a predict_proba method while the metric requires it. Parameters: metric: str or None, optional (default=None) Name of the metric to calculate. Choose from any of sklearn's SCORERS . 
If None, returns the final results for the model (ignores the dataset parameter). dataset: str, optional (default='test') Data set on which to calculate the metric. Options are 'train' or 'test'. method set_params (**params) [source] Set the parameters of this estimator. Parameters: **params: dict Estimator parameters. Returns: self: TrainerRegressor Estimator instance. Example from atom.training import TrainSizingRegressor # Run the pipeline trainer = TrainSizingRegressor('RF', n_calls=5, n_initial_points=3) trainer.run(train, test) # Analyze the results trainer.plot_learning_curve()","title":"TrainSizingRegressor"},{"location":"API/training/trainsizingregressor/#trainsizingregressor","text":"class atom.training. TrainSizingRegressor (models, metric=None, greater_is_better=True, needs_proba=False, needs_threshold=False, train_sizes=np.linspace(0.2, 1.0, 5), n_calls=10, n_random_points=5, bo_params={}, bagging=None, n_jobs=1, verbose=0, logger=None, random_state=None) [source] Fit and evaluate the models in a train sizing fashion. The pipeline applies the following steps per iteration: The optimal hyperparameters are selected using a bayesian optimization algorithm. The model is fitted on the training set using the best combinations of hyperparameters found. Using a bagging algorithm, various scores on the test set are calculated. Just like atom , you can predict , plot and call any model from the TrainSizingRegressor instance. Read more in the user guide . Parameters: models: str or sequence List of models to fit on the data. Use the predefined acronyms to select the models. 
Possible values are (case insensitive): 'GP' for Gaussian Process (no hyperparameter tuning) 'OLS' for Ordinary Least Squares (no hyperparameter tuning) 'Ridge' for Ridge Linear Regression 'Lasso' for Lasso Linear Regression 'EN' for ElasticNet Linear Regression 'BR' for Bayesian Regression (uses ridge regularization) 'KNN' for K-Nearest Neighbors 'Tree' for a single Decision Tree 'Bag' for Bagging (uses a decision tree as base estimator) 'ET' for Extra-Trees 'RF' for Random Forest 'AdaB' for AdaBoost (uses a decision tree as base estimator) 'GBM' for Gradient Boosting Machine 'XGB' for XGBoost (only available if package is installed) 'LGB' for LightGBM (only available if package is installed) 'CatB' for CatBoost (only available if package is installed) 'lSVM' for Linear Support Vector Machine (uses a one-vs-rest strategy for multiclass classification) 'kSVM' for Kernel Support Vector Machine (uses a one-vs-one strategy for multiclass classification) 'PA' for Passive Aggressive 'SGD' for Stochastic Gradient Descent 'MLP' for Multilayer Perceptron (can have between one and three hidden layers) metric: str, callable or sequence, optional (default=None) Metric(s) on which the pipeline fits the models. Choose from any of sklearn's predefined scorers , use a score (or loss) function with signature metric(y, y_pred, **kwargs) or use a scorer object. If multiple metrics are selected, only the first will be used to optimize the BO. If None, a default metric is selected: 'f1' for binary classification 'f1_weighted' for multiclass classification 'r2' for regression greater_is_better: bool or sequence, optional (default=True) Whether the metric is a score function or a loss function, i.e. if True, a higher score is better and if False, lower is better. Will be ignored if the metric is a string or a scorer. If sequence, the n-th value will apply to the n-th metric in the pipeline. 
needs_proba: bool or sequence, optional (default=False) Whether the metric function requires probability estimates out of a classifier. If True, make sure that every model in the pipeline has a predict_proba method. Will be ignored if the metric is a string or a scorer. If sequence, the n-th value will apply to the n-th metric in the pipeline. needs_threshold: bool or sequence, optional (default=False) Whether the metric function takes a continuous decision certainty. This only works for binary classification using estimators that have either a decision_function or predict_proba method. Will be ignored if the metric is a string or a scorer. If sequence, the n-th value will apply to the n-th metric in the pipeline. train_sizes: sequence, optional (default=np.linspace(0.2, 1.0, 5)) Relative or absolute numbers of training examples that will be used to generate the learning curve. If the value is <=1, it is interpreted as a fraction of the maximum size of the training set. If the value is > 1, it is interpreted as the total number of samples in the set. n_calls: int or sequence, optional (default=0) Maximum number of iterations of the BO (including n_initial_points ). If 0, skip the BO and fit the model on its default parameters. If sequence, the n-th value will apply to the n-th model in the pipeline. n_initial_points: int or sequence, optional (default=5) Initial number of random tests of the BO before fitting the surrogate function. If equal to n_calls , the optimizer will technically be performing a random search. If sequence, the n-th value will apply to the n-th model in the pipeline. bo_params: dict, optional (default={}) Dictionary of extra keyword arguments for the BO. These can include: base_estimator: str, optional (default='GP') Base estimator to use in the BO. 
Choose from: 'GP' for Gaussian Process 'RF' for Random Forest 'ET' for Extra-Trees 'GBRT' for Gradient Boosted Regression Trees max_time: int, optional (default=np.inf) Stop the optimization after max_time seconds. delta_x: int or float, optional (default=0) Stop the optimization when |x1 - x2| < delta_x . delta_y: int or float, optional (default=0) Stop the optimization if the 5 minima are within delta_y . cv: int, optional (default=5) Number of folds for the cross-validation. If 1, the training set will be randomly split in a subtrain and validation set. early stopping: int, float or None, optional (default=None) Training will stop if the model didn't improve in last early_stopping rounds. If <1, fraction of rounds from the total. If None, no early stopping is performed. Only available for models that allow in-training evaluation. callback: callable or list of callables, optional (default=None) Callbacks for the BO. dimensions: dict, array or None, optional (default=None) Custom hyperparameter space for the bayesian optimization. Can be an array (only if there is 1 model in the pipeline) or a dictionary with the model's name as key. If None, ATOM's predefined dimensions are used. plot_bo: bool, optional (default=False) Whether to plot the BO's progress as it runs. Creates a canvas with two plots: the first plot shows the score of every trial and the second shows the distance between the last consecutive steps. Don't forget to call %matplotlib at the start of the cell if you are using an interactive notebook! Additional keyword argument for skopt's optimizer. bagging: int or None, optional (default=None) Number of data sets (bootstrapped from the training set) to use in the bagging algorithm. If None or 0, no bagging is performed. If sequence, the n-th value will apply to the n-th model in the pipeline. n_jobs: int, optional (default=1) Number of cores to use for parallel processing. If >0: Number of cores to use. If -1: Use all available cores. 
If <-1: Use available_cores - 1 + n_jobs. Beware that using multiple processes on the same machine may cause memory issues for large datasets. verbose: int, optional (default=0) Verbosity level of the class. Possible values are: 0 to not print anything. 1 to print basic information. 2 to print detailed information. logger: bool, str, class or None, optional (default=None) If None: Doesn't save a logging file. If bool: True for logging file with default name. False for no logger. If str: Name of the logging file. 'auto' to create an automatic name. If class: python Logger object. random_state: int or None, optional (default=None) Seed used by the random number generator. If None, the random number generator is the RandomState instance used by numpy.random .","title":"TrainSizingRegressor"},{"location":"API/training/trainsizingregressor/#attributes","text":"","title":"Attributes"},{"location":"API/training/trainsizingregressor/#data-attributes","text":"The instance's dataset can be accessed through multiple attributes, e.g. calling trainer.train will return the training set. The data can also be changed through these attributes, e.g. trainer.test = trainer.test.drop(0) will drop the first row from the test set. Doing this will automatically update the other data attributes. Attributes: dataset: pd.DataFrame Complete dataset in the pipeline. train: pd.DataFrame Training set. test: pd.DataFrame Test set. X: pd.DataFrame Feature set. y: pd.Series Target column. X_train: pd.DataFrame Training features. y_train: pd.Series Training target. X_test: pd.DataFrame Test features. y_test: pd.Series Test target. shape: tuple Dataset's shape in the form (rows x columns). columns: list List of columns in the dataset. target: str Name of the target column.","title":"Data attributes"},{"location":"API/training/trainsizingregressor/#utility-attributes","text":"Attributes: models: list List of models in the pipeline. metric: str or list Metric(s) used to fit the models. 
errors: dict Dictionary of the encountered exceptions (if any). winner: model Model subclass that performed best on the test set. results: pd.DataFrame Dataframe of the training results with the model acronyms as index. Columns can include: name: Name of the model. metric_bo: Best score achieved during the BO. time_bo: Time spent on the BO. metric_train: Metric score on the training set. metric_test: Metric score on the test set. time_fit: Time spent fitting and evaluating. mean_bagging: Mean score of the bagging's results. std_bagging: Standard deviation score of the bagging's results. time_bagging: Time spent on the bagging algorithm. time: Total time spent on the whole run.","title":"Utility attributes"},{"location":"API/training/trainsizingregressor/#plot-attributes","text":"Attributes: style: str Plotting style. See seaborn's documentation . palette: str Color palette. See seaborn's documentation . title_fontsize: int Fontsize for the plot's title. label_fontsize: int Fontsize for labels and legends. tick_fontsize: int Fontsize for the ticks along the plot's axes.","title":"Plot attributes"},{"location":"API/training/trainsizingregressor/#methods","text":"clear Remove a model from the pipeline. get_params Get parameters for this estimator. log Save information to the logger and print to stdout. run Fit and evaluate the models. save Save the instance to a pickle file. scoring Print the scoring of the models for a specific metric. set_params Set the parameters of this estimator. method clear (models='all') [source] Removes all traces of a model in the pipeline (except for the errors attribute). If all models in the pipeline are removed, the metric is reset. Use this method to remove unwanted models from the pipeline or to clear memory before saving the instance. Parameters: models: str or sequence, optional (default='all') Model(s) to clear from the pipeline. If 'all', clear all models. method get_params (deep=True) [source] Get parameters for this estimator. 
Parameters: deep: bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators. Returns: params: dict Dictionary of the parameter names mapped to their values. method log (msg, level=0) [source] Write a message to the logger and print it to stdout. Parameters: msg: str Message to write to the logger and print to stdout. level: int, optional (default=0) Minimum verbosity level in order to print the message. method run (*arrays) [source] Fit and evaluate the models. Parameters: *arrays: array-like Either a train and test set or X_train, X_test, y_train, y_test. method save (filename=None, save_data=True) [source] Save the instance to a pickle file. Remember that the class contains the complete dataset as property, so the file can become large for big datasets! To avoid this, use save_data=False . Parameters: filename: str or None, optional (default=None) Name to save the file with. If None or 'auto', use the name of the class. save_data: bool, optional (default=True) Whether to save the data as an attribute of the instance. If False, remember to update the data immediately after loading the pickle using the dataset's @setter . method scoring (metric=None, dataset='test') [source] Print the scoring of the models for a specific metric. If a model shows a XXX , it means the metric failed for that specific model. This can happen if either the metric is unavailable for the task or if the model does not have a predict_proba method while the metric requires it. Parameters: metric: str or None, optional (default=None) Name of the metric to calculate. Choose from any of sklearn's SCORERS . If None, returns the final results for the model (ignores the dataset parameter). dataset: str, optional (default='test') Data set on which to calculate the metric. Options are 'train' or 'test'. method set_params (**params) [source] Set the parameters of this estimator. Parameters: **params: dict Estimator parameters. 
Returns: self: TrainerRegressor Estimator instance.","title":"Methods"},{"location":"API/training/trainsizingregressor/#example","text":"from atom.training import TrainSizingRegressor # Run the pipeline trainer = TrainSizingRegressor('RF', n_calls=5, n_initial_points=3) trainer.run(train, test) # Analyze the results trainer.plot_learning_curve()","title":"Example"},{"location":"examples/binary_classification/binary_classification/","text":"Binary classification This example shows how we can use ATOM to perform a variety of data cleaning steps in order to prepare the data for modelling. Then, we compare the prediction performance of an Extra-Trees and a Random Forest. The data used is a variation on the Australian weather dataset from https://www.kaggle.com/jsphyg/weather-dataset-rattle-package . The goal of this dataset is to predict whether or not it will rain tomorrow training a binay classifier on target RainTomorrow . Load the data # Import packages import pandas as pd from sklearn.metrics import fbeta_score from atom import ATOMClassifier # Load data X = pd.read_csv('./datasets/weatherAUS.csv') # Let's have a look at a subset of the data X.sample(frac=1).iloc[:5, :8] .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } Location MinTemp MaxTemp Rainfall Evaporation Sunshine WindGustDir WindGustSpeed 135379 AliceSprings 22.4 35.4 0.0 4.8 11.2 ESE 33.0 55572 Ballarat 11.7 19.8 0.0 NaN NaN NNE 48.0 111664 Witchcliffe 3.9 15.4 5.6 NaN NaN NW 43.0 6661 Cobar 21.6 34.9 0.0 11.2 NaN NNE 41.0 78634 Watsonia 13.6 33.3 0.0 8.0 12.3 N 37.0 Run the pipeline # Call ATOM using only 5% of the complete dataset (for explanatory purposes) atom = ATOMClassifier(X, y='RainTomorrow', n_rows=0.05, n_jobs=8, warnings=False, verbose=2, random_state=1) << ================== ATOM ================== >> Algorithm task: binary classification. Parallel processing with 8 cores. 
Applying data cleaning... Dataset stats ================= >> Shape: (7110, 22) Missing values: 15896 Categorical columns: 5 Scaled: False ---------------------------------- Train set size: 5688 Test set size: 1422 ---------------------------------- Train set balance: No:Yes <==> 3.7:1.0 Test set balance: No:Yes <==> 4.1:1.0 ---------------------------------- Instances in RainTomorrow per class: | | total | train_set | test_set | |:-------|---------:|-------------:|------------:| | 0: No | 5615 | 4473 | 1142 | | 1: Yes | 1495 | 1215 | 280 | # We can change the data attributes in between the pipeline # Note that we can only replace it with a new dataframe! atom.X = atom.X.assign(AvgTemp=(atom.X['MaxTemp'] + atom.X['MinTemp'])/2) # This will automatically update all other data attributes assert 'AvgTemp' in atom.dataset # Impute missing values atom.impute(strat_num='knn', strat_cat='drop', min_frac_rows=0.8) Fitting Imputer... Imputing missing values... --> Dropping 778 rows for containing less than 80% non-missing values. --> Imputing 5 missing values using the KNN imputer in feature MinTemp. --> Imputing 3 missing values using the KNN imputer in feature MaxTemp. --> Imputing 31 missing values using the KNN imputer in feature Rainfall. --> Imputing 2314 missing values using the KNN imputer in feature Evaporation. --> Imputing 2645 missing values using the KNN imputer in feature Sunshine. --> Dropping 201 rows due to missing values in feature WindGustDir. --> Dropping 358 rows due to missing values in feature WindDir9am. --> Dropping 15 rows due to missing values in feature WindDir3pm. --> Imputing 17 missing values using the KNN imputer in feature Humidity9am. --> Imputing 52 missing values using the KNN imputer in feature Humidity3pm. --> Imputing 37 missing values using the KNN imputer in feature Pressure9am. --> Imputing 34 missing values using the KNN imputer in feature Pressure3pm. --> Imputing 1891 missing values using the KNN imputer in feature Cloud9am. 
--> Imputing 1977 missing values using the KNN imputer in feature Cloud3pm. --> Imputing 4 missing values using the KNN imputer in feature Temp9am. --> Imputing 31 missing values using the KNN imputer in feature Temp3pm. --> Dropping 30 rows due to missing values in feature RainToday. --> Imputing 4 missing values using the KNN imputer in feature AvgTemp. # Encode the categorical features atom.encode(strategy='CatBoost', max_onehot=10, frac_to_other=0.04) Fitting Encoder... Encoding categorical columns... --> CatBoost-encoding feature Location. Contains 1 unique categories. --> CatBoost-encoding feature WindGustDir. Contains 16 unique categories. --> CatBoost-encoding feature WindDir9am. Contains 16 unique categories. --> CatBoost-encoding feature WindDir3pm. Contains 16 unique categories. --> Label-encoding feature RainToday. Contains 2 unique categories. # Perform undersampling of the majority class atom.balance(strategy='smote', sampling_strategy=0.9) atom.stats() # Note the balanced training set Oversampling with SMOTE... --> Adding 2302 rows to category: Yes. 
Dataset stats ================= >> Shape: (8030, 23) Scaled: False ---------------------------------- Train set size: 6885 Test set size: 1145 ---------------------------------- Train set balance: No:Yes <==> 1.1:1.0 Test set balance: No:Yes <==> 4.1:1.0 ---------------------------------- Instances in RainTomorrow per class: | | total | train_set | test_set | |:-------|---------:|-------------:|------------:| | 0: No | 4543 | 3624 | 919 | | 1: Yes | 3487 | 3261 | 226 | # Define a custom metric def f2_score(y_true, y_pred): return fbeta_score(y_true, y_pred, beta=2) # Fit the EXtra-Trees and Random Forest to the data atom.run(models=['et', 'rf'], metric=f2_score, n_calls=0, bagging=5, verbose=1) Running pipeline ============================= >> Models in pipeline: ET, RF Metric: f2_score Results for Extra-Trees: Fitting ----------------------------------------- Score on the train set --> f2_score: 1.0000 Score on the test set --> f2_score: 0.5474 Time elapsed: 0.191s Bagging ----------------------------------------- Score --> f2_score: 0.6027 \u00b1 0.0190 Time elapsed: 0.843s ------------------------------------------------- Total time: 1.038s Results for Random Forest: Fitting ----------------------------------------- Score on the train set --> f2_score: 1.0000 Score on the test set --> f2_score: 0.5959 Time elapsed: 0.295s Bagging ----------------------------------------- Score --> f2_score: 0.6087 \u00b1 0.0113 Time elapsed: 1.291s ------------------------------------------------- Total time: 1.589s Final results ========================= >> Duration: 2.627s ------------------------------------------ Extra-Trees --> f2_score: 0.603 \u00b1 0.019 ~ Random Forest --> f2_score: 0.609 \u00b1 0.011 ~ ! Analyze the results # Let's have a look at the final scoring atom.scoring() # The winning model is indicated with a ! and can be accessed through the winner attribute # The ~ indicates that the model is probably overfitting. 
If we look at the train and test # score we see a difference of more than 20% print(f'\\n\\nAnd the winner is the {atom.winner.longname} model!!') print('Score on the training set: ', atom.winner.metric_train) print('Score on the test set: ', atom.winner.metric_test) Results ===================== >> Extra-Trees --> f2_score: 0.603 \u00b1 0.019 ~ Random Forest --> f2_score: 0.609 \u00b1 0.011 ~ And the winner is the Random Forest model!! Score on the training set: 1.0 Score on the test set: 0.5958781362007168 We can make many plots to check the performance of the models # The probabilties plot shows the distribution of predicted # probabilities for the positive class atom.winner.plot_probabilities() # The threshold plot let us compare how different metrics # perform for different thresholds atom.winner.plot_threshold(metric=['f1', 'accuracy', 'average_precision'], steps=50, filename='thresholds.png') # The ROC and PRC curve are also typical ways of measuring performance atom.plot_roc(title=\"ROC for the LightGBM vs CatBoost model\") atom.plot_prc(title=\"PRC comparison of the models\")","title":"Binary classification"},{"location":"examples/binary_classification/binary_classification/#binary-classification","text":"This example shows how we can use ATOM to perform a variety of data cleaning steps in order to prepare the data for modelling. Then, we compare the prediction performance of an Extra-Trees and a Random Forest. The data used is a variation on the Australian weather dataset from https://www.kaggle.com/jsphyg/weather-dataset-rattle-package . 
The goal of this dataset is to predict whether or not it will rain tomorrow training a binay classifier on target RainTomorrow .","title":"Binary classification"},{"location":"examples/binary_classification/binary_classification/#load-the-data","text":"# Import packages import pandas as pd from sklearn.metrics import fbeta_score from atom import ATOMClassifier # Load data X = pd.read_csv('./datasets/weatherAUS.csv') # Let's have a look at a subset of the data X.sample(frac=1).iloc[:5, :8] .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } Location MinTemp MaxTemp Rainfall Evaporation Sunshine WindGustDir WindGustSpeed 135379 AliceSprings 22.4 35.4 0.0 4.8 11.2 ESE 33.0 55572 Ballarat 11.7 19.8 0.0 NaN NaN NNE 48.0 111664 Witchcliffe 3.9 15.4 5.6 NaN NaN NW 43.0 6661 Cobar 21.6 34.9 0.0 11.2 NaN NNE 41.0 78634 Watsonia 13.6 33.3 0.0 8.0 12.3 N 37.0","title":"Load the data"},{"location":"examples/binary_classification/binary_classification/#run-the-pipeline","text":"# Call ATOM using only 5% of the complete dataset (for explanatory purposes) atom = ATOMClassifier(X, y='RainTomorrow', n_rows=0.05, n_jobs=8, warnings=False, verbose=2, random_state=1) << ================== ATOM ================== >> Algorithm task: binary classification. Parallel processing with 8 cores. Applying data cleaning... 
Dataset stats ================= >> Shape: (7110, 22) Missing values: 15896 Categorical columns: 5 Scaled: False ---------------------------------- Train set size: 5688 Test set size: 1422 ---------------------------------- Train set balance: No:Yes <==> 3.7:1.0 Test set balance: No:Yes <==> 4.1:1.0 ---------------------------------- Instances in RainTomorrow per class: | | total | train_set | test_set | |:-------|---------:|-------------:|------------:| | 0: No | 5615 | 4473 | 1142 | | 1: Yes | 1495 | 1215 | 280 | # We can change the data attributes in between the pipeline # Note that we can only replace it with a new dataframe! atom.X = atom.X.assign(AvgTemp=(atom.X['MaxTemp'] + atom.X['MinTemp'])/2) # This will automatically update all other data attributes assert 'AvgTemp' in atom.dataset # Impute missing values atom.impute(strat_num='knn', strat_cat='drop', min_frac_rows=0.8) Fitting Imputer... Imputing missing values... --> Dropping 778 rows for containing less than 80% non-missing values. --> Imputing 5 missing values using the KNN imputer in feature MinTemp. --> Imputing 3 missing values using the KNN imputer in feature MaxTemp. --> Imputing 31 missing values using the KNN imputer in feature Rainfall. --> Imputing 2314 missing values using the KNN imputer in feature Evaporation. --> Imputing 2645 missing values using the KNN imputer in feature Sunshine. --> Dropping 201 rows due to missing values in feature WindGustDir. --> Dropping 358 rows due to missing values in feature WindDir9am. --> Dropping 15 rows due to missing values in feature WindDir3pm. --> Imputing 17 missing values using the KNN imputer in feature Humidity9am. --> Imputing 52 missing values using the KNN imputer in feature Humidity3pm. --> Imputing 37 missing values using the KNN imputer in feature Pressure9am. --> Imputing 34 missing values using the KNN imputer in feature Pressure3pm. --> Imputing 1891 missing values using the KNN imputer in feature Cloud9am. 
--> Imputing 1977 missing values using the KNN imputer in feature Cloud3pm. --> Imputing 4 missing values using the KNN imputer in feature Temp9am. --> Imputing 31 missing values using the KNN imputer in feature Temp3pm. --> Dropping 30 rows due to missing values in feature RainToday. --> Imputing 4 missing values using the KNN imputer in feature AvgTemp. # Encode the categorical features atom.encode(strategy='CatBoost', max_onehot=10, frac_to_other=0.04) Fitting Encoder... Encoding categorical columns... --> CatBoost-encoding feature Location. Contains 1 unique categories. --> CatBoost-encoding feature WindGustDir. Contains 16 unique categories. --> CatBoost-encoding feature WindDir9am. Contains 16 unique categories. --> CatBoost-encoding feature WindDir3pm. Contains 16 unique categories. --> Label-encoding feature RainToday. Contains 2 unique categories. # Perform undersampling of the majority class atom.balance(strategy='smote', sampling_strategy=0.9) atom.stats() # Note the balanced training set Oversampling with SMOTE... --> Adding 2302 rows to category: Yes. 
Dataset stats ================= >> Shape: (8030, 23) Scaled: False ---------------------------------- Train set size: 6885 Test set size: 1145 ---------------------------------- Train set balance: No:Yes <==> 1.1:1.0 Test set balance: No:Yes <==> 4.1:1.0 ---------------------------------- Instances in RainTomorrow per class: | | total | train_set | test_set | |:-------|---------:|-------------:|------------:| | 0: No | 4543 | 3624 | 919 | | 1: Yes | 3487 | 3261 | 226 | # Define a custom metric def f2_score(y_true, y_pred): return fbeta_score(y_true, y_pred, beta=2) # Fit the EXtra-Trees and Random Forest to the data atom.run(models=['et', 'rf'], metric=f2_score, n_calls=0, bagging=5, verbose=1) Running pipeline ============================= >> Models in pipeline: ET, RF Metric: f2_score Results for Extra-Trees: Fitting ----------------------------------------- Score on the train set --> f2_score: 1.0000 Score on the test set --> f2_score: 0.5474 Time elapsed: 0.191s Bagging ----------------------------------------- Score --> f2_score: 0.6027 \u00b1 0.0190 Time elapsed: 0.843s ------------------------------------------------- Total time: 1.038s Results for Random Forest: Fitting ----------------------------------------- Score on the train set --> f2_score: 1.0000 Score on the test set --> f2_score: 0.5959 Time elapsed: 0.295s Bagging ----------------------------------------- Score --> f2_score: 0.6087 \u00b1 0.0113 Time elapsed: 1.291s ------------------------------------------------- Total time: 1.589s Final results ========================= >> Duration: 2.627s ------------------------------------------ Extra-Trees --> f2_score: 0.603 \u00b1 0.019 ~ Random Forest --> f2_score: 0.609 \u00b1 0.011 ~ !","title":"Run the pipeline"},{"location":"examples/binary_classification/binary_classification/#analyze-the-results","text":"# Let's have a look at the final scoring atom.scoring() # The winning model is indicated with a ! 
and can be accessed through the winner attribute # The ~ indicates that the model is probably overfitting. If we look at the train and test # score we see a difference of more than 20% print(f'\\n\\nAnd the winner is the {atom.winner.longname} model!!') print('Score on the training set: ', atom.winner.metric_train) print('Score on the test set: ', atom.winner.metric_test) Results ===================== >> Extra-Trees --> f2_score: 0.603 \u00b1 0.019 ~ Random Forest --> f2_score: 0.609 \u00b1 0.011 ~ And the winner is the Random Forest model!! Score on the training set: 1.0 Score on the test set: 0.5958781362007168 We can make many plots to check the performance of the models # The probabilties plot shows the distribution of predicted # probabilities for the positive class atom.winner.plot_probabilities() # The threshold plot let us compare how different metrics # perform for different thresholds atom.winner.plot_threshold(metric=['f1', 'accuracy', 'average_precision'], steps=50, filename='thresholds.png') # The ROC and PRC curve are also typical ways of measuring performance atom.plot_roc(title=\"ROC for the LightGBM vs CatBoost model\") atom.plot_prc(title=\"PRC comparison of the models\")","title":"Analyze the results"},{"location":"examples/calibration/calibration/","text":"Calibration This example shows us how to use the calibration method to calibrate a classifier. The data used is a variation on the Australian weather dataset from https://www.kaggle.com/jsphyg/weather-dataset-rattle-package . The goal of this dataset is to predict whether or not it will rain tomorrow training a binay classifier on target RainTomorrow . 
Load the data # Import packages import pandas as pd from atom import ATOMClassifier # Get the dataset's features and targets X = pd.read_csv('./datasets/weatherAUS.csv') # Let's have a look at a subset of the data X.sample(frac=1).iloc[:5, :8] .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } Location MinTemp MaxTemp Rainfall Evaporation Sunshine WindGustDir WindGustSpeed 40667 Williamtown 10.0 20.4 0.0 5.4 NaN NW 48.0 43490 Wollongong 15.0 22.0 0.4 NaN NaN SSW 59.0 102419 Nuriootpa 2.6 23.9 0.0 8.0 12.8 ESE 35.0 123437 SalmonGums 3.4 18.0 0.0 NaN NaN WSW 33.0 18121 NorahHead 16.5 22.3 0.0 NaN NaN S 46.0 Run the pipeline # Initialize the ATOM class atom = ATOMClassifier(X, y='RainTomorrow', n_rows=1e4, verbose=1, warnings='ignore', random_state=1) # Handle missing values and categorical columns in the dataset atom.impute(strat_num='median', strat_cat='most_frequent') atom.encode(strategy='target', max_onehot=5, frac_to_other=0.05) # Fit a linear SVM to the data atom.run('lsvm') << ================== ATOM ================== >> Algorithm task: binary classification. Applying data cleaning... Dataset stats ================= >> Shape: (10000, 22) Missing values: 22613 Categorical columns: 5 Scaled: False ---------------------------------- Train set size: 8000 Test set size: 2000 Fitting Imputer... Imputing missing values... Fitting Encoder... Encoding categorical columns... 
Running pipeline ============================= >> Models in pipeline: lSVM Metric: f1 Results for Linear SVM: Fitting ----------------------------------------- Score on the train set --> f1: 0.5639 Score on the test set --> f1: 0.5929 Time elapsed: 0.444s ------------------------------------------------- Total time: 0.444s Final results ========================= >> Duration: 0.444s ------------------------------------------ Linear SVM --> f1: 0.593 Analyze the results # Check our model's calibration atom.plot_calibration() # Let's try to improve it using the calibrate method atom.calibrate(method='isotonic', cv=5) atom.plot_calibration()","title":"Calibration"},{"location":"examples/calibration/calibration/#calibration","text":"This example shows us how to use the calibration method to calibrate a classifier. The data used is a variation on the Australian weather dataset from https://www.kaggle.com/jsphyg/weather-dataset-rattle-package . The goal of this dataset is to predict whether or not it will rain tomorrow training a binay classifier on target RainTomorrow .","title":"Calibration"},{"location":"examples/calibration/calibration/#load-the-data","text":"# Import packages import pandas as pd from atom import ATOMClassifier # Get the dataset's features and targets X = pd.read_csv('./datasets/weatherAUS.csv') # Let's have a look at a subset of the data X.sample(frac=1).iloc[:5, :8] .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } Location MinTemp MaxTemp Rainfall Evaporation Sunshine WindGustDir WindGustSpeed 40667 Williamtown 10.0 20.4 0.0 5.4 NaN NW 48.0 43490 Wollongong 15.0 22.0 0.4 NaN NaN SSW 59.0 102419 Nuriootpa 2.6 23.9 0.0 8.0 12.8 ESE 35.0 123437 SalmonGums 3.4 18.0 0.0 NaN NaN WSW 33.0 18121 NorahHead 16.5 22.3 0.0 NaN NaN S 46.0","title":"Load the data"},{"location":"examples/calibration/calibration/#run-the-pipeline","text":"# Initialize the ATOM 
class atom = ATOMClassifier(X, y='RainTomorrow', n_rows=1e4, verbose=1, warnings='ignore', random_state=1) # Handle missing values and categorical columns in the dataset atom.impute(strat_num='median', strat_cat='most_frequent') atom.encode(strategy='target', max_onehot=5, frac_to_other=0.05) # Fit a linear SVM to the data atom.run('lsvm') << ================== ATOM ================== >> Algorithm task: binary classification. Applying data cleaning... Dataset stats ================= >> Shape: (10000, 22) Missing values: 22613 Categorical columns: 5 Scaled: False ---------------------------------- Train set size: 8000 Test set size: 2000 Fitting Imputer... Imputing missing values... Fitting Encoder... Encoding categorical columns... Running pipeline ============================= >> Models in pipeline: lSVM Metric: f1 Results for Linear SVM: Fitting ----------------------------------------- Score on the train set --> f1: 0.5639 Score on the test set --> f1: 0.5929 Time elapsed: 0.444s ------------------------------------------------- Total time: 0.444s Final results ========================= >> Duration: 0.444s ------------------------------------------ Linear SVM --> f1: 0.593","title":"Run the pipeline"},{"location":"examples/calibration/calibration/#analyze-the-results","text":"# Check our model's calibration atom.plot_calibration() # Let's try to improve it using the calibrate method atom.calibrate(method='isotonic', cv=5) atom.plot_calibration()","title":"Analyze the results"},{"location":"examples/early_stopping/early_stopping/","text":"Early stopping This example shows how we can use early stopping to reduce the time it takes to run the pipeline. This option is only available for models that allow in-training evaluation (XGBoost, LightGBM and CatBoost). Import the breast cancer dataset from sklearn.datasets . This is a small and easy to train dataset whose goal is to predict whether a patient has breast cancer or not. 
Load the data # Import packages from sklearn.datasets import load_breast_cancer from atom import ATOMClassifier # Get the dataset's features and targets X, y = load_breast_cancer(return_X_y=True) Run the pipeline # Start ATOM and fit the models using early stopping # An early stopping of 0.1 means that the model will stop if it # didn't improve in the last 10% of it's iterations. atom = ATOMClassifier(X, y, n_jobs=2, verbose=2, warnings=False, random_state=1) atom.run('LGB', metric='ap', n_calls=7, n_initial_points=3, bo_params={'early_stopping': 0.1, 'cv': 1}) << ================== ATOM ================== >> Algorithm task: binary classification. Parallel processing with 2 cores. Applying data cleaning... Dataset stats ================= >> Shape: (569, 31) Scaled: False ---------------------------------- Train set size: 456 Test set size: 113 ---------------------------------- Train set balance: 0:1 <==> 0.6:1.0 Test set balance: 0:1 <==> 0.7:1.0 ---------------------------------- Instances in target per class: | | total | train_set | test_set | |---:|---------:|-------------:|------------:| | 0 | 212 | 167 | 45 | | 1 | 357 | 289 | 68 | Running pipeline ============================= >> Models in pipeline: LGB Metric: average_precision Running BO for LightGBM... Random start 1 ---------------------------------- Parameters --> {'n_estimators': 499, 'learning_rate': 0.73, 'max_depth': 2, 'num_leaves': 40, 'min_child_weight': 5, 'min_child_samples': 18, 'subsample': 0.7, 'colsample_bytree': 0.8, 'reg_alpha': 100.0, 'reg_lambda': 10.0} Early stop at iteration 50 of 499. 
Evaluation --> average_precision: 0.6304 Best average_precision: 0.6304 Time iteration: 0.031s Total time: 0.047s Random start 2 ---------------------------------- Parameters --> {'n_estimators': 170, 'learning_rate': 0.11, 'max_depth': 5, 'num_leaves': 25, 'min_child_weight': 11, 'min_child_samples': 28, 'subsample': 0.7, 'colsample_bytree': 0.6, 'reg_alpha': 100.0, 'reg_lambda': 10.0} Early stop at iteration 18 of 170. Evaluation --> average_precision: 0.6304 Best average_precision: 0.6304 Time iteration: 0.028s Total time: 0.075s Random start 3 ---------------------------------- Parameters --> {'n_estimators': 364, 'learning_rate': 0.4, 'max_depth': 2, 'num_leaves': 30, 'min_child_weight': 17, 'min_child_samples': 27, 'subsample': 0.9, 'colsample_bytree': 0.5, 'reg_alpha': 0.0, 'reg_lambda': 1.0} Early stop at iteration 42 of 364. Evaluation --> average_precision: 0.9819 Best average_precision: 0.9819 Time iteration: 0.020s Total time: 0.099s Iteration 4 ------------------------------------- Parameters --> {'n_estimators': 238, 'learning_rate': 0.49, 'max_depth': 3, 'num_leaves': 29, 'min_child_weight': 18, 'min_child_samples': 25, 'subsample': 0.9, 'colsample_bytree': 0.4, 'reg_alpha': 0.0, 'reg_lambda': 10.0} Early stop at iteration 30 of 238. 
Evaluation --> average_precision: 0.9911 Best average_precision: 0.9911 Time iteration: 0.016s Total time: 1.343s Iteration 5 ------------------------------------- Parameters --> {'n_estimators': 31, 'learning_rate': 0.07, 'max_depth': 6, 'num_leaves': 21, 'min_child_weight': 18, 'min_child_samples': 28, 'subsample': 0.9, 'colsample_bytree': 0.5, 'reg_alpha': 0.0, 'reg_lambda': 100.0} Evaluation --> average_precision: 0.9920 Best average_precision: 0.9920 Time iteration: 0.016s Total time: 1.762s Iteration 6 ------------------------------------- Parameters --> {'n_estimators': 20, 'learning_rate': 1.0, 'max_depth': 3, 'num_leaves': 40, 'min_child_weight': 20, 'min_child_samples': 10, 'subsample': 0.8, 'colsample_bytree': 0.3, 'reg_alpha': 0.0, 'reg_lambda': 100.0} Early stop at iteration 12 of 20. Evaluation --> average_precision: 0.9953 Best average_precision: 0.9953 Time iteration: 0.016s Total time: 2.178s Iteration 7 ------------------------------------- Parameters --> {'n_estimators': 69, 'learning_rate': 0.17, 'max_depth': 7, 'num_leaves': 26, 'min_child_weight': 17, 'min_child_samples': 14, 'subsample': 0.5, 'colsample_bytree': 0.8, 'reg_alpha': 0.01, 'reg_lambda': 1.0} Early stop at iteration 22 of 69. Evaluation --> average_precision: 0.9978 Best average_precision: 0.9978 Time iteration: 0.016s Total time: 2.499s Results for LightGBM: Bayesian Optimization --------------------------- Best parameters --> {'n_estimators': 69, 'learning_rate': 0.17, 'max_depth': 7, 'num_leaves': 26, 'min_child_weight': 17, 'min_child_samples': 14, 'subsample': 0.5, 'colsample_bytree': 0.8, 'reg_alpha': 0.01, 'reg_lambda': 1.0} Best evaluation --> average_precision: 0.9978 Time elapsed: 2.912s Fitting ----------------------------------------- Early stop at iteration 27 of 69. 
Score on the train set --> average_precision: 0.9962 Score on the test set --> average_precision: 0.9712 Time elapsed: 0.016s ------------------------------------------------- Total time: 2.928s Final results ========================= >> Duration: 2.928s ------------------------------------------ LightGBM --> average_precision: 0.971 Analyze the results # For these models, we can plot the evaluation on the train and test set during training # Note that the metric is provided by the model's library, not ATOM! atom.lgb.plot_evals(title=\"LightGBM's evaluation curve\", figsize=(11, 9))","title":"Early stopping"},{"location":"examples/early_stopping/early_stopping/#early-stopping","text":"This example shows how we can use early stopping to reduce the time it takes to run the pipeline. This option is only available for models that allow in-training evaluation (XGBoost, LightGBM and CatBoost). Import the breast cancer dataset from sklearn.datasets . This is a small and easy to train dataset whose goal is to predict whether a patient has breast cancer or not.","title":"Early stopping"},{"location":"examples/early_stopping/early_stopping/#load-the-data","text":"# Import packages from sklearn.datasets import load_breast_cancer from atom import ATOMClassifier # Get the dataset's features and targets X, y = load_breast_cancer(return_X_y=True)","title":"Load the data"},{"location":"examples/early_stopping/early_stopping/#run-the-pipeline","text":"# Start ATOM and fit the models using early stopping # An early stopping of 0.1 means that the model will stop if it # didn't improve in the last 10% of it's iterations. atom = ATOMClassifier(X, y, n_jobs=2, verbose=2, warnings=False, random_state=1) atom.run('LGB', metric='ap', n_calls=7, n_initial_points=3, bo_params={'early_stopping': 0.1, 'cv': 1}) << ================== ATOM ================== >> Algorithm task: binary classification. Parallel processing with 2 cores. Applying data cleaning... 
Dataset stats ================= >> Shape: (569, 31) Scaled: False ---------------------------------- Train set size: 456 Test set size: 113 ---------------------------------- Train set balance: 0:1 <==> 0.6:1.0 Test set balance: 0:1 <==> 0.7:1.0 ---------------------------------- Instances in target per class: | | total | train_set | test_set | |---:|---------:|-------------:|------------:| | 0 | 212 | 167 | 45 | | 1 | 357 | 289 | 68 | Running pipeline ============================= >> Models in pipeline: LGB Metric: average_precision Running BO for LightGBM... Random start 1 ---------------------------------- Parameters --> {'n_estimators': 499, 'learning_rate': 0.73, 'max_depth': 2, 'num_leaves': 40, 'min_child_weight': 5, 'min_child_samples': 18, 'subsample': 0.7, 'colsample_bytree': 0.8, 'reg_alpha': 100.0, 'reg_lambda': 10.0} Early stop at iteration 50 of 499. Evaluation --> average_precision: 0.6304 Best average_precision: 0.6304 Time iteration: 0.031s Total time: 0.047s Random start 2 ---------------------------------- Parameters --> {'n_estimators': 170, 'learning_rate': 0.11, 'max_depth': 5, 'num_leaves': 25, 'min_child_weight': 11, 'min_child_samples': 28, 'subsample': 0.7, 'colsample_bytree': 0.6, 'reg_alpha': 100.0, 'reg_lambda': 10.0} Early stop at iteration 18 of 170. Evaluation --> average_precision: 0.6304 Best average_precision: 0.6304 Time iteration: 0.028s Total time: 0.075s Random start 3 ---------------------------------- Parameters --> {'n_estimators': 364, 'learning_rate': 0.4, 'max_depth': 2, 'num_leaves': 30, 'min_child_weight': 17, 'min_child_samples': 27, 'subsample': 0.9, 'colsample_bytree': 0.5, 'reg_alpha': 0.0, 'reg_lambda': 1.0} Early stop at iteration 42 of 364. 
Evaluation --> average_precision: 0.9819 Best average_precision: 0.9819 Time iteration: 0.020s Total time: 0.099s Iteration 4 ------------------------------------- Parameters --> {'n_estimators': 238, 'learning_rate': 0.49, 'max_depth': 3, 'num_leaves': 29, 'min_child_weight': 18, 'min_child_samples': 25, 'subsample': 0.9, 'colsample_bytree': 0.4, 'reg_alpha': 0.0, 'reg_lambda': 10.0} Early stop at iteration 30 of 238. Evaluation --> average_precision: 0.9911 Best average_precision: 0.9911 Time iteration: 0.016s Total time: 1.343s Iteration 5 ------------------------------------- Parameters --> {'n_estimators': 31, 'learning_rate': 0.07, 'max_depth': 6, 'num_leaves': 21, 'min_child_weight': 18, 'min_child_samples': 28, 'subsample': 0.9, 'colsample_bytree': 0.5, 'reg_alpha': 0.0, 'reg_lambda': 100.0} Evaluation --> average_precision: 0.9920 Best average_precision: 0.9920 Time iteration: 0.016s Total time: 1.762s Iteration 6 ------------------------------------- Parameters --> {'n_estimators': 20, 'learning_rate': 1.0, 'max_depth': 3, 'num_leaves': 40, 'min_child_weight': 20, 'min_child_samples': 10, 'subsample': 0.8, 'colsample_bytree': 0.3, 'reg_alpha': 0.0, 'reg_lambda': 100.0} Early stop at iteration 12 of 20. Evaluation --> average_precision: 0.9953 Best average_precision: 0.9953 Time iteration: 0.016s Total time: 2.178s Iteration 7 ------------------------------------- Parameters --> {'n_estimators': 69, 'learning_rate': 0.17, 'max_depth': 7, 'num_leaves': 26, 'min_child_weight': 17, 'min_child_samples': 14, 'subsample': 0.5, 'colsample_bytree': 0.8, 'reg_alpha': 0.01, 'reg_lambda': 1.0} Early stop at iteration 22 of 69. 
Evaluation --> average_precision: 0.9978 Best average_precision: 0.9978 Time iteration: 0.016s Total time: 2.499s Results for LightGBM: Bayesian Optimization --------------------------- Best parameters --> {'n_estimators': 69, 'learning_rate': 0.17, 'max_depth': 7, 'num_leaves': 26, 'min_child_weight': 17, 'min_child_samples': 14, 'subsample': 0.5, 'colsample_bytree': 0.8, 'reg_alpha': 0.01, 'reg_lambda': 1.0} Best evaluation --> average_precision: 0.9978 Time elapsed: 2.912s Fitting ----------------------------------------- Early stop at iteration 27 of 69. Score on the train set --> average_precision: 0.9962 Score on the test set --> average_precision: 0.9712 Time elapsed: 0.016s ------------------------------------------------- Total time: 2.928s Final results ========================= >> Duration: 2.928s ------------------------------------------ LightGBM --> average_precision: 0.971","title":"Run the pipeline"},{"location":"examples/early_stopping/early_stopping/#analyze-the-results","text":"# For these models, we can plot the evaluation on the train and test set during training # Note that the metric is provided by the model's library, not ATOM! atom.lgb.plot_evals(title=\"LightGBM's evaluation curve\", figsize=(11, 9))","title":"Analyze the results"},{"location":"examples/feature_engineering/feature_engineering/","text":"Feature engineering This example shows how to use automated feature generation to improve your model's performance. The data used is a variation on the Australian weather dataset from https://www.kaggle.com/jsphyg/weather-dataset-rattle-package . The goal of this dataset is to predict whether or not it will rain tomorrow training a binay classifier on target RainTomorrow . 
Load the data # Import packages import pandas as pd from atom import ATOMClassifier # Load data X = pd.read_csv('./datasets/weatherAUS.csv') # Let's have a look at a subset of the data X.sample(frac=1).iloc[:5, :8] .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } Location MinTemp MaxTemp Rainfall Evaporation Sunshine WindGustDir WindGustSpeed 36171 WaggaWagga 14.3 21.4 0.8 10.6 5.8 W 52.0 44425 Canberra 16.0 22.8 0.0 12.4 6.0 E 50.0 126238 Walpole 13.8 20.7 4.8 NaN NaN NW 33.0 54550 Ballarat 3.3 14.7 0.0 NaN NaN N 46.0 85638 Cairns 23.5 31.5 43.8 0.8 8.5 SSE 52.0 Run the pipeline # Initiate ATOM and apply data cleaning atom = ATOMClassifier(X, n_rows=1e4, test_size=0.2, verbose=0, random_state=1) atom.impute(strat_num='knn', strat_cat='remove', min_frac_rows=0.8) atom.encode(max_onehot=10, frac_to_other=0.04) # Let's see how a LightGBM model performs without adding additional features atom.run('LGB', metric='auc') atom.scoring() is_categorical is deprecated and will be removed in a future version. Use is_categorical_dtype instead Results ===================== >> LightGBM --> roc_auc: 0.878 # What are the most important fetaures? atom.plot_feature_importance(show=10) Now let's create some new fetaures using Deep Feature Synthesis atom.verbose = 2 # Increase verbosity to see the output # Create 100 new features using DFS atom.feature_generation(strategy='dfs', n_features=100, operators=['add', 'sub', 'log', 'sqrt']) Fitting FeatureGenerator... Creating new features... --> 100 new features were added to the dataset. divide by zero encountered in log invalid value encountered in log # The warnings warn us that some operators created missing values! 
# We can see the columns with missing values using the missing attribute atom.missing # We can easily turn off warnings in the future atom.warnings = False # We can use the impute method again atom.impute(strat_num='knn', strat_cat='remove', min_frac_rows=0.8) Fitting Imputer... Imputing missing values... --> Imputing 577 missing values using the KNN imputer in feature LOG(Cloud9am). --> Dropping feature LOG(RainToday_other) for containing 8873 (99%) missing values. --> Imputing 148 missing values using the KNN imputer in feature LOG(Sunshine). --> Imputing 6 missing values using the KNN imputer in feature LOG(Temp9am). --> Imputing 33 missing values using the KNN imputer in feature LOG(WindSpeed3pm). # 100 new features may be to much... # Let's check for multicollinearity and use RFECV to reduce the number even further atom.feature_selection(strategy='RFECV', solver='lgb', n_features=30, scoring='auc', max_correlation=0.98) Fitting FeatureSelector... Performing feature selection ... --> Feature Location was removed due to low variance. Value 0.2077375946173255 repeated in 100% of the rows. --> Feature Cloud3pm + Humidity3pm was removed due to collinearity with another feature. --> Feature Cloud3pm + RainToday_No was removed due to collinearity with another feature. --> Feature Cloud3pm + WindDir9am was removed due to collinearity with another feature. --> Feature Cloud3pm - Location was removed due to collinearity with another feature. --> Feature Cloud3pm - RainToday_No was removed due to collinearity with another feature. --> Feature Cloud9am + WindGustDir was removed due to collinearity with another feature. --> Feature Evaporation + Location was removed due to collinearity with another feature. --> Feature Evaporation + WindGustDir was removed due to collinearity with another feature. --> Feature Evaporation - WindDir3pm was removed due to collinearity with another feature. 
--> Feature Humidity3pm - RainToday_No was removed due to collinearity with another feature. --> Feature Humidity3pm - Sunshine was removed due to collinearity with another feature. --> Feature Humidity9am + RainToday_Yes was removed due to collinearity with another feature. --> Feature Humidity9am - RainToday_No was removed due to collinearity with another feature. --> Feature Humidity9am - Sunshine was removed due to collinearity with another feature. --> Feature LOG(MaxTemp) was removed due to collinearity with another feature. --> Feature Location + MinTemp was removed due to collinearity with another feature. --> Feature Location + RainToday_No was removed due to collinearity with another feature. --> Feature Location + WindDir3pm was removed due to collinearity with another feature. --> Feature Location + WindGustDir was removed due to collinearity with another feature. --> Feature Location + WindSpeed3pm was removed due to collinearity with another feature. --> Feature Location - RainToday_Yes was removed due to collinearity with another feature. --> Feature MaxTemp + RainToday_No was removed due to collinearity with another feature. --> Feature MaxTemp + RainToday_Yes was removed due to collinearity with another feature. --> Feature MinTemp + WindGustDir was removed due to collinearity with another feature. --> Feature Pressure3pm + RainToday_other was removed due to collinearity with another feature. --> Feature Pressure3pm + Temp3pm was removed due to collinearity with another feature. --> Feature Pressure3pm - WindGustDir was removed due to collinearity with another feature. --> Feature Pressure9am - WindGustDir was removed due to collinearity with another feature. --> Feature RainToday_No + Temp9am was removed due to collinearity with another feature. --> Feature RainToday_No + WindGustDir was removed due to collinearity with another feature. --> Feature RainToday_No - WindDir9am was removed due to collinearity with another feature. 
--> Feature RainToday_Yes + Temp9am was removed due to collinearity with another feature. --> Feature RainToday_Yes + WindDir3pm was removed due to collinearity with another feature. --> Feature RainToday_Yes + WindDir9am was removed due to collinearity with another feature. --> Feature RainToday_Yes - WindDir9am was removed due to collinearity with another feature. --> Feature RainToday_other - Temp9am was removed due to collinearity with another feature. --> Feature RainToday_other - WindGustSpeed was removed due to collinearity with another feature. --> Feature RainToday_other - WindSpeed9am was removed due to collinearity with another feature. --> Feature Rainfall + RainToday_No was removed due to collinearity with another feature. --> Feature Rainfall + WindDir9am was removed due to collinearity with another feature. --> Feature Rainfall - WindDir3pm was removed due to collinearity with another feature. --> Feature SQRT(Humidity3pm) was removed due to collinearity with another feature. --> Feature SQRT(Pressure9am) was removed due to collinearity with another feature. --> Feature Sunshine + WindDir9am was removed due to collinearity with another feature. --> Feature Temp3pm + WindDir9am was removed due to collinearity with another feature. --> Feature Temp3pm + WindGustDir was removed due to collinearity with another feature. --> Feature Temp3pm - WindDir3pm was removed due to collinearity with another feature. --> Feature Temp9am - WindDir9am was removed due to collinearity with another feature. --> Feature WindDir3pm - WindSpeed3pm was removed due to collinearity with another feature. --> Feature WindGustDir + WindGustSpeed was removed due to collinearity with another feature. --> Feature WindGustDir - WindSpeed9am was removed due to collinearity with another feature. --> The RFECV selected 64 features from the dataset. >>> Dropping feature RainToday_Yes (rank 3). >>> Dropping feature RainToday_No (rank 5). 
>>> Dropping feature Location - WindSpeed9am (rank 2). >>> Dropping feature SQRT(Cloud9am) (rank 7). >>> Dropping feature SQRT(Rainfall) (rank 6). >>> Dropping feature SQRT(WindSpeed9am) (rank 4). # The collinear attribute shows what features were removed due to multicollinearity atom.collinear .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } drop_feature correlated_feature correlation_value 0 Cloud3pm + Humidity3pm Humidity3pm 0.99578 1 Cloud3pm + RainToday_No Cloud3pm 0.98122 2 Cloud3pm + WindDir9am Cloud3pm, Cloud3pm + RainToday_No 0.99968, 0.98054 3 Cloud3pm - Location Cloud3pm, Cloud3pm + RainToday_No, Cloud3pm + ... 1.0, 0.98122, 0.99968 4 Cloud3pm - RainToday_No Cloud3pm, Cloud3pm + WindDir9am, Cloud3pm - Lo... 0.98405, 0.98408, 0.98405 5 Cloud9am + WindGustDir Cloud9am 0.99979 6 Evaporation + Location Evaporation 1.0 7 Evaporation + WindGustDir Evaporation, Evaporation + Location 0.9999, 0.9999 8 Evaporation - WindDir3pm Evaporation, Evaporation + Location, Evaporati... 0.9999, 0.9999, 0.99969 9 Humidity3pm - RainToday_No Humidity3pm, Cloud3pm + Humidity3pm 0.99983, 0.99572 10 Humidity3pm - Sunshine Humidity3pm, Cloud3pm + Humidity3pm, Humidity3... 0.99347, 0.99405, 0.9935 11 Humidity9am + RainToday_Yes Humidity9am 0.9998 12 Humidity9am - RainToday_No Humidity9am, Humidity9am + RainToday_Yes 0.9998, 0.99999 13 Humidity9am - Sunshine Humidity9am, Humidity9am + RainToday_Yes, Humi... 0.99165, 0.99183, 0.99184 14 LOG(MaxTemp) MaxTemp 0.98395 15 Location + MinTemp MinTemp 1.0 16 Location + RainToday_No RainToday_Yes, RainToday_No -0.98403, 1.0 17 Location + WindDir3pm WindDir3pm 1.0 18 Location + WindGustDir WindGustDir 1.0 19 Location + WindSpeed3pm WindSpeed3pm 1.0 20 Location - RainToday_Yes RainToday_Yes, RainToday_No, Location + RainTo... 
-1.0, 0.98403, 0.98403 21 MaxTemp + RainToday_No MaxTemp, LOG(MaxTemp) 0.99841, 0.9831 22 MaxTemp + RainToday_Yes MaxTemp, LOG(MaxTemp), MaxTemp + RainToday_No 0.99834, 0.98156, 0.99356 23 MinTemp + WindGustDir MinTemp, Location + MinTemp 0.99997, 0.99997 24 Pressure3pm + RainToday_other Pressure3pm 0.99995 25 Pressure3pm + Temp3pm MaxTemp + Pressure3pm 0.98005 26 Pressure3pm - WindGustDir Pressure3pm, Pressure3pm + RainToday_other 0.99998, 0.99992 27 Pressure9am - WindGustDir Pressure9am 0.99998 28 RainToday_No + Temp9am Temp9am 0.99797 29 RainToday_No + WindGustDir RainToday_No, Location + RainToday_No 0.9933, 0.9933 30 RainToday_No - WindDir9am RainToday_No, Location + RainToday_No 0.99169, 0.99169 31 RainToday_Yes + Temp9am Temp9am, RainToday_No + Temp9am, RainToday_No ... 0.99795, 0.99191, -0.99993 32 RainToday_Yes + WindDir3pm RainToday_Yes, Location - RainToday_Yes 0.99334, -0.99334 33 RainToday_Yes + WindDir9am RainToday_Yes, Location - RainToday_Yes, RainT... 0.99154, -0.99154, -0.9847, 0.98993 34 RainToday_Yes - WindDir9am RainToday_Yes, Location - RainToday_Yes 0.9911, -0.9911 35 RainToday_other - Temp9am Temp9am, RainToday_No + Temp9am, RainToday_No ... -0.99993, -0.998, 0.99775, -0.99792 36 RainToday_other - WindGustSpeed WindGustSpeed, Cloud9am - WindGustSpeed -0.99998, 0.98438 37 RainToday_other - WindSpeed9am WindSpeed9am, Location - WindSpeed9am -0.99997, 0.99997 38 Rainfall + RainToday_No Rainfall 0.99907 39 Rainfall + WindDir9am Rainfall, Rainfall + RainToday_No 0.99998, 0.99902 40 Rainfall - WindDir3pm Rainfall, Rainfall + RainToday_No, Rainfall + ... 0.99998, 0.99907, 0.99995 41 SQRT(Humidity3pm) Humidity3pm, Cloud3pm + Humidity3pm, Humidity3... 
0.98722, 0.98193, 0.98674 42 SQRT(Pressure9am) Pressure9am, Pressure9am - WindGustDir 1.0, 0.99998 43 Sunshine + WindDir9am Sunshine, RainToday_other - Sunshine 0.99982, -0.99948 44 Temp3pm + WindDir9am Temp3pm 0.99997 45 Temp3pm + WindGustDir Temp3pm, Temp3pm + WindDir9am 0.99998, 0.99997 46 Temp3pm - WindDir3pm Temp3pm, Temp3pm + WindDir9am, Temp3pm + WindG... 0.99998, 0.99993, 0.99993 47 Temp9am - WindDir9am Temp9am, RainToday_No + Temp9am, RainToday_No ... 0.99996, 0.99798, -0.99783, 0.99787, -0.9999 48 WindDir3pm - WindSpeed3pm WindSpeed3pm, Location + WindSpeed3pm, Locatio... -0.99998, -0.99998, 0.99998 49 WindGustDir + WindGustSpeed WindGustSpeed, Cloud9am - WindGustSpeed, RainT... 0.99999, -0.9843, -0.99998 50 WindGustDir - WindSpeed9am WindSpeed9am, Location - WindSpeed9am, RainTod... -0.99999, 0.99999, 0.99995 # After applying RFECV, we can plot the score per number of features atom.plot_rfecv() # Let's see how the model performs now atom.run('LGB') Running pipeline ============================= >> Models in pipeline: LGB Metric: roc_auc Results for LightGBM: Fitting ----------------------------------------- Score on the train set --> roc_auc: 0.9962 Score on the test set --> roc_auc: 0.8787 Time elapsed: 0.708s ------------------------------------------------- Total time: 0.722s Final results ========================= >> Duration: 0.723s ------------------------------------------ LightGBM --> roc_auc: 0.879 # Did the feature importance change? 
atom.plot_feature_importance(show=10) Lets try the same using Genetic Feature Generation atom = ATOMClassifier(X, n_rows=1e4, test_size=0.2, verbose=0, warnings=False, random_state=1) atom.impute(strat_num='knn', strat_cat='remove', min_frac_rows=0.8) atom.encode(max_onehot=10, frac_to_other=0.04) # Change verbosity to print extended info atom.verbose = 2 # Create new features using Genetic Programming atom.feature_generation(strategy='genetic', n_features=20, generations=10, population=2000) Fitting FeatureGenerator... | Population Average | Best Individual | ---- ------------------------- ------------------------------------------ ---------- Gen Length Fitness Length Fitness OOB Fitness Time Left 0 3.17 0.127531 3 0.50405 N/A 9.52s 1 3.10 0.338627 5 0.536586 N/A 9.04s 2 3.50 0.443734 9 0.541692 N/A 7.65s 3 4.44 0.47684 7 0.54494 N/A 6.89s 4 6.25 0.512037 13 0.546193 N/A 5.76s 5 7.47 0.507736 9 0.550266 N/A 4.62s 6 7.73 0.500405 11 0.55324 N/A 3.56s 7 7.99 0.497944 11 0.553398 N/A 2.38s 8 9.29 0.494223 13 0.554965 N/A 1.29s 9 10.68 0.493684 11 0.553398 N/A 0.00s Creating new features... --> 5 new features were added to the dataset. # We can see the feature's fitness and description through the genetic_features attribute atom.genetic_features .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } name description fitness 0 Feature 24 mul(sub(sub(sub(Humidity3pm, Sunshine), Sunshi... 0.542398 1 Feature 25 mul(sub(sub(Humidity3pm, Sunshine), Sunshine),... 0.542240 2 Feature 26 mul(sub(Humidity3pm, Sunshine), mul(sub(sub(Hu... 0.542240 3 Feature 27 mul(mul(sub(Humidity3pm, Sunshine), WindGustSp... 0.542240 4 Feature 28 mul(mul(sub(sub(Humidity3pm, Sunshine), Sunshi... 
0.542240 # And fit the model again atom.run('LGB', metric='auc') Running pipeline ============================= >> Models in pipeline: LGB Metric: roc_auc Results for LightGBM: Fitting ----------------------------------------- Score on the train set --> roc_auc: 0.9901 Score on the test set --> roc_auc: 0.8793 Time elapsed: 0.305s ------------------------------------------------- Total time: 0.313s Final results ========================= >> Duration: 0.314s ------------------------------------------ LightGBM --> roc_auc: 0.879 atom.plot_feature_importance(show=10) # We can check the feature importance with other plots as well atom.plot_permutation_importance(show=10) atom.dependence_plot()","title":"Feature engineering"},{"location":"examples/feature_engineering/feature_engineering/#feature-engineering","text":"This example shows how to use automated feature generation to improve your model's performance. The data used is a variation on the Australian weather dataset from https://www.kaggle.com/jsphyg/weather-dataset-rattle-package . 
The goal of this dataset is to predict whether or not it will rain tomorrow training a binay classifier on target RainTomorrow .","title":"Feature engineering"},{"location":"examples/feature_engineering/feature_engineering/#load-the-data","text":"# Import packages import pandas as pd from atom import ATOMClassifier # Load data X = pd.read_csv('./datasets/weatherAUS.csv') # Let's have a look at a subset of the data X.sample(frac=1).iloc[:5, :8] .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } Location MinTemp MaxTemp Rainfall Evaporation Sunshine WindGustDir WindGustSpeed 36171 WaggaWagga 14.3 21.4 0.8 10.6 5.8 W 52.0 44425 Canberra 16.0 22.8 0.0 12.4 6.0 E 50.0 126238 Walpole 13.8 20.7 4.8 NaN NaN NW 33.0 54550 Ballarat 3.3 14.7 0.0 NaN NaN N 46.0 85638 Cairns 23.5 31.5 43.8 0.8 8.5 SSE 52.0","title":"Load the data"},{"location":"examples/feature_engineering/feature_engineering/#run-the-pipeline","text":"# Initiate ATOM and apply data cleaning atom = ATOMClassifier(X, n_rows=1e4, test_size=0.2, verbose=0, random_state=1) atom.impute(strat_num='knn', strat_cat='remove', min_frac_rows=0.8) atom.encode(max_onehot=10, frac_to_other=0.04) # Let's see how a LightGBM model performs without adding additional features atom.run('LGB', metric='auc') atom.scoring() is_categorical is deprecated and will be removed in a future version. Use is_categorical_dtype instead Results ===================== >> LightGBM --> roc_auc: 0.878 # What are the most important fetaures? atom.plot_feature_importance(show=10) Now let's create some new fetaures using Deep Feature Synthesis atom.verbose = 2 # Increase verbosity to see the output # Create 100 new features using DFS atom.feature_generation(strategy='dfs', n_features=100, operators=['add', 'sub', 'log', 'sqrt']) Fitting FeatureGenerator... Creating new features... --> 100 new features were added to the dataset. 
divide by zero encountered in log invalid value encountered in log # The warnings warn us that some operators created missing values! # We can see the columns with missing values using the missing attribute atom.missing # We can easily turn off warnings in the future atom.warnings = False # We can use the impute method again atom.impute(strat_num='knn', strat_cat='remove', min_frac_rows=0.8) Fitting Imputer... Imputing missing values... --> Imputing 577 missing values using the KNN imputer in feature LOG(Cloud9am). --> Dropping feature LOG(RainToday_other) for containing 8873 (99%) missing values. --> Imputing 148 missing values using the KNN imputer in feature LOG(Sunshine). --> Imputing 6 missing values using the KNN imputer in feature LOG(Temp9am). --> Imputing 33 missing values using the KNN imputer in feature LOG(WindSpeed3pm). # 100 new features may be to much... # Let's check for multicollinearity and use RFECV to reduce the number even further atom.feature_selection(strategy='RFECV', solver='lgb', n_features=30, scoring='auc', max_correlation=0.98) Fitting FeatureSelector... Performing feature selection ... --> Feature Location was removed due to low variance. Value 0.2077375946173255 repeated in 100% of the rows. --> Feature Cloud3pm + Humidity3pm was removed due to collinearity with another feature. --> Feature Cloud3pm + RainToday_No was removed due to collinearity with another feature. --> Feature Cloud3pm + WindDir9am was removed due to collinearity with another feature. --> Feature Cloud3pm - Location was removed due to collinearity with another feature. --> Feature Cloud3pm - RainToday_No was removed due to collinearity with another feature. --> Feature Cloud9am + WindGustDir was removed due to collinearity with another feature. --> Feature Evaporation + Location was removed due to collinearity with another feature. --> Feature Evaporation + WindGustDir was removed due to collinearity with another feature. 
--> Feature Evaporation - WindDir3pm was removed due to collinearity with another feature. --> Feature Humidity3pm - RainToday_No was removed due to collinearity with another feature. --> Feature Humidity3pm - Sunshine was removed due to collinearity with another feature. --> Feature Humidity9am + RainToday_Yes was removed due to collinearity with another feature. --> Feature Humidity9am - RainToday_No was removed due to collinearity with another feature. --> Feature Humidity9am - Sunshine was removed due to collinearity with another feature. --> Feature LOG(MaxTemp) was removed due to collinearity with another feature. --> Feature Location + MinTemp was removed due to collinearity with another feature. --> Feature Location + RainToday_No was removed due to collinearity with another feature. --> Feature Location + WindDir3pm was removed due to collinearity with another feature. --> Feature Location + WindGustDir was removed due to collinearity with another feature. --> Feature Location + WindSpeed3pm was removed due to collinearity with another feature. --> Feature Location - RainToday_Yes was removed due to collinearity with another feature. --> Feature MaxTemp + RainToday_No was removed due to collinearity with another feature. --> Feature MaxTemp + RainToday_Yes was removed due to collinearity with another feature. --> Feature MinTemp + WindGustDir was removed due to collinearity with another feature. --> Feature Pressure3pm + RainToday_other was removed due to collinearity with another feature. --> Feature Pressure3pm + Temp3pm was removed due to collinearity with another feature. --> Feature Pressure3pm - WindGustDir was removed due to collinearity with another feature. --> Feature Pressure9am - WindGustDir was removed due to collinearity with another feature. --> Feature RainToday_No + Temp9am was removed due to collinearity with another feature. --> Feature RainToday_No + WindGustDir was removed due to collinearity with another feature. 
--> Feature RainToday_No - WindDir9am was removed due to collinearity with another feature. --> Feature RainToday_Yes + Temp9am was removed due to collinearity with another feature. --> Feature RainToday_Yes + WindDir3pm was removed due to collinearity with another feature. --> Feature RainToday_Yes + WindDir9am was removed due to collinearity with another feature. --> Feature RainToday_Yes - WindDir9am was removed due to collinearity with another feature. --> Feature RainToday_other - Temp9am was removed due to collinearity with another feature. --> Feature RainToday_other - WindGustSpeed was removed due to collinearity with another feature. --> Feature RainToday_other - WindSpeed9am was removed due to collinearity with another feature. --> Feature Rainfall + RainToday_No was removed due to collinearity with another feature. --> Feature Rainfall + WindDir9am was removed due to collinearity with another feature. --> Feature Rainfall - WindDir3pm was removed due to collinearity with another feature. --> Feature SQRT(Humidity3pm) was removed due to collinearity with another feature. --> Feature SQRT(Pressure9am) was removed due to collinearity with another feature. --> Feature Sunshine + WindDir9am was removed due to collinearity with another feature. --> Feature Temp3pm + WindDir9am was removed due to collinearity with another feature. --> Feature Temp3pm + WindGustDir was removed due to collinearity with another feature. --> Feature Temp3pm - WindDir3pm was removed due to collinearity with another feature. --> Feature Temp9am - WindDir9am was removed due to collinearity with another feature. --> Feature WindDir3pm - WindSpeed3pm was removed due to collinearity with another feature. --> Feature WindGustDir + WindGustSpeed was removed due to collinearity with another feature. --> Feature WindGustDir - WindSpeed9am was removed due to collinearity with another feature. --> The RFECV selected 64 features from the dataset. >>> Dropping feature RainToday_Yes (rank 3). 
>>> Dropping feature RainToday_No (rank 5). >>> Dropping feature Location - WindSpeed9am (rank 2). >>> Dropping feature SQRT(Cloud9am) (rank 7). >>> Dropping feature SQRT(Rainfall) (rank 6). >>> Dropping feature SQRT(WindSpeed9am) (rank 4). # The collinear attribute shows what features were removed due to multicollinearity atom.collinear .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } drop_feature correlated_feature correlation_value 0 Cloud3pm + Humidity3pm Humidity3pm 0.99578 1 Cloud3pm + RainToday_No Cloud3pm 0.98122 2 Cloud3pm + WindDir9am Cloud3pm, Cloud3pm + RainToday_No 0.99968, 0.98054 3 Cloud3pm - Location Cloud3pm, Cloud3pm + RainToday_No, Cloud3pm + ... 1.0, 0.98122, 0.99968 4 Cloud3pm - RainToday_No Cloud3pm, Cloud3pm + WindDir9am, Cloud3pm - Lo... 0.98405, 0.98408, 0.98405 5 Cloud9am + WindGustDir Cloud9am 0.99979 6 Evaporation + Location Evaporation 1.0 7 Evaporation + WindGustDir Evaporation, Evaporation + Location 0.9999, 0.9999 8 Evaporation - WindDir3pm Evaporation, Evaporation + Location, Evaporati... 0.9999, 0.9999, 0.99969 9 Humidity3pm - RainToday_No Humidity3pm, Cloud3pm + Humidity3pm 0.99983, 0.99572 10 Humidity3pm - Sunshine Humidity3pm, Cloud3pm + Humidity3pm, Humidity3... 0.99347, 0.99405, 0.9935 11 Humidity9am + RainToday_Yes Humidity9am 0.9998 12 Humidity9am - RainToday_No Humidity9am, Humidity9am + RainToday_Yes 0.9998, 0.99999 13 Humidity9am - Sunshine Humidity9am, Humidity9am + RainToday_Yes, Humi... 0.99165, 0.99183, 0.99184 14 LOG(MaxTemp) MaxTemp 0.98395 15 Location + MinTemp MinTemp 1.0 16 Location + RainToday_No RainToday_Yes, RainToday_No -0.98403, 1.0 17 Location + WindDir3pm WindDir3pm 1.0 18 Location + WindGustDir WindGustDir 1.0 19 Location + WindSpeed3pm WindSpeed3pm 1.0 20 Location - RainToday_Yes RainToday_Yes, RainToday_No, Location + RainTo... 
-1.0, 0.98403, 0.98403 21 MaxTemp + RainToday_No MaxTemp, LOG(MaxTemp) 0.99841, 0.9831 22 MaxTemp + RainToday_Yes MaxTemp, LOG(MaxTemp), MaxTemp + RainToday_No 0.99834, 0.98156, 0.99356 23 MinTemp + WindGustDir MinTemp, Location + MinTemp 0.99997, 0.99997 24 Pressure3pm + RainToday_other Pressure3pm 0.99995 25 Pressure3pm + Temp3pm MaxTemp + Pressure3pm 0.98005 26 Pressure3pm - WindGustDir Pressure3pm, Pressure3pm + RainToday_other 0.99998, 0.99992 27 Pressure9am - WindGustDir Pressure9am 0.99998 28 RainToday_No + Temp9am Temp9am 0.99797 29 RainToday_No + WindGustDir RainToday_No, Location + RainToday_No 0.9933, 0.9933 30 RainToday_No - WindDir9am RainToday_No, Location + RainToday_No 0.99169, 0.99169 31 RainToday_Yes + Temp9am Temp9am, RainToday_No + Temp9am, RainToday_No ... 0.99795, 0.99191, -0.99993 32 RainToday_Yes + WindDir3pm RainToday_Yes, Location - RainToday_Yes 0.99334, -0.99334 33 RainToday_Yes + WindDir9am RainToday_Yes, Location - RainToday_Yes, RainT... 0.99154, -0.99154, -0.9847, 0.98993 34 RainToday_Yes - WindDir9am RainToday_Yes, Location - RainToday_Yes 0.9911, -0.9911 35 RainToday_other - Temp9am Temp9am, RainToday_No + Temp9am, RainToday_No ... -0.99993, -0.998, 0.99775, -0.99792 36 RainToday_other - WindGustSpeed WindGustSpeed, Cloud9am - WindGustSpeed -0.99998, 0.98438 37 RainToday_other - WindSpeed9am WindSpeed9am, Location - WindSpeed9am -0.99997, 0.99997 38 Rainfall + RainToday_No Rainfall 0.99907 39 Rainfall + WindDir9am Rainfall, Rainfall + RainToday_No 0.99998, 0.99902 40 Rainfall - WindDir3pm Rainfall, Rainfall + RainToday_No, Rainfall + ... 0.99998, 0.99907, 0.99995 41 SQRT(Humidity3pm) Humidity3pm, Cloud3pm + Humidity3pm, Humidity3... 
0.98722, 0.98193, 0.98674 42 SQRT(Pressure9am) Pressure9am, Pressure9am - WindGustDir 1.0, 0.99998 43 Sunshine + WindDir9am Sunshine, RainToday_other - Sunshine 0.99982, -0.99948 44 Temp3pm + WindDir9am Temp3pm 0.99997 45 Temp3pm + WindGustDir Temp3pm, Temp3pm + WindDir9am 0.99998, 0.99997 46 Temp3pm - WindDir3pm Temp3pm, Temp3pm + WindDir9am, Temp3pm + WindG... 0.99998, 0.99993, 0.99993 47 Temp9am - WindDir9am Temp9am, RainToday_No + Temp9am, RainToday_No ... 0.99996, 0.99798, -0.99783, 0.99787, -0.9999 48 WindDir3pm - WindSpeed3pm WindSpeed3pm, Location + WindSpeed3pm, Locatio... -0.99998, -0.99998, 0.99998 49 WindGustDir + WindGustSpeed WindGustSpeed, Cloud9am - WindGustSpeed, RainT... 0.99999, -0.9843, -0.99998 50 WindGustDir - WindSpeed9am WindSpeed9am, Location - WindSpeed9am, RainTod... -0.99999, 0.99999, 0.99995 # After applying RFECV, we can plot the score per number of features atom.plot_rfecv() # Let's see how the model performs now atom.run('LGB') Running pipeline ============================= >> Models in pipeline: LGB Metric: roc_auc Results for LightGBM: Fitting ----------------------------------------- Score on the train set --> roc_auc: 0.9962 Score on the test set --> roc_auc: 0.8787 Time elapsed: 0.708s ------------------------------------------------- Total time: 0.722s Final results ========================= >> Duration: 0.723s ------------------------------------------ LightGBM --> roc_auc: 0.879 # Did the feature importance change? 
atom.plot_feature_importance(show=10) Lets try the same using Genetic Feature Generation atom = ATOMClassifier(X, n_rows=1e4, test_size=0.2, verbose=0, warnings=False, random_state=1) atom.impute(strat_num='knn', strat_cat='remove', min_frac_rows=0.8) atom.encode(max_onehot=10, frac_to_other=0.04) # Change verbosity to print extended info atom.verbose = 2 # Create new features using Genetic Programming atom.feature_generation(strategy='genetic', n_features=20, generations=10, population=2000) Fitting FeatureGenerator... | Population Average | Best Individual | ---- ------------------------- ------------------------------------------ ---------- Gen Length Fitness Length Fitness OOB Fitness Time Left 0 3.17 0.127531 3 0.50405 N/A 9.52s 1 3.10 0.338627 5 0.536586 N/A 9.04s 2 3.50 0.443734 9 0.541692 N/A 7.65s 3 4.44 0.47684 7 0.54494 N/A 6.89s 4 6.25 0.512037 13 0.546193 N/A 5.76s 5 7.47 0.507736 9 0.550266 N/A 4.62s 6 7.73 0.500405 11 0.55324 N/A 3.56s 7 7.99 0.497944 11 0.553398 N/A 2.38s 8 9.29 0.494223 13 0.554965 N/A 1.29s 9 10.68 0.493684 11 0.553398 N/A 0.00s Creating new features... --> 5 new features were added to the dataset. # We can see the feature's fitness and description through the genetic_features attribute atom.genetic_features .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } name description fitness 0 Feature 24 mul(sub(sub(sub(Humidity3pm, Sunshine), Sunshi... 0.542398 1 Feature 25 mul(sub(sub(Humidity3pm, Sunshine), Sunshine),... 0.542240 2 Feature 26 mul(sub(Humidity3pm, Sunshine), mul(sub(sub(Hu... 0.542240 3 Feature 27 mul(mul(sub(Humidity3pm, Sunshine), WindGustSp... 0.542240 4 Feature 28 mul(mul(sub(sub(Humidity3pm, Sunshine), Sunshi... 
0.542240 # And fit the model again atom.run('LGB', metric='auc') Running pipeline ============================= >> Models in pipeline: LGB Metric: roc_auc Results for LightGBM: Fitting ----------------------------------------- Score on the train set --> roc_auc: 0.9901 Score on the test set --> roc_auc: 0.8793 Time elapsed: 0.305s ------------------------------------------------- Total time: 0.313s Final results ========================= >> Duration: 0.314s ------------------------------------------ LightGBM --> roc_auc: 0.879 atom.plot_feature_importance(show=10) # We can check the feature importance with other plots as well atom.plot_permutation_importance(show=10) atom.dependence_plot()","title":"Run the pipeline"},{"location":"examples/multi_metric/multi_metric/","text":"Multi-metric This example shows how we can evaluate an ATOM pipeline on multiple metrics. Import the breast cancer dataset from sklearn.datasets . This is a small and easy to train dataset whose goal is to predict whether a patient has breast cancer or not. Load the data # Import packages from sklearn.datasets import load_breast_cancer from atom import ATOMClassifier # Get the dataset's features and targets X, y = load_breast_cancer(return_X_y=True) Run the pipeline # Call ATOM and run the pipeline using multipe metrics # Note that for every step of the BO, both metrics are calculated, but only the first is used for optimization! atom = ATOMClassifier(X, y, n_jobs=2, verbose=2, warnings=False, random_state=1) atom.run(['MNB', 'QDA'], metric=('f1', 'recall'), n_calls=3, n_initial_points=1, bagging=4) << ================== ATOM ================== >> Algorithm task: binary classification. Parallel processing with 2 cores. Applying data cleaning... 
Dataset stats ================= >> Shape: (569, 31) Scaled: False ---------------------------------- Train set size: 456 Test set size: 113 ---------------------------------- Train set balance: 0:1 <==> 0.6:1.0 Test set balance: 0:1 <==> 0.7:1.0 ---------------------------------- Instances in target per class: | | total | train_set | test_set | |---:|---------:|-------------:|------------:| | 0 | 212 | 167 | 45 | | 1 | 357 | 289 | 68 | Running pipeline ============================= >> Models in pipeline: MNB, QDA Metric: f1, recall Running BO for Multinomial Naive Bayes... Random start 1 ---------------------------------- Parameters --> {'alpha': 1, 'fit_prior': True} Evaluation --> f1: 0.9260 Best f1: 0.9260 recall: 0.9722 Best recall: 0.9722 Time iteration: 3.108s Total time: 3.124s Iteration 2 ------------------------------------- Parameters --> {'alpha': 9.744, 'fit_prior': True} Evaluation --> f1: 0.9225 Best f1: 0.9260 recall: 0.9688 Best recall: 0.9722 Time iteration: 0.048s Total time: 3.172s Iteration 3 ------------------------------------- Parameters --> {'alpha': 0.66, 'fit_prior': False} Evaluation --> f1: 0.9223 Best f1: 0.9260 recall: 0.9655 Best recall: 0.9722 Time iteration: 0.044s Total time: 3.357s Results for Multinomial Naive Bayes: Bayesian Optimization --------------------------- Best parameters --> {'alpha': 1, 'fit_prior': True} Best evaluation --> f1: 0.9260 recall: 0.9722 Time elapsed: 3.494s Fitting ----------------------------------------- Score on the train set --> f1: 0.9243 recall: 0.9723 Score on the test set --> f1: 0.9103 recall: 0.9706 Time elapsed: 0.004s Bagging ----------------------------------------- Score --> f1: 0.9100 \u00b1 0.0005 recall: 0.9669 \u00b1 0.0064 Time elapsed: 0.031s ------------------------------------------------- Total time: 3.531s Running BO for Quadratic Discriminant Analysis... 
Random start 1 ---------------------------------- Parameters --> {'reg_param': 0} Evaluation --> f1: 0.9654 Best f1: 0.9654 recall: 0.9619 Best recall: 0.9619 Time iteration: 0.031s Total time: 0.031s Iteration 2 ------------------------------------- Parameters --> {'reg_param': 1.0} Evaluation --> f1: 0.9245 Best f1: 0.9654 recall: 0.9897 Best recall: 0.9897 Time iteration: 0.031s Total time: 0.063s Iteration 3 ------------------------------------- Parameters --> {'reg_param': 0.0} Evaluation --> f1: 0.9633 Best f1: 0.9654 recall: 0.9549 Best recall: 0.9897 Time iteration: 0.031s Total time: 0.188s Results for Quadratic Discriminant Analysis: Bayesian Optimization --------------------------- Best parameters --> {'reg_param': 0} Best evaluation --> f1: 0.9654 recall: 0.9619 Time elapsed: 0.297s Fitting ----------------------------------------- Score on the train set --> f1: 0.9828 recall: 0.9896 Score on the test set --> f1: 0.9710 recall: 0.9853 Time elapsed: 0.016s Bagging ----------------------------------------- Score --> f1: 0.9606 \u00b1 0.0081 recall: 0.9853 \u00b1 0.0104 Time elapsed: 0.031s ------------------------------------------------- Total time: 0.344s Final results ========================= >> Duration: 3.875s ------------------------------------------ Multinomial Naive Bayes --> f1: 0.910 \u00b1 0.001 recall: 0.967 \u00b1 0.006 Quadratic Discriminant Analysis --> f1: 0.961 \u00b1 0.008 recall: 0.985 \u00b1 0.010 ! 
Analyze the results # Note that some columns in the results dataframe now contain a list of scores, # one for each metric, in the same order as you called them atom.results[['metric_bo', 'metric_train', 'metric_test']] .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } metric_bo metric_train metric_test model MNB [0.9259597646215939, 0.9722323049001815] [0.924342105263158, 0.972318339100346] [0.9103448275862068, 0.9705882352941176] QDA [0.965402611638704, 0.9618874773139746] [0.9828178694158075, 0.9896193771626297] [0.9710144927536232, 0.9852941176470589] # Some plots allow us to choose the metric we want to show atom.plot_bagging(metric='recall')","title":"Multi-metric"},{"location":"examples/multi_metric/multi_metric/#multi-metric","text":"This example shows how we can evaluate an ATOM pipeline on multiple metrics. Import the breast cancer dataset from sklearn.datasets . This is a small and easy to train dataset whose goal is to predict whether a patient has breast cancer or not.","title":"Multi-metric"},{"location":"examples/multi_metric/multi_metric/#load-the-data","text":"# Import packages from sklearn.datasets import load_breast_cancer from atom import ATOMClassifier # Get the dataset's features and targets X, y = load_breast_cancer(return_X_y=True)","title":"Load the data"},{"location":"examples/multi_metric/multi_metric/#run-the-pipeline","text":"# Call ATOM and run the pipeline using multipe metrics # Note that for every step of the BO, both metrics are calculated, but only the first is used for optimization! atom = ATOMClassifier(X, y, n_jobs=2, verbose=2, warnings=False, random_state=1) atom.run(['MNB', 'QDA'], metric=('f1', 'recall'), n_calls=3, n_initial_points=1, bagging=4) << ================== ATOM ================== >> Algorithm task: binary classification. Parallel processing with 2 cores. Applying data cleaning... 
Dataset stats ================= >> Shape: (569, 31) Scaled: False ---------------------------------- Train set size: 456 Test set size: 113 ---------------------------------- Train set balance: 0:1 <==> 0.6:1.0 Test set balance: 0:1 <==> 0.7:1.0 ---------------------------------- Instances in target per class: | | total | train_set | test_set | |---:|---------:|-------------:|------------:| | 0 | 212 | 167 | 45 | | 1 | 357 | 289 | 68 | Running pipeline ============================= >> Models in pipeline: MNB, QDA Metric: f1, recall Running BO for Multinomial Naive Bayes... Random start 1 ---------------------------------- Parameters --> {'alpha': 1, 'fit_prior': True} Evaluation --> f1: 0.9260 Best f1: 0.9260 recall: 0.9722 Best recall: 0.9722 Time iteration: 3.108s Total time: 3.124s Iteration 2 ------------------------------------- Parameters --> {'alpha': 9.744, 'fit_prior': True} Evaluation --> f1: 0.9225 Best f1: 0.9260 recall: 0.9688 Best recall: 0.9722 Time iteration: 0.048s Total time: 3.172s Iteration 3 ------------------------------------- Parameters --> {'alpha': 0.66, 'fit_prior': False} Evaluation --> f1: 0.9223 Best f1: 0.9260 recall: 0.9655 Best recall: 0.9722 Time iteration: 0.044s Total time: 3.357s Results for Multinomial Naive Bayes: Bayesian Optimization --------------------------- Best parameters --> {'alpha': 1, 'fit_prior': True} Best evaluation --> f1: 0.9260 recall: 0.9722 Time elapsed: 3.494s Fitting ----------------------------------------- Score on the train set --> f1: 0.9243 recall: 0.9723 Score on the test set --> f1: 0.9103 recall: 0.9706 Time elapsed: 0.004s Bagging ----------------------------------------- Score --> f1: 0.9100 \u00b1 0.0005 recall: 0.9669 \u00b1 0.0064 Time elapsed: 0.031s ------------------------------------------------- Total time: 3.531s Running BO for Quadratic Discriminant Analysis... 
Random start 1 ---------------------------------- Parameters --> {'reg_param': 0} Evaluation --> f1: 0.9654 Best f1: 0.9654 recall: 0.9619 Best recall: 0.9619 Time iteration: 0.031s Total time: 0.031s Iteration 2 ------------------------------------- Parameters --> {'reg_param': 1.0} Evaluation --> f1: 0.9245 Best f1: 0.9654 recall: 0.9897 Best recall: 0.9897 Time iteration: 0.031s Total time: 0.063s Iteration 3 ------------------------------------- Parameters --> {'reg_param': 0.0} Evaluation --> f1: 0.9633 Best f1: 0.9654 recall: 0.9549 Best recall: 0.9897 Time iteration: 0.031s Total time: 0.188s Results for Quadratic Discriminant Analysis: Bayesian Optimization --------------------------- Best parameters --> {'reg_param': 0} Best evaluation --> f1: 0.9654 recall: 0.9619 Time elapsed: 0.297s Fitting ----------------------------------------- Score on the train set --> f1: 0.9828 recall: 0.9896 Score on the test set --> f1: 0.9710 recall: 0.9853 Time elapsed: 0.016s Bagging ----------------------------------------- Score --> f1: 0.9606 \u00b1 0.0081 recall: 0.9853 \u00b1 0.0104 Time elapsed: 0.031s ------------------------------------------------- Total time: 0.344s Final results ========================= >> Duration: 3.875s ------------------------------------------ Multinomial Naive Bayes --> f1: 0.910 \u00b1 0.001 recall: 0.967 \u00b1 0.006 Quadratic Discriminant Analysis --> f1: 0.961 \u00b1 0.008 recall: 0.985 \u00b1 0.010 !","title":"Run the pipeline"},{"location":"examples/multi_metric/multi_metric/#analyze-the-results","text":"# Note that some columns in the results dataframe now contain a list of scores, # one for each metric, in the same order as you called them atom.results[['metric_bo', 'metric_train', 'metric_test']] .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } metric_bo metric_train metric_test model MNB [0.9259597646215939, 
0.9722323049001815] [0.924342105263158, 0.972318339100346] [0.9103448275862068, 0.9705882352941176] QDA [0.965402611638704, 0.9618874773139746] [0.9828178694158075, 0.9896193771626297] [0.9710144927536232, 0.9852941176470589] # Some plots allow us to choose the metric we want to show atom.plot_bagging(metric='recall')","title":"Analyze the results"},{"location":"examples/multiclass_classification/multiclass_classification/","text":"Multiclass classification This example shows how to compare the performance of three models on a multiclass classification task. Import the wine dataset from sklearn.datasets . This is a small and easy to train dataset whose goal is to predict wines into three groups (which cultivator it's from) using features based on the results of chemical analysis. Load the data # Import packages from sklearn.datasets import load_wine from atom import ATOMClassifier # Load the dataset's features and targets X, y = load_wine(return_X_y=True, as_frame=True) # Let's have a look at a subsample of the data X.sample(frac=1).iloc[:5, :8] .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } alcohol malic_acid ash alcalinity_of_ash magnesium total_phenols flavanoids nonflavanoid_phenols 101 12.60 1.34 1.90 18.5 88.0 1.45 1.36 0.29 133 12.70 3.55 2.36 21.5 106.0 1.70 1.20 0.17 86 12.16 1.61 2.31 22.8 90.0 1.78 1.69 0.43 93 12.29 2.83 2.22 18.0 88.0 2.45 2.25 0.25 92 12.69 1.53 2.26 20.7 80.0 1.38 1.46 0.58 Run the pipeline atom = ATOMClassifier(X, y, n_jobs=-1, warnings='ignore', verbose=2, random_state=1) # Fit the pipeline with the selected models atom.run(models=['LR','LDA', 'RF'], metric='roc_auc_ovr', n_calls=4, n_initial_points=3, bo_params={'base_estimator': 'rf', 'max_time': 100}, bagging=5) << ================== ATOM ================== >> Algorithm task: multiclass classification. Parallel processing with 16 cores. Applying data cleaning... 
Dataset stats ================= >> Shape: (178, 14) Scaled: False ---------------------------------- Train set size: 143 Test set size: 35 ---------------------------------- Train set balance: 0:1:2 <==> 1.4:1.7:1.0 Test set balance: 0:1:2 <==> 0.7:1.0:1.0 ---------------------------------- Instances in target per class: | | total | train_set | test_set | |---:|---------:|-------------:|------------:| | 0 | 59 | 50 | 9 | | 1 | 71 | 58 | 13 | | 2 | 48 | 35 | 13 | Running pipeline ============================= >> Models in pipeline: LR, LDA, RF Metric: roc_auc_ovr Running BO for Logistic Regression... Random start 1 ---------------------------------- Parameters --> {'max_iter': 335, 'solver': 'sag', 'penalty': 'l2', 'C': 0.001} Evaluation --> roc_auc_ovr: 0.9970 Best roc_auc_ovr: 0.9970 Time iteration: 3.971s Total time: 3.975s Random start 2 ---------------------------------- Parameters --> {'max_iter': 244, 'solver': 'lbfgs', 'penalty': 'l2', 'C': 0.087} Evaluation --> roc_auc_ovr: 1.0000 Best roc_auc_ovr: 1.0000 Time iteration: 3.769s Total time: 7.748s Random start 3 ---------------------------------- Parameters --> {'max_iter': 376, 'solver': 'liblinear', 'penalty': 'l2', 'C': 2.667} Evaluation --> roc_auc_ovr: 1.0000 Best roc_auc_ovr: 1.0000 Time iteration: 3.589s Total time: 11.342s Iteration 4 ------------------------------------- Parameters --> {'max_iter': 498, 'solver': 'sag', 'penalty': 'l2', 'C': 0.882} Evaluation --> roc_auc_ovr: 1.0000 Best roc_auc_ovr: 1.0000 Time iteration: 4.328s Total time: 15.920s Results for Logistic Regression: Bayesian Optimization --------------------------- Best parameters --> {'max_iter': 244, 'solver': 'lbfgs', 'penalty': 'l2', 'C': 0.087} Best evaluation --> roc_auc_ovr: 1.0000 Time elapsed: 16.151s Fitting ----------------------------------------- Score on the train set --> roc_auc_ovr: 1.0000 Score on the test set --> roc_auc_ovr: 0.9988 Time elapsed: 0.020s Bagging ----------------------------------------- Score --> 
roc_auc_ovr: 0.9991 \u00b1 0.0009 Time elapsed: 0.072s ------------------------------------------------- Total time: 16.249s Running BO for Linear Discriminant Analysis... Random start 1 ---------------------------------- Parameters --> {'solver': 'eigen', 'shrinkage': 1.0} Evaluation --> roc_auc_ovr: 0.8975 Best roc_auc_ovr: 0.8975 Time iteration: 0.021s Total time: 0.022s Random start 2 ---------------------------------- Parameters --> {'solver': 'svd'} Evaluation --> roc_auc_ovr: 1.0000 Best roc_auc_ovr: 1.0000 Time iteration: 0.021s Total time: 0.047s Random start 3 ---------------------------------- Parameters --> {'solver': 'svd'} Evaluation --> roc_auc_ovr: 1.0000 Best roc_auc_ovr: 1.0000 Time iteration: 0.018s Total time: 0.068s Iteration 4 ------------------------------------- Parameters --> {'solver': 'lsqr', 'shrinkage': 0.7} Evaluation --> roc_auc_ovr: 0.8996 Best roc_auc_ovr: 1.0000 Time iteration: 0.020s Total time: 0.279s Results for Linear Discriminant Analysis: Bayesian Optimization --------------------------- Best parameters --> {'solver': 'svd'} Best evaluation --> roc_auc_ovr: 1.0000 Time elapsed: 0.474s Fitting ----------------------------------------- Score on the train set --> roc_auc_ovr: 1.0000 Score on the test set --> roc_auc_ovr: 1.0000 Time elapsed: 0.010s Bagging ----------------------------------------- Score --> roc_auc_ovr: 0.9998 \u00b1 0.0005 Time elapsed: 0.024s ------------------------------------------------- Total time: 0.510s Running BO for Random Forest... 
Random start 1 ---------------------------------- Parameters --> {'n_estimators': 245, 'max_depth': 7, 'max_features': 1.0, 'criterion': 'gini', 'min_samples_split': 7, 'min_samples_leaf': 16, 'ccp_alpha': 0.008, 'bootstrap': True, 'max_samples': 0.6} Evaluation --> roc_auc_ovr: 0.9853 Best roc_auc_ovr: 0.9853 Time iteration: 0.412s Total time: 0.418s Random start 2 ---------------------------------- Parameters --> {'n_estimators': 400, 'max_depth': 4, 'max_features': 0.8, 'criterion': 'gini', 'min_samples_split': 20, 'min_samples_leaf': 12, 'ccp_alpha': 0.016, 'bootstrap': True, 'max_samples': 0.7} Evaluation --> roc_auc_ovr: 0.9937 Best roc_auc_ovr: 0.9937 Time iteration: 0.642s Total time: 1.063s Random start 3 ---------------------------------- Parameters --> {'n_estimators': 78, 'max_depth': 10, 'max_features': 0.7, 'criterion': 'gini', 'min_samples_split': 2, 'min_samples_leaf': 14, 'ccp_alpha': 0.025, 'bootstrap': False} Evaluation --> roc_auc_ovr: 0.9865 Best roc_auc_ovr: 0.9937 Time iteration: 0.122s Total time: 1.190s Iteration 4 ------------------------------------- Parameters --> {'n_estimators': 323, 'max_depth': 7, 'max_features': 1.0, 'criterion': 'gini', 'min_samples_split': 16, 'min_samples_leaf': 1, 'ccp_alpha': 0.007, 'bootstrap': False} Evaluation --> roc_auc_ovr: 0.9315 Best roc_auc_ovr: 0.9937 Time iteration: 0.405s Total time: 1.823s Results for Random Forest: Bayesian Optimization --------------------------- Best parameters --> {'n_estimators': 400, 'max_depth': 4, 'max_features': 0.8, 'criterion': 'gini', 'min_samples_split': 20, 'min_samples_leaf': 12, 'ccp_alpha': 0.016, 'bootstrap': True, 'max_samples': 0.7} Best evaluation --> roc_auc_ovr: 0.9937 Time elapsed: 2.056s Fitting ----------------------------------------- Score on the train set --> roc_auc_ovr: 0.9997 Score on the test set --> roc_auc_ovr: 0.9825 Time elapsed: 0.588s Bagging ----------------------------------------- Score --> roc_auc_ovr: 0.9737 \u00b1 0.0116 Time elapsed: 
2.716s ------------------------------------------------- Total time: 5.363s Final results ========================= >> Duration: 22.125s ------------------------------------------ Logistic Regression --> roc_auc_ovr: 0.999 \u00b1 0.001 Linear Discriminant Analysis --> roc_auc_ovr: 1.000 \u00b1 0.000 ! Random Forest --> roc_auc_ovr: 0.974 \u00b1 0.012 Analyze the results # We can access the pipeline's results via the results attribute atom.results .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } name metric_bo time_bo metric_train metric_test time_fit mean_bagging std_bagging time_bagging time model LR Logistic Regression 1.000000 16.151s 1.000000 0.998834 0.020s 0.999068 0.000872 0.072s 16.249s LDA Linear Discriminant Analysis 1.000000 0.474s 1.000000 1.000000 0.010s 0.999767 0.000466 0.024s 0.510s RF Random Forest 0.993712 2.056s 0.999725 0.982517 0.588s 0.973686 0.011577 2.716s 5.363s # Show the scoring for a different metric than the one we trained on atom.scoring('precision_macro') Results ===================== >> Logistic Regression --> precision_macro: 1.0 Linear Discriminant Analysis --> precision_macro: 0.976 Random Forest --> precision_macro: 0.9 Let's have a closer look at the Random Forest # Get the results on some other metrics print('Jaccard score:', atom.rf.scoring('jaccard_weighted')) print('Recall score:', atom.rf.scoring('recall_macro')) Jaccard score: 0.7957142857142857 Recall score: 0.8974358974358975 # Plot the confusion matrix atom.RF.plot_confusion_matrix(figsize=(9, 9)) # Save the estimator as a pickle file atom.RF.save_estimator('Random_Forest_model') Random Forest estimator saved successfully!","title":"Multiclass_classification"},{"location":"examples/multiclass_classification/multiclass_classification/#multiclass-classification","text":"This example shows how to compare the performance of three models on a multiclass classification 
task. Import the wine dataset from sklearn.datasets . This is a small and easy to train dataset whose goal is to predict wines into three groups (which cultivator it's from) using features based on the results of chemical analysis.","title":"Multiclass classification"},{"location":"examples/multiclass_classification/multiclass_classification/#load-the-data","text":"# Import packages from sklearn.datasets import load_wine from atom import ATOMClassifier # Load the dataset's features and targets X, y = load_wine(return_X_y=True, as_frame=True) # Let's have a look at a subsample of the data X.sample(frac=1).iloc[:5, :8] .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } alcohol malic_acid ash alcalinity_of_ash magnesium total_phenols flavanoids nonflavanoid_phenols 101 12.60 1.34 1.90 18.5 88.0 1.45 1.36 0.29 133 12.70 3.55 2.36 21.5 106.0 1.70 1.20 0.17 86 12.16 1.61 2.31 22.8 90.0 1.78 1.69 0.43 93 12.29 2.83 2.22 18.0 88.0 2.45 2.25 0.25 92 12.69 1.53 2.26 20.7 80.0 1.38 1.46 0.58","title":"Load the data"},{"location":"examples/multiclass_classification/multiclass_classification/#run-the-pipeline","text":"atom = ATOMClassifier(X, y, n_jobs=-1, warnings='ignore', verbose=2, random_state=1) # Fit the pipeline with the selected models atom.run(models=['LR','LDA', 'RF'], metric='roc_auc_ovr', n_calls=4, n_initial_points=3, bo_params={'base_estimator': 'rf', 'max_time': 100}, bagging=5) << ================== ATOM ================== >> Algorithm task: multiclass classification. Parallel processing with 16 cores. Applying data cleaning... 
Dataset stats ================= >> Shape: (178, 14) Scaled: False ---------------------------------- Train set size: 143 Test set size: 35 ---------------------------------- Train set balance: 0:1:2 <==> 1.4:1.7:1.0 Test set balance: 0:1:2 <==> 0.7:1.0:1.0 ---------------------------------- Instances in target per class: | | total | train_set | test_set | |---:|---------:|-------------:|------------:| | 0 | 59 | 50 | 9 | | 1 | 71 | 58 | 13 | | 2 | 48 | 35 | 13 | Running pipeline ============================= >> Models in pipeline: LR, LDA, RF Metric: roc_auc_ovr Running BO for Logistic Regression... Random start 1 ---------------------------------- Parameters --> {'max_iter': 335, 'solver': 'sag', 'penalty': 'l2', 'C': 0.001} Evaluation --> roc_auc_ovr: 0.9970 Best roc_auc_ovr: 0.9970 Time iteration: 3.971s Total time: 3.975s Random start 2 ---------------------------------- Parameters --> {'max_iter': 244, 'solver': 'lbfgs', 'penalty': 'l2', 'C': 0.087} Evaluation --> roc_auc_ovr: 1.0000 Best roc_auc_ovr: 1.0000 Time iteration: 3.769s Total time: 7.748s Random start 3 ---------------------------------- Parameters --> {'max_iter': 376, 'solver': 'liblinear', 'penalty': 'l2', 'C': 2.667} Evaluation --> roc_auc_ovr: 1.0000 Best roc_auc_ovr: 1.0000 Time iteration: 3.589s Total time: 11.342s Iteration 4 ------------------------------------- Parameters --> {'max_iter': 498, 'solver': 'sag', 'penalty': 'l2', 'C': 0.882} Evaluation --> roc_auc_ovr: 1.0000 Best roc_auc_ovr: 1.0000 Time iteration: 4.328s Total time: 15.920s Results for Logistic Regression: Bayesian Optimization --------------------------- Best parameters --> {'max_iter': 244, 'solver': 'lbfgs', 'penalty': 'l2', 'C': 0.087} Best evaluation --> roc_auc_ovr: 1.0000 Time elapsed: 16.151s Fitting ----------------------------------------- Score on the train set --> roc_auc_ovr: 1.0000 Score on the test set --> roc_auc_ovr: 0.9988 Time elapsed: 0.020s Bagging ----------------------------------------- Score --> 
roc_auc_ovr: 0.9991 \u00b1 0.0009 Time elapsed: 0.072s ------------------------------------------------- Total time: 16.249s Running BO for Linear Discriminant Analysis... Random start 1 ---------------------------------- Parameters --> {'solver': 'eigen', 'shrinkage': 1.0} Evaluation --> roc_auc_ovr: 0.8975 Best roc_auc_ovr: 0.8975 Time iteration: 0.021s Total time: 0.022s Random start 2 ---------------------------------- Parameters --> {'solver': 'svd'} Evaluation --> roc_auc_ovr: 1.0000 Best roc_auc_ovr: 1.0000 Time iteration: 0.021s Total time: 0.047s Random start 3 ---------------------------------- Parameters --> {'solver': 'svd'} Evaluation --> roc_auc_ovr: 1.0000 Best roc_auc_ovr: 1.0000 Time iteration: 0.018s Total time: 0.068s Iteration 4 ------------------------------------- Parameters --> {'solver': 'lsqr', 'shrinkage': 0.7} Evaluation --> roc_auc_ovr: 0.8996 Best roc_auc_ovr: 1.0000 Time iteration: 0.020s Total time: 0.279s Results for Linear Discriminant Analysis: Bayesian Optimization --------------------------- Best parameters --> {'solver': 'svd'} Best evaluation --> roc_auc_ovr: 1.0000 Time elapsed: 0.474s Fitting ----------------------------------------- Score on the train set --> roc_auc_ovr: 1.0000 Score on the test set --> roc_auc_ovr: 1.0000 Time elapsed: 0.010s Bagging ----------------------------------------- Score --> roc_auc_ovr: 0.9998 \u00b1 0.0005 Time elapsed: 0.024s ------------------------------------------------- Total time: 0.510s Running BO for Random Forest... 
Random start 1 ---------------------------------- Parameters --> {'n_estimators': 245, 'max_depth': 7, 'max_features': 1.0, 'criterion': 'gini', 'min_samples_split': 7, 'min_samples_leaf': 16, 'ccp_alpha': 0.008, 'bootstrap': True, 'max_samples': 0.6} Evaluation --> roc_auc_ovr: 0.9853 Best roc_auc_ovr: 0.9853 Time iteration: 0.412s Total time: 0.418s Random start 2 ---------------------------------- Parameters --> {'n_estimators': 400, 'max_depth': 4, 'max_features': 0.8, 'criterion': 'gini', 'min_samples_split': 20, 'min_samples_leaf': 12, 'ccp_alpha': 0.016, 'bootstrap': True, 'max_samples': 0.7} Evaluation --> roc_auc_ovr: 0.9937 Best roc_auc_ovr: 0.9937 Time iteration: 0.642s Total time: 1.063s Random start 3 ---------------------------------- Parameters --> {'n_estimators': 78, 'max_depth': 10, 'max_features': 0.7, 'criterion': 'gini', 'min_samples_split': 2, 'min_samples_leaf': 14, 'ccp_alpha': 0.025, 'bootstrap': False} Evaluation --> roc_auc_ovr: 0.9865 Best roc_auc_ovr: 0.9937 Time iteration: 0.122s Total time: 1.190s Iteration 4 ------------------------------------- Parameters --> {'n_estimators': 323, 'max_depth': 7, 'max_features': 1.0, 'criterion': 'gini', 'min_samples_split': 16, 'min_samples_leaf': 1, 'ccp_alpha': 0.007, 'bootstrap': False} Evaluation --> roc_auc_ovr: 0.9315 Best roc_auc_ovr: 0.9937 Time iteration: 0.405s Total time: 1.823s Results for Random Forest: Bayesian Optimization --------------------------- Best parameters --> {'n_estimators': 400, 'max_depth': 4, 'max_features': 0.8, 'criterion': 'gini', 'min_samples_split': 20, 'min_samples_leaf': 12, 'ccp_alpha': 0.016, 'bootstrap': True, 'max_samples': 0.7} Best evaluation --> roc_auc_ovr: 0.9937 Time elapsed: 2.056s Fitting ----------------------------------------- Score on the train set --> roc_auc_ovr: 0.9997 Score on the test set --> roc_auc_ovr: 0.9825 Time elapsed: 0.588s Bagging ----------------------------------------- Score --> roc_auc_ovr: 0.9737 \u00b1 0.0116 Time elapsed: 
2.716s ------------------------------------------------- Total time: 5.363s Final results ========================= >> Duration: 22.125s ------------------------------------------ Logistic Regression --> roc_auc_ovr: 0.999 \u00b1 0.001 Linear Discriminant Analysis --> roc_auc_ovr: 1.000 \u00b1 0.000 ! Random Forest --> roc_auc_ovr: 0.974 \u00b1 0.012","title":"Run the pipeline"},{"location":"examples/multiclass_classification/multiclass_classification/#analyze-the-results","text":"# We can access the pipeline's results via the results attribute atom.results .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } name metric_bo time_bo metric_train metric_test time_fit mean_bagging std_bagging time_bagging time model LR Logistic Regression 1.000000 16.151s 1.000000 0.998834 0.020s 0.999068 0.000872 0.072s 16.249s LDA Linear Discriminant Analysis 1.000000 0.474s 1.000000 1.000000 0.010s 0.999767 0.000466 0.024s 0.510s RF Random Forest 0.993712 2.056s 0.999725 0.982517 0.588s 0.973686 0.011577 2.716s 5.363s # Show the scoring for a different metric than the one we trained on atom.scoring('precision_macro') Results ===================== >> Logistic Regression --> precision_macro: 1.0 Linear Discriminant Analysis --> precision_macro: 0.976 Random Forest --> precision_macro: 0.9 Let's have a closer look at the Random Forest # Get the results on some other metrics print('Jaccard score:', atom.rf.scoring('jaccard_weighted')) print('Recall score:', atom.rf.scoring('recall_macro')) Jaccard score: 0.7957142857142857 Recall score: 0.8974358974358975 # Plot the confusion matrix atom.RF.plot_confusion_matrix(figsize=(9, 9)) # Save the estimator as a pickle file atom.RF.save_estimator('Random_Forest_model') Random Forest estimator saved successfully!","title":"Analyze the results"},{"location":"examples/regression/regression/","text":"Regression This example shows how to use ATOM to 
apply PCA on the data and run a regression pipeline. Download the abalone dataset from https://archive.ics.uci.edu/ml/datasets/Abalone . The goal of this dataset is to predict the rings (age) of abalone shells from physical measurements. Load the data # Import packages import pandas as pd from atom import ATOMRegressor # Load the abalone dataset X = pd.read_csv('./datasets/abalone.csv') # Let's have a look at the data X.head() .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } Sex Length Diameter Height Whole weight Shucked weight Viscera weight Shell weight Rings 0 M 0.455 0.365 0.095 0.5140 0.2245 0.1010 0.150 15 1 M 0.350 0.265 0.090 0.2255 0.0995 0.0485 0.070 7 2 F 0.530 0.420 0.135 0.6770 0.2565 0.1415 0.210 9 3 M 0.440 0.365 0.125 0.5160 0.2155 0.1140 0.155 10 4 I 0.330 0.255 0.080 0.2050 0.0895 0.0395 0.055 7 # Initialize ATOM for regression tasks and encode the categorical features atom = ATOMRegressor(X, y=\"Rings\", verbose=2, random_state=42) atom.encode() << ================== ATOM ================== >> Algorithm task: regression. Applying data cleaning... Dataset stats ================= >> Shape: (4177, 9) Categorical columns: 1 Scaled: False ---------------------------------- Train set size: 3342 Test set size: 835 Fitting Encoder... Encoding categorical columns... --> OneHot-encoding feature Sex. Contains 3 unique categories. is_categorical is deprecated and will be removed in a future version. Use is_categorical_dtype instead # Plot the dataset's correlation matrix atom.plot_correlation() # Apply PCA for dimensionality reduction atom.feature_selection(strategy=\"pca\", n_features=6) Fitting FeatureSelector... Performing feature selection ... --> Applying Principal Component Analysis... >>> Scaling features... 
>>> Total explained variance: 0.976 # Use the plotting methods to see the retained variance ratio atom.plot_pca() atom.plot_components(figsize=(8, 6), filename='atom_PCA_plot') Run the pipeline atom.run(['Tree', 'Bag', 'ET'], metric='MSE', n_calls=5, n_initial_points=2, bo_params={'base_estimator': 'GBRT', 'cv': 1}, bagging=5) Running pipeline ============================= >> Models in pipeline: Tree, Bag, ET Metric: neg_mean_squared_error Running BO for Decision Tree... Random start 1 ---------------------------------- Parameters --> {'criterion': 'mae', 'splitter': 'random', 'max_depth': 5, 'max_features': 0.9, 'min_samples_split': 8, 'min_samples_leaf': 19, 'ccp_alpha': 0.003} Evaluation --> neg_mean_squared_error: -7.8759 Best neg_mean_squared_error: -7.8759 Time iteration: 0.043s Total time: 0.048s Random start 2 ---------------------------------- Parameters --> {'criterion': 'mae', 'splitter': 'best', 'max_depth': 10, 'max_features': 0.9, 'min_samples_split': 3, 'min_samples_leaf': 12, 'ccp_alpha': 0.033} Evaluation --> neg_mean_squared_error: -9.1854 Best neg_mean_squared_error: -7.8759 Time iteration: 0.181s Total time: 0.233s Iteration 3 ------------------------------------- Parameters --> {'criterion': 'friedman_mse', 'splitter': 'random', 'max_depth': 7, 'max_features': 0.6, 'min_samples_split': 17, 'min_samples_leaf': 19, 'ccp_alpha': 0.015} Evaluation --> neg_mean_squared_error: -8.2130 Best neg_mean_squared_error: -7.8759 Time iteration: 0.007s Total time: 0.428s Iteration 4 ------------------------------------- Parameters --> {'criterion': 'friedman_mse', 'splitter': 'best', 'max_depth': 4, 'max_features': 0.9, 'min_samples_split': 3, 'min_samples_leaf': 12, 'ccp_alpha': 0.006} Evaluation --> neg_mean_squared_error: -6.7540 Best neg_mean_squared_error: -6.7540 Time iteration: 0.010s Total time: 0.533s Iteration 5 ------------------------------------- Parameters --> {'criterion': 'mae', 'splitter': 'best', 'max_depth': 3, 'max_features': 0.9, 
'min_samples_split': 7, 'min_samples_leaf': 6, 'ccp_alpha': 0.007} Evaluation --> neg_mean_squared_error: -7.2855 Best neg_mean_squared_error: -6.7540 Time iteration: 0.132s Total time: 0.757s Results for Decision Tree: Bayesian Optimization --------------------------- Best parameters --> {'criterion': 'friedman_mse', 'splitter': 'best', 'max_depth': 4, 'max_features': 0.9, 'min_samples_split': 3, 'min_samples_leaf': 12, 'ccp_alpha': 0.006} Best evaluation --> neg_mean_squared_error: -6.7540 Time elapsed: 0.855s Fitting ----------------------------------------- Score on the train set --> neg_mean_squared_error: -6.3636 Score on the test set --> neg_mean_squared_error: -5.4433 Time elapsed: 0.011s Bagging ----------------------------------------- Score --> neg_mean_squared_error: -5.5541 \u00b1 0.1150 Time elapsed: 0.039s ------------------------------------------------- Total time: 0.910s Running BO for Bagging Regressor... Random start 1 ---------------------------------- Parameters --> {'n_estimators': 112, 'max_samples': 0.9, 'max_features': 0.6, 'bootstrap': False, 'bootstrap_features': False} Evaluation --> neg_mean_squared_error: -5.7680 Best neg_mean_squared_error: -5.7680 Time iteration: 0.877s Total time: 0.881s Random start 2 ---------------------------------- Parameters --> {'n_estimators': 131, 'max_samples': 0.5, 'max_features': 0.5, 'bootstrap': False, 'bootstrap_features': False} Evaluation --> neg_mean_squared_error: -6.8254 Best neg_mean_squared_error: -5.7680 Time iteration: 0.585s Total time: 1.471s Iteration 3 ------------------------------------- Parameters --> {'n_estimators': 50, 'max_samples': 0.9, 'max_features': 0.6, 'bootstrap': False, 'bootstrap_features': True} Evaluation --> neg_mean_squared_error: -5.4895 Best neg_mean_squared_error: -5.4895 Time iteration: 0.389s Total time: 1.953s Iteration 4 ------------------------------------- Parameters --> {'n_estimators': 74, 'max_samples': 0.5, 'max_features': 0.5, 'bootstrap': False, 
'bootstrap_features': True} Evaluation --> neg_mean_squared_error: -6.0363 Best neg_mean_squared_error: -5.4895 Time iteration: 0.330s Total time: 2.381s Iteration 5 ------------------------------------- Parameters --> {'n_estimators': 36, 'max_samples': 0.9, 'max_features': 0.6, 'bootstrap': True, 'bootstrap_features': False} Evaluation --> neg_mean_squared_error: -6.0037 Best neg_mean_squared_error: -5.4895 Time iteration: 0.194s Total time: 2.668s Results for Bagging Regressor: Bayesian Optimization --------------------------- Best parameters --> {'n_estimators': 50, 'max_samples': 0.9, 'max_features': 0.6, 'bootstrap': False, 'bootstrap_features': True} Best evaluation --> neg_mean_squared_error: -5.4895 Time elapsed: 2.764s Fitting ----------------------------------------- Score on the train set --> neg_mean_squared_error: -0.0867 Score on the test set --> neg_mean_squared_error: -4.9533 Time elapsed: 0.571s Bagging ----------------------------------------- Score --> neg_mean_squared_error: -5.2363 \u00b1 0.1099 Time elapsed: 2.325s ------------------------------------------------- Total time: 5.662s Running BO for Extra-Trees... 
Random start 1 ---------------------------------- Parameters --> {'n_estimators': 112, 'max_depth': 6, 'max_features': 1.0, 'criterion': 'mae', 'min_samples_split': 8, 'min_samples_leaf': 19, 'ccp_alpha': 0.003, 'bootstrap': True, 'max_samples': 0.6} Evaluation --> neg_mean_squared_error: -7.1995 Best neg_mean_squared_error: -7.1995 Time iteration: 1.034s Total time: 1.040s Random start 2 ---------------------------------- Parameters --> {'n_estimators': 369, 'max_depth': 10, 'max_features': 0.8, 'criterion': 'mse', 'min_samples_split': 13, 'min_samples_leaf': 6, 'ccp_alpha': 0.0, 'bootstrap': False} Evaluation --> neg_mean_squared_error: -6.9525 Best neg_mean_squared_error: -6.9525 Time iteration: 0.495s Total time: 1.538s Iteration 3 ------------------------------------- Parameters --> {'n_estimators': 481, 'max_depth': 10, 'max_features': 0.8, 'criterion': 'mse', 'min_samples_split': 7, 'min_samples_leaf': 2, 'ccp_alpha': 0.001, 'bootstrap': False} Evaluation --> neg_mean_squared_error: -5.0279 Best neg_mean_squared_error: -5.0279 Time iteration: 0.744s Total time: 2.388s Iteration 4 ------------------------------------- Parameters --> {'n_estimators': 460, 'max_depth': 5, 'max_features': 1.0, 'criterion': 'mae', 'min_samples_split': 5, 'min_samples_leaf': 4, 'ccp_alpha': 0.034, 'bootstrap': True, 'max_samples': 0.6} Evaluation --> neg_mean_squared_error: -7.3319 Best neg_mean_squared_error: -5.0279 Time iteration: 5.020s Total time: 7.517s Iteration 5 ------------------------------------- Parameters --> {'n_estimators': 474, 'max_depth': 4, 'max_features': 0.8, 'criterion': 'mae', 'min_samples_split': 20, 'min_samples_leaf': 1, 'ccp_alpha': 0.018, 'bootstrap': True, 'max_samples': 0.6} Evaluation --> neg_mean_squared_error: -7.5183 Best neg_mean_squared_error: -5.0279 Time iteration: 4.067s Total time: 11.690s Results for Extra-Trees: Bayesian Optimization --------------------------- Best parameters --> {'n_estimators': 481, 'max_depth': 10, 'max_features': 
0.8, 'criterion': 'mse', 'min_samples_split': 7, 'min_samples_leaf': 2, 'ccp_alpha': 0.001, 'bootstrap': False} Best evaluation --> neg_mean_squared_error: -5.0279 Time elapsed: 11.801s Fitting ----------------------------------------- Score on the train set --> neg_mean_squared_error: -4.5366 Score on the test set --> neg_mean_squared_error: -4.4905 Time elapsed: 0.968s Bagging ----------------------------------------- Score --> neg_mean_squared_error: -4.5803 \u00b1 0.0691 Time elapsed: 4.259s ------------------------------------------------- Total time: 17.032s Final results ========================= >> Duration: 23.606s ------------------------------------------ Decision Tree --> neg_mean_squared_error: -5.554 \u00b1 0.115 ~ Bagging Regressor --> neg_mean_squared_error: -5.236 \u00b1 0.110 ~ Extra-Trees --> neg_mean_squared_error: -4.580 \u00b1 0.069 ~ ! Analyze the results # For regression tasks, use the errors or residuals plots to check the model performances atom.plot_residuals() # Use the partial dependence plot to analyze the relation between the target response and the features atom.n_jobs = 8 # The method can be slow... atom.ET.plot_partial_dependence(features=(0, 1, (2, 3)), figsize=(12, 8))","title":"Regression"},{"location":"examples/regression/regression/#regression","text":"This example shows how to use ATOM to apply PCA on the data and run a regression pipeline. Download the abalone dataset from https://archive.ics.uci.edu/ml/datasets/Abalone . 
The goal of this dataset is to predict the rings (age) of abalone shells from physical measurements.","title":"Regression"},{"location":"examples/regression/regression/#load-the-data","text":"# Import packages import pandas as pd from atom import ATOMRegressor # Load the abalone dataset X = pd.read_csv('./datasets/abalone.csv') # Let's have a look at the data X.head() .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } Sex Length Diameter Height Whole weight Shucked weight Viscera weight Shell weight Rings 0 M 0.455 0.365 0.095 0.5140 0.2245 0.1010 0.150 15 1 M 0.350 0.265 0.090 0.2255 0.0995 0.0485 0.070 7 2 F 0.530 0.420 0.135 0.6770 0.2565 0.1415 0.210 9 3 M 0.440 0.365 0.125 0.5160 0.2155 0.1140 0.155 10 4 I 0.330 0.255 0.080 0.2050 0.0895 0.0395 0.055 7 # Initialize ATOM for regression tasks and encode the categorical features atom = ATOMRegressor(X, y=\"Rings\", verbose=2, random_state=42) atom.encode() << ================== ATOM ================== >> Algorithm task: regression. Applying data cleaning... Dataset stats ================= >> Shape: (4177, 9) Categorical columns: 1 Scaled: False ---------------------------------- Train set size: 3342 Test set size: 835 Fitting Encoder... Encoding categorical columns... --> OneHot-encoding feature Sex. Contains 3 unique categories. is_categorical is deprecated and will be removed in a future version. Use is_categorical_dtype instead # Plot the dataset's correlation matrix atom.plot_correlation() # Apply PCA for dimensionality reduction atom.feature_selection(strategy=\"pca\", n_features=6) Fitting FeatureSelector... Performing feature selection ... --> Applying Principal Component Analysis... >>> Scaling features... 
>>> Total explained variance: 0.976 # Use the plotting methods to see the retained variance ratio atom.plot_pca() atom.plot_components(figsize=(8, 6), filename='atom_PCA_plot')","title":"Load the data"},{"location":"examples/regression/regression/#run-the-pipeline","text":"atom.run(['Tree', 'Bag', 'ET'], metric='MSE', n_calls=5, n_initial_points=2, bo_params={'base_estimator': 'GBRT', 'cv': 1}, bagging=5) Running pipeline ============================= >> Models in pipeline: Tree, Bag, ET Metric: neg_mean_squared_error Running BO for Decision Tree... Random start 1 ---------------------------------- Parameters --> {'criterion': 'mae', 'splitter': 'random', 'max_depth': 5, 'max_features': 0.9, 'min_samples_split': 8, 'min_samples_leaf': 19, 'ccp_alpha': 0.003} Evaluation --> neg_mean_squared_error: -7.8759 Best neg_mean_squared_error: -7.8759 Time iteration: 0.043s Total time: 0.048s Random start 2 ---------------------------------- Parameters --> {'criterion': 'mae', 'splitter': 'best', 'max_depth': 10, 'max_features': 0.9, 'min_samples_split': 3, 'min_samples_leaf': 12, 'ccp_alpha': 0.033} Evaluation --> neg_mean_squared_error: -9.1854 Best neg_mean_squared_error: -7.8759 Time iteration: 0.181s Total time: 0.233s Iteration 3 ------------------------------------- Parameters --> {'criterion': 'friedman_mse', 'splitter': 'random', 'max_depth': 7, 'max_features': 0.6, 'min_samples_split': 17, 'min_samples_leaf': 19, 'ccp_alpha': 0.015} Evaluation --> neg_mean_squared_error: -8.2130 Best neg_mean_squared_error: -7.8759 Time iteration: 0.007s Total time: 0.428s Iteration 4 ------------------------------------- Parameters --> {'criterion': 'friedman_mse', 'splitter': 'best', 'max_depth': 4, 'max_features': 0.9, 'min_samples_split': 3, 'min_samples_leaf': 12, 'ccp_alpha': 0.006} Evaluation --> neg_mean_squared_error: -6.7540 Best neg_mean_squared_error: -6.7540 Time iteration: 0.010s Total time: 0.533s Iteration 5 ------------------------------------- Parameters --> 
{'criterion': 'mae', 'splitter': 'best', 'max_depth': 3, 'max_features': 0.9, 'min_samples_split': 7, 'min_samples_leaf': 6, 'ccp_alpha': 0.007} Evaluation --> neg_mean_squared_error: -7.2855 Best neg_mean_squared_error: -6.7540 Time iteration: 0.132s Total time: 0.757s Results for Decision Tree: Bayesian Optimization --------------------------- Best parameters --> {'criterion': 'friedman_mse', 'splitter': 'best', 'max_depth': 4, 'max_features': 0.9, 'min_samples_split': 3, 'min_samples_leaf': 12, 'ccp_alpha': 0.006} Best evaluation --> neg_mean_squared_error: -6.7540 Time elapsed: 0.855s Fitting ----------------------------------------- Score on the train set --> neg_mean_squared_error: -6.3636 Score on the test set --> neg_mean_squared_error: -5.4433 Time elapsed: 0.011s Bagging ----------------------------------------- Score --> neg_mean_squared_error: -5.5541 \u00b1 0.1150 Time elapsed: 0.039s ------------------------------------------------- Total time: 0.910s Running BO for Bagging Regressor... 
Random start 1 ---------------------------------- Parameters --> {'n_estimators': 112, 'max_samples': 0.9, 'max_features': 0.6, 'bootstrap': False, 'bootstrap_features': False} Evaluation --> neg_mean_squared_error: -5.7680 Best neg_mean_squared_error: -5.7680 Time iteration: 0.877s Total time: 0.881s Random start 2 ---------------------------------- Parameters --> {'n_estimators': 131, 'max_samples': 0.5, 'max_features': 0.5, 'bootstrap': False, 'bootstrap_features': False} Evaluation --> neg_mean_squared_error: -6.8254 Best neg_mean_squared_error: -5.7680 Time iteration: 0.585s Total time: 1.471s Iteration 3 ------------------------------------- Parameters --> {'n_estimators': 50, 'max_samples': 0.9, 'max_features': 0.6, 'bootstrap': False, 'bootstrap_features': True} Evaluation --> neg_mean_squared_error: -5.4895 Best neg_mean_squared_error: -5.4895 Time iteration: 0.389s Total time: 1.953s Iteration 4 ------------------------------------- Parameters --> {'n_estimators': 74, 'max_samples': 0.5, 'max_features': 0.5, 'bootstrap': False, 'bootstrap_features': True} Evaluation --> neg_mean_squared_error: -6.0363 Best neg_mean_squared_error: -5.4895 Time iteration: 0.330s Total time: 2.381s Iteration 5 ------------------------------------- Parameters --> {'n_estimators': 36, 'max_samples': 0.9, 'max_features': 0.6, 'bootstrap': True, 'bootstrap_features': False} Evaluation --> neg_mean_squared_error: -6.0037 Best neg_mean_squared_error: -5.4895 Time iteration: 0.194s Total time: 2.668s Results for Bagging Regressor: Bayesian Optimization --------------------------- Best parameters --> {'n_estimators': 50, 'max_samples': 0.9, 'max_features': 0.6, 'bootstrap': False, 'bootstrap_features': True} Best evaluation --> neg_mean_squared_error: -5.4895 Time elapsed: 2.764s Fitting ----------------------------------------- Score on the train set --> neg_mean_squared_error: -0.0867 Score on the test set --> neg_mean_squared_error: -4.9533 Time elapsed: 0.571s Bagging 
----------------------------------------- Score --> neg_mean_squared_error: -5.2363 \u00b1 0.1099 Time elapsed: 2.325s ------------------------------------------------- Total time: 5.662s Running BO for Extra-Trees... Random start 1 ---------------------------------- Parameters --> {'n_estimators': 112, 'max_depth': 6, 'max_features': 1.0, 'criterion': 'mae', 'min_samples_split': 8, 'min_samples_leaf': 19, 'ccp_alpha': 0.003, 'bootstrap': True, 'max_samples': 0.6} Evaluation --> neg_mean_squared_error: -7.1995 Best neg_mean_squared_error: -7.1995 Time iteration: 1.034s Total time: 1.040s Random start 2 ---------------------------------- Parameters --> {'n_estimators': 369, 'max_depth': 10, 'max_features': 0.8, 'criterion': 'mse', 'min_samples_split': 13, 'min_samples_leaf': 6, 'ccp_alpha': 0.0, 'bootstrap': False} Evaluation --> neg_mean_squared_error: -6.9525 Best neg_mean_squared_error: -6.9525 Time iteration: 0.495s Total time: 1.538s Iteration 3 ------------------------------------- Parameters --> {'n_estimators': 481, 'max_depth': 10, 'max_features': 0.8, 'criterion': 'mse', 'min_samples_split': 7, 'min_samples_leaf': 2, 'ccp_alpha': 0.001, 'bootstrap': False} Evaluation --> neg_mean_squared_error: -5.0279 Best neg_mean_squared_error: -5.0279 Time iteration: 0.744s Total time: 2.388s Iteration 4 ------------------------------------- Parameters --> {'n_estimators': 460, 'max_depth': 5, 'max_features': 1.0, 'criterion': 'mae', 'min_samples_split': 5, 'min_samples_leaf': 4, 'ccp_alpha': 0.034, 'bootstrap': True, 'max_samples': 0.6} Evaluation --> neg_mean_squared_error: -7.3319 Best neg_mean_squared_error: -5.0279 Time iteration: 5.020s Total time: 7.517s Iteration 5 ------------------------------------- Parameters --> {'n_estimators': 474, 'max_depth': 4, 'max_features': 0.8, 'criterion': 'mae', 'min_samples_split': 20, 'min_samples_leaf': 1, 'ccp_alpha': 0.018, 'bootstrap': True, 'max_samples': 0.6} Evaluation --> neg_mean_squared_error: -7.5183 Best 
neg_mean_squared_error: -5.0279 Time iteration: 4.067s Total time: 11.690s Results for Extra-Trees: Bayesian Optimization --------------------------- Best parameters --> {'n_estimators': 481, 'max_depth': 10, 'max_features': 0.8, 'criterion': 'mse', 'min_samples_split': 7, 'min_samples_leaf': 2, 'ccp_alpha': 0.001, 'bootstrap': False} Best evaluation --> neg_mean_squared_error: -5.0279 Time elapsed: 11.801s Fitting ----------------------------------------- Score on the train set --> neg_mean_squared_error: -4.5366 Score on the test set --> neg_mean_squared_error: -4.4905 Time elapsed: 0.968s Bagging ----------------------------------------- Score --> neg_mean_squared_error: -4.5803 \u00b1 0.0691 Time elapsed: 4.259s ------------------------------------------------- Total time: 17.032s Final results ========================= >> Duration: 23.606s ------------------------------------------ Decision Tree --> neg_mean_squared_error: -5.554 \u00b1 0.115 ~ Bagging Regressor --> neg_mean_squared_error: -5.236 \u00b1 0.110 ~ Extra-Trees --> neg_mean_squared_error: -4.580 \u00b1 0.069 ~ !","title":"Run the pipeline"},{"location":"examples/regression/regression/#analyze-the-results","text":"# For regression tasks, use the errors or residuals plots to check the model performances atom.plot_residuals() # Use the partial dependence plot to analyze the relation between the target response and the features atom.n_jobs = 8 # The method can be slow... atom.ET.plot_partial_dependence(features=(0, 1, (2, 3)), figsize=(12, 8))","title":"Analyze the results"},{"location":"examples/successive_halving/successive_halving/","text":"Successive halving This example shows how to compare multiple tree-based models using successive halving. Import the boston dataset from sklearn.datasets . This is a small and easy to train dataset whose goal is to predict house prices. 
Load the data # Import packages from sklearn.datasets import load_boston from atom import ATOMRegressor # Load the dataset's features and targets X, y = load_boston(return_X_y=True) Run the pipeline atom = ATOMRegressor(X, y, verbose=1, random_state=1) << ================== ATOM ================== >> Algorithm task: regression. Applying data cleaning... Dataset stats ================= >> Shape: (506, 14) Scaled: False ---------------------------------- Train set size: 405 Test set size: 101 # We can compare tree-based models via successive halving atom.successive_halving(['tree', 'bag', 'et', 'rf', 'lgb', 'catb'], metric='mae', bagging=5) Running pipeline ============================= >> Metric: neg_mean_absolute_error Run 0 (17% of set) ============================>> Models in pipeline: Tree, Bag, ET, RF, LGB, CatB Size of training set: 67 Size of test set: 101 Results for Decision Tree: Fitting ----------------------------------------- Score on the train set --> neg_mean_absolute_error: -0.0000 Score on the test set --> neg_mean_absolute_error: -3.3257 Time elapsed: 0.007s Bagging ----------------------------------------- Score --> neg_mean_absolute_error: -4.3307 \u00b1 0.5250 Time elapsed: 0.018s ------------------------------------------------- Total time: 0.027s Results for Bagging Regressor: Fitting ----------------------------------------- Score on the train set --> neg_mean_absolute_error: -1.3054 Score on the test set --> neg_mean_absolute_error: -2.6950 Time elapsed: 0.018s Bagging ----------------------------------------- Score --> neg_mean_absolute_error: -3.0957 \u00b1 0.2677 Time elapsed: 0.079s ------------------------------------------------- Total time: 0.100s Results for Extra-Trees: Fitting ----------------------------------------- Score on the train set --> neg_mean_absolute_error: -0.0000 Score on the test set --> neg_mean_absolute_error: -2.1541 Time elapsed: 0.084s Bagging ----------------------------------------- Score --> 
neg_mean_absolute_error: -2.5554 \u00b1 0.1708 Time elapsed: 0.357s ------------------------------------------------- Total time: 0.443s Results for Random Forest: Fitting ----------------------------------------- Score on the train set --> neg_mean_absolute_error: -1.1509 Score on the test set --> neg_mean_absolute_error: -2.4143 Time elapsed: 0.109s Bagging ----------------------------------------- Score --> neg_mean_absolute_error: -2.9574 \u00b1 0.2253 Time elapsed: 0.509s ------------------------------------------------- Total time: 0.621s Results for LightGBM: Fitting ----------------------------------------- Score on the train set --> neg_mean_absolute_error: -3.4205 Score on the test set --> neg_mean_absolute_error: -4.5600 Time elapsed: 0.027s Bagging ----------------------------------------- Score --> neg_mean_absolute_error: -4.8393 \u00b1 0.2682 Time elapsed: 0.060s ------------------------------------------------- Total time: 0.091s Results for CatBoost: Fitting ----------------------------------------- Score on the train set --> neg_mean_absolute_error: -0.0806 Score on the test set --> neg_mean_absolute_error: -2.3984 Time elapsed: 0.846s Bagging ----------------------------------------- Score --> neg_mean_absolute_error: -2.9165 \u00b1 0.2564 Time elapsed: 2.764s ------------------------------------------------- Total time: 3.611s Final results ========================= >> Duration: 4.894s ------------------------------------------ Decision Tree --> neg_mean_absolute_error: -4.331 \u00b1 0.525 ~ Bagging Regressor --> neg_mean_absolute_error: -3.096 \u00b1 0.268 ~ Extra-Trees --> neg_mean_absolute_error: -2.555 \u00b1 0.171 ~ ! 
Random Forest --> neg_mean_absolute_error: -2.957 \u00b1 0.225 ~ LightGBM --> neg_mean_absolute_error: -4.839 \u00b1 0.268 ~ CatBoost --> neg_mean_absolute_error: -2.916 \u00b1 0.256 ~ Run 1 (33% of set) ============================>> Models in pipeline: ET, CatB, RF Size of training set: 135 Size of test set: 101 Results for Extra-Trees: Fitting ----------------------------------------- Score on the train set --> neg_mean_absolute_error: -0.0000 Score on the test set --> neg_mean_absolute_error: -2.2361 Time elapsed: 0.098s Bagging ----------------------------------------- Score --> neg_mean_absolute_error: -2.6016 \u00b1 0.2890 Time elapsed: 0.414s ------------------------------------------------- Total time: 0.514s Results for CatBoost: Fitting ----------------------------------------- Score on the train set --> neg_mean_absolute_error: -0.2835 Score on the test set --> neg_mean_absolute_error: -2.4196 Time elapsed: 0.815s Bagging ----------------------------------------- Score --> neg_mean_absolute_error: -2.5681 \u00b1 0.2119 Time elapsed: 3.124s ------------------------------------------------- Total time: 3.942s Results for Random Forest: Fitting ----------------------------------------- Score on the train set --> neg_mean_absolute_error: -0.9820 Score on the test set --> neg_mean_absolute_error: -2.5055 Time elapsed: 0.129s Bagging ----------------------------------------- Score --> neg_mean_absolute_error: -2.6144 \u00b1 0.1188 Time elapsed: 0.590s ------------------------------------------------- Total time: 0.721s Final results ========================= >> Duration: 5.178s ------------------------------------------ Extra-Trees --> neg_mean_absolute_error: -2.602 \u00b1 0.289 ~ CatBoost --> neg_mean_absolute_error: -2.568 \u00b1 0.212 ~ ! 
Random Forest --> neg_mean_absolute_error: -2.614 \u00b1 0.119 ~ Run 2 (100% of set) ===========================>> Models in pipeline: CatB Size of training set: 405 Size of test set: 101 Results for CatBoost: Fitting ----------------------------------------- Score on the train set --> neg_mean_absolute_error: -0.3978 Score on the test set --> neg_mean_absolute_error: -1.8772 Time elapsed: 1.207s Bagging ----------------------------------------- Score --> neg_mean_absolute_error: -2.0501 \u00b1 0.0892 Time elapsed: 5.234s ------------------------------------------------- Total time: 6.444s Final results ========================= >> Duration: 6.445s ------------------------------------------ CatBoost --> neg_mean_absolute_error: -2.050 \u00b1 0.089 ~ Analyze results # Note that the results dataframe now is multi-index atom.results .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } name metric_train metric_test time_fit mean_bagging std_bagging time_bagging time run model 0 Tree Decision Tree -0.000000e+00 -3.325743 0.007s -4.330693 0.525026 0.018s 0.027s Bag Bagging Regressor -1.305373e+00 -2.695050 0.018s -3.095663 0.267668 0.079s 0.100s ET Extra-Trees -2.256238e-14 -2.154089 0.084s -2.555434 0.170823 0.357s 0.443s RF Random Forest -1.150866e+00 -2.414297 0.109s -2.957400 0.225311 0.509s 0.621s LGB LightGBM -3.420518e+00 -4.559962 0.027s -4.839315 0.268167 0.060s 0.091s CatB CatBoost -8.055503e-02 -2.398431 0.846s -2.916470 0.256428 2.764s 3.611s 1 ET Extra-Trees -2.315185e-14 -2.236079 0.098s -2.601648 0.289034 0.414s 0.514s CatB CatBoost -2.835499e-01 -2.419625 0.815s -2.568085 0.211868 3.124s 3.942s RF Random Forest -9.819778e-01 -2.505465 0.129s -2.614416 0.118758 0.590s 0.721s 2 CatB CatBoost -3.977985e-01 -1.877205 1.207s -2.050118 0.089185 5.234s 6.444s # Plot the successive halving's results atom.plot_successive_halving()","title":"Successive 
halving"},{"location":"examples/successive_halving/successive_halving/#successive-halving","text":"This example shows how to compare multiple tree-based models using successive halving. Import the boston dataset from sklearn.datasets . This is a small and easy to train dataset whose goal is to predict house prices.","title":"Successive halving"},{"location":"examples/successive_halving/successive_halving/#load-the-data","text":"# Import packages from sklearn.datasets import load_boston from atom import ATOMRegressor # Load the dataset's features and targets X, y = load_boston(return_X_y=True)","title":"Load the data"},{"location":"examples/successive_halving/successive_halving/#run-the-pipeline","text":"atom = ATOMRegressor(X, y, verbose=1, random_state=1) << ================== ATOM ================== >> Algorithm task: regression. Applying data cleaning... Dataset stats ================= >> Shape: (506, 14) Scaled: False ---------------------------------- Train set size: 405 Test set size: 101 # We can compare tree-based models via successive halving atom.successive_halving(['tree', 'bag', 'et', 'rf', 'lgb', 'catb'], metric='mae', bagging=5) Running pipeline ============================= >> Metric: neg_mean_absolute_error Run 0 (17% of set) ============================>> Models in pipeline: Tree, Bag, ET, RF, LGB, CatB Size of training set: 67 Size of test set: 101 Results for Decision Tree: Fitting ----------------------------------------- Score on the train set --> neg_mean_absolute_error: -0.0000 Score on the test set --> neg_mean_absolute_error: -3.3257 Time elapsed: 0.007s Bagging ----------------------------------------- Score --> neg_mean_absolute_error: -4.3307 \u00b1 0.5250 Time elapsed: 0.018s ------------------------------------------------- Total time: 0.027s Results for Bagging Regressor: Fitting ----------------------------------------- Score on the train set --> neg_mean_absolute_error: -1.3054 Score on the test set --> neg_mean_absolute_error: 
-2.6950 Time elapsed: 0.018s Bagging ----------------------------------------- Score --> neg_mean_absolute_error: -3.0957 \u00b1 0.2677 Time elapsed: 0.079s ------------------------------------------------- Total time: 0.100s Results for Extra-Trees: Fitting ----------------------------------------- Score on the train set --> neg_mean_absolute_error: -0.0000 Score on the test set --> neg_mean_absolute_error: -2.1541 Time elapsed: 0.084s Bagging ----------------------------------------- Score --> neg_mean_absolute_error: -2.5554 \u00b1 0.1708 Time elapsed: 0.357s ------------------------------------------------- Total time: 0.443s Results for Random Forest: Fitting ----------------------------------------- Score on the train set --> neg_mean_absolute_error: -1.1509 Score on the test set --> neg_mean_absolute_error: -2.4143 Time elapsed: 0.109s Bagging ----------------------------------------- Score --> neg_mean_absolute_error: -2.9574 \u00b1 0.2253 Time elapsed: 0.509s ------------------------------------------------- Total time: 0.621s Results for LightGBM: Fitting ----------------------------------------- Score on the train set --> neg_mean_absolute_error: -3.4205 Score on the test set --> neg_mean_absolute_error: -4.5600 Time elapsed: 0.027s Bagging ----------------------------------------- Score --> neg_mean_absolute_error: -4.8393 \u00b1 0.2682 Time elapsed: 0.060s ------------------------------------------------- Total time: 0.091s Results for CatBoost: Fitting ----------------------------------------- Score on the train set --> neg_mean_absolute_error: -0.0806 Score on the test set --> neg_mean_absolute_error: -2.3984 Time elapsed: 0.846s Bagging ----------------------------------------- Score --> neg_mean_absolute_error: -2.9165 \u00b1 0.2564 Time elapsed: 2.764s ------------------------------------------------- Total time: 3.611s Final results ========================= >> Duration: 4.894s ------------------------------------------ Decision Tree --> 
neg_mean_absolute_error: -4.331 \u00b1 0.525 ~ Bagging Regressor --> neg_mean_absolute_error: -3.096 \u00b1 0.268 ~ Extra-Trees --> neg_mean_absolute_error: -2.555 \u00b1 0.171 ~ ! Random Forest --> neg_mean_absolute_error: -2.957 \u00b1 0.225 ~ LightGBM --> neg_mean_absolute_error: -4.839 \u00b1 0.268 ~ CatBoost --> neg_mean_absolute_error: -2.916 \u00b1 0.256 ~ Run 1 (33% of set) ============================>> Models in pipeline: ET, CatB, RF Size of training set: 135 Size of test set: 101 Results for Extra-Trees: Fitting ----------------------------------------- Score on the train set --> neg_mean_absolute_error: -0.0000 Score on the test set --> neg_mean_absolute_error: -2.2361 Time elapsed: 0.098s Bagging ----------------------------------------- Score --> neg_mean_absolute_error: -2.6016 \u00b1 0.2890 Time elapsed: 0.414s ------------------------------------------------- Total time: 0.514s Results for CatBoost: Fitting ----------------------------------------- Score on the train set --> neg_mean_absolute_error: -0.2835 Score on the test set --> neg_mean_absolute_error: -2.4196 Time elapsed: 0.815s Bagging ----------------------------------------- Score --> neg_mean_absolute_error: -2.5681 \u00b1 0.2119 Time elapsed: 3.124s ------------------------------------------------- Total time: 3.942s Results for Random Forest: Fitting ----------------------------------------- Score on the train set --> neg_mean_absolute_error: -0.9820 Score on the test set --> neg_mean_absolute_error: -2.5055 Time elapsed: 0.129s Bagging ----------------------------------------- Score --> neg_mean_absolute_error: -2.6144 \u00b1 0.1188 Time elapsed: 0.590s ------------------------------------------------- Total time: 0.721s Final results ========================= >> Duration: 5.178s ------------------------------------------ Extra-Trees --> neg_mean_absolute_error: -2.602 \u00b1 0.289 ~ CatBoost --> neg_mean_absolute_error: -2.568 \u00b1 0.212 ~ ! 
Random Forest --> neg_mean_absolute_error: -2.614 \u00b1 0.119 ~ Run 2 (100% of set) ===========================>> Models in pipeline: CatB Size of training set: 405 Size of test set: 101 Results for CatBoost: Fitting ----------------------------------------- Score on the train set --> neg_mean_absolute_error: -0.3978 Score on the test set --> neg_mean_absolute_error: -1.8772 Time elapsed: 1.207s Bagging ----------------------------------------- Score --> neg_mean_absolute_error: -2.0501 \u00b1 0.0892 Time elapsed: 5.234s ------------------------------------------------- Total time: 6.444s Final results ========================= >> Duration: 6.445s ------------------------------------------ CatBoost --> neg_mean_absolute_error: -2.050 \u00b1 0.089 ~","title":"Run the pipeline"},{"location":"examples/successive_halving/successive_halving/#analyze-results","text":"# Note that the results dataframe now is multi-index atom.results .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } name metric_train metric_test time_fit mean_bagging std_bagging time_bagging time run model 0 Tree Decision Tree -0.000000e+00 -3.325743 0.007s -4.330693 0.525026 0.018s 0.027s Bag Bagging Regressor -1.305373e+00 -2.695050 0.018s -3.095663 0.267668 0.079s 0.100s ET Extra-Trees -2.256238e-14 -2.154089 0.084s -2.555434 0.170823 0.357s 0.443s RF Random Forest -1.150866e+00 -2.414297 0.109s -2.957400 0.225311 0.509s 0.621s LGB LightGBM -3.420518e+00 -4.559962 0.027s -4.839315 0.268167 0.060s 0.091s CatB CatBoost -8.055503e-02 -2.398431 0.846s -2.916470 0.256428 2.764s 3.611s 1 ET Extra-Trees -2.315185e-14 -2.236079 0.098s -2.601648 0.289034 0.414s 0.514s CatB CatBoost -2.835499e-01 -2.419625 0.815s -2.568085 0.211868 3.124s 3.942s RF Random Forest -9.819778e-01 -2.505465 0.129s -2.614416 0.118758 0.590s 0.721s 2 CatB CatBoost -3.977985e-01 -1.877205 1.207s -2.050118 0.089185 5.234s 6.444s # 
Plot the successive halving's results atom.plot_successive_halving()","title":"Analyze results"},{"location":"examples/train_sizing/train_sizing/","text":"Train sizing This example shows how to asses a model's performance based on the size of the training set. The data used is a variation on the Australian weather dataset from https://www.kaggle.com/jsphyg/weather-dataset-rattle-package. The goal of this dataset is to predict whether or not it will rain tomorrow training a binay classifier on target RainTomorrow. Load the data # Import packages import numpy as np import pandas as pd from atom import ATOMClassifier # Load the Australian weather dataset X = pd.read_csv('./datasets/weatherAUS.csv') # Let's have a look at a subset of the data X.sample(frac=1).iloc[:5, :8] .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } Location MinTemp MaxTemp Rainfall Evaporation Sunshine WindGustDir WindGustSpeed 3118 BadgerysCreek 11.7 23.2 0.0 NaN NaN SW 28.0 18965 NorahHead 10.2 19.4 0.0 NaN NaN SSE 30.0 11196 CoffsHarbour 9.7 21.2 0.0 NaN NaN NW 26.0 62283 Sale 8.4 21.7 0.0 NaN NaN WSW 41.0 92461 Townsville 11.1 27.1 0.0 7.6 10.7 ENE 37.0 Run the pipeline # Initialize ATOM and prepare the data atom = ATOMClassifier(X, verbose=2, random_state=1) atom.impute(strat_num='median', strat_cat='most_frequent', min_frac_rows=0.8) atom.encode() << ================== ATOM ================== >> Algorithm task: binary classification. Applying data cleaning... 
Dataset stats ================= >> Shape: (142193, 22) Missing values: 292032 Categorical columns: 5 Scaled: False ---------------------------------- Size of training set: 113755 Size of test set: 28438 ---------------------------------- Class balance: No:Yes <==> 3.5:1.0 Instances in RainTomorrow per class: | | total | train_set | test_set | |:-------|---------:|-------------:|------------:| | 0: No | 110316 | 88263 | 22053 | | 1: Yes | 31877 | 25492 | 6385 | Fitting Imputer... Imputing missing values... --> Dropping 15182 rows for containing less than 80% non-missing values. --> Imputing 100 missing values with median in feature MinTemp. --> Imputing 57 missing values with median in feature MaxTemp. --> Imputing 640 missing values with median in feature Rainfall. --> Imputing 46535 missing values with median in feature Evaporation. --> Imputing 53034 missing values with median in feature Sunshine. --> Imputing 4381 missing values with most_frequent in feature WindGustDir. --> Imputing 4359 missing values with median in feature WindGustSpeed. --> Imputing 6624 missing values with most_frequent in feature WindDir9am. --> Imputing 612 missing values with most_frequent in feature WindDir3pm. --> Imputing 80 missing values with median in feature WindSpeed9am. --> Imputing 49 missing values with median in feature WindSpeed3pm. --> Imputing 532 missing values with median in feature Humidity9am. --> Imputing 1168 missing values with median in feature Humidity3pm. --> Imputing 1028 missing values with median in feature Pressure9am. --> Imputing 972 missing values with median in feature Pressure3pm. --> Imputing 42172 missing values with median in feature Cloud9am. --> Imputing 44251 missing values with median in feature Cloud3pm. --> Imputing 98 missing values with median in feature Temp9am. --> Imputing 702 missing values with median in feature Temp3pm. --> Imputing 640 missing values with most_frequent in feature RainToday. Fitting Encoder... 
Encoding categorical columns... --> Target-encoding feature Location. Contains 45 unique categories. --> Target-encoding feature WindGustDir. Contains 16 unique categories. --> Target-encoding feature WindDir9am. Contains 16 unique categories. --> Target-encoding feature WindDir3pm. Contains 16 unique categories. --> Label-encoding feature RainToday. Contains 2 unique categories. # We can analyze the impact of the training set's size on a LightGBM model atom.train_sizing('lgb', train_sizes=np.linspace(0.1, 1, 9), bagging=4) Running pipeline ============================= >> Models in pipeline: LGB Metric: f1 Run 0 (10% of set) ============================>> Size of training set: 11375 Size of test set: 28438 Results for LightGBM: Fitting ----------------------------------------- Score on the train set --> f1: 0.8029 Score on the test set --> f1: 0.6086 Time elapsed: 0.998s Bagging ----------------------------------------- Score --> f1: 0.5945 \u00b1 0.0073 Time elapsed: 2.229s ------------------------------------------------- Total time: 3.242s Final results ========================= >> Duration: 3.244s ------------------------------------------ LightGBM --> f1: 0.594 \u00b1 0.007 ~ Run 1 (21% of set) ============================>> Size of training set: 24172 Size of test set: 28438 Results for LightGBM: Fitting ----------------------------------------- Score on the train set --> f1: 0.7292 Score on the test set --> f1: 0.6273 Time elapsed: 1.244s Bagging ----------------------------------------- Score --> f1: 0.6166 \u00b1 0.0053 Time elapsed: 2.879s ------------------------------------------------- Total time: 4.129s Final results ========================= >> Duration: 4.131s ------------------------------------------ LightGBM --> f1: 0.617 \u00b1 0.005 Run 2 (32% of set) ============================>> Size of training set: 36970 Size of test set: 28438 Results for LightGBM: Fitting ----------------------------------------- Score on the train set --> f1: 0.6955 
Score on the test set --> f1: 0.6325 Time elapsed: 1.533s Bagging ----------------------------------------- Score --> f1: 0.6199 \u00b1 0.0038 Time elapsed: 3.502s ------------------------------------------------- Total time: 5.039s Final results ========================= >> Duration: 5.042s ------------------------------------------ LightGBM --> f1: 0.620 \u00b1 0.004 Run 3 (44% of set) ============================>> Size of training set: 49767 Size of test set: 28438 Results for LightGBM: Fitting ----------------------------------------- Score on the train set --> f1: 0.6832 Score on the test set --> f1: 0.6386 Time elapsed: 1.825s Bagging ----------------------------------------- Score --> f1: 0.6256 \u00b1 0.0036 Time elapsed: 4.148s ------------------------------------------------- Total time: 5.979s Final results ========================= >> Duration: 5.981s ------------------------------------------ LightGBM --> f1: 0.626 \u00b1 0.004 Run 4 (55% of set) ============================>> Size of training set: 62565 Size of test set: 28438 Results for LightGBM: Fitting ----------------------------------------- Score on the train set --> f1: 0.6818 Score on the test set --> f1: 0.6391 Time elapsed: 2.152s Bagging ----------------------------------------- Score --> f1: 0.6271 \u00b1 0.0025 Time elapsed: 4.838s ------------------------------------------------- Total time: 6.996s Final results ========================= >> Duration: 6.998s ------------------------------------------ LightGBM --> f1: 0.627 \u00b1 0.002 Run 5 (66% of set) ============================>> Size of training set: 75362 Size of test set: 28438 Results for LightGBM: Fitting ----------------------------------------- Score on the train set --> f1: 0.6767 Score on the test set --> f1: 0.6399 Time elapsed: 2.418s Bagging ----------------------------------------- Score --> f1: 0.6346 \u00b1 0.0021 Time elapsed: 5.622s ------------------------------------------------- Total time: 8.045s Final results 
========================= >> Duration: 8.047s ------------------------------------------ LightGBM --> f1: 0.635 \u00b1 0.002 Run 6 (77% of set) ============================>> Size of training set: 88160 Size of test set: 28438 Results for LightGBM: Fitting ----------------------------------------- Score on the train set --> f1: 0.6665 Score on the test set --> f1: 0.6384 Time elapsed: 2.810s Bagging ----------------------------------------- Score --> f1: 0.6342 \u00b1 0.0021 Time elapsed: 6.240s ------------------------------------------------- Total time: 9.058s Final results ========================= >> Duration: 9.060s ------------------------------------------ LightGBM --> f1: 0.634 \u00b1 0.002 Run 7 (89% of set) ============================>> Size of training set: 100957 Size of test set: 28438 Results for LightGBM: Fitting ----------------------------------------- Score on the train set --> f1: 0.6651 Score on the test set --> f1: 0.6432 Time elapsed: 3.063s Bagging ----------------------------------------- Score --> f1: 0.6372 \u00b1 0.0025 Time elapsed: 6.888s ------------------------------------------------- Total time: 9.958s Final results ========================= >> Duration: 9.960s ------------------------------------------ LightGBM --> f1: 0.637 \u00b1 0.003 Run 8 (100% of set) ===========================>> Size of training set: 113755 Size of test set: 28438 Results for LightGBM: Fitting ----------------------------------------- Score on the train set --> f1: 0.6650 Score on the test set --> f1: 0.6549 Time elapsed: 3.379s Bagging ----------------------------------------- Score --> f1: 0.6508 \u00b1 0.0026 Time elapsed: 7.621s ------------------------------------------------- Total time: 11.009s Final results ========================= >> Duration: 11.012s ------------------------------------------ LightGBM --> f1: 0.651 \u00b1 0.003 Analyze the results # Note that the results dataframe now is multi-index atom.results .dataframe tbody tr 
th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } name score_train score_test time_fit mean_bagging std_bagging time_bagging time run model 0 LGB LightGBM 0.802859 0.608590 0.998s 0.594472 0.007341 2.229s 3.242s 1 LGB LightGBM 0.729212 0.627277 1.244s 0.616583 0.005321 2.879s 4.129s 2 LGB LightGBM 0.695463 0.632544 1.533s 0.619899 0.003822 3.502s 5.039s 3 LGB LightGBM 0.683228 0.638575 1.825s 0.625589 0.003608 4.148s 5.979s 4 LGB LightGBM 0.681811 0.639062 2.152s 0.627105 0.002460 4.838s 6.996s 5 LGB LightGBM 0.676747 0.639897 2.418s 0.634642 0.002138 5.622s 8.045s 6 LGB LightGBM 0.666471 0.638376 2.810s 0.634245 0.002098 6.240s 9.058s 7 LGB LightGBM 0.665065 0.643197 3.063s 0.637232 0.002537 6.888s 9.958s 8 LGB LightGBM 0.665018 0.654904 3.379s 0.650772 0.002577 7.621s 11.009s # Plot the train sizing's results atom.plot_learning_curve()","title":"Train sizing"},{"location":"examples/train_sizing/train_sizing/#train-sizing","text":"This example shows how to asses a model's performance based on the size of the training set. The data used is a variation on the Australian weather dataset from https://www.kaggle.com/jsphyg/weather-dataset-rattle-package. 
The goal of this dataset is to predict whether or not it will rain tomorrow training a binay classifier on target RainTomorrow.","title":"Train sizing"},{"location":"examples/train_sizing/train_sizing/#load-the-data","text":"# Import packages import numpy as np import pandas as pd from atom import ATOMClassifier # Load the Australian weather dataset X = pd.read_csv('./datasets/weatherAUS.csv') # Let's have a look at a subset of the data X.sample(frac=1).iloc[:5, :8] .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } Location MinTemp MaxTemp Rainfall Evaporation Sunshine WindGustDir WindGustSpeed 3118 BadgerysCreek 11.7 23.2 0.0 NaN NaN SW 28.0 18965 NorahHead 10.2 19.4 0.0 NaN NaN SSE 30.0 11196 CoffsHarbour 9.7 21.2 0.0 NaN NaN NW 26.0 62283 Sale 8.4 21.7 0.0 NaN NaN WSW 41.0 92461 Townsville 11.1 27.1 0.0 7.6 10.7 ENE 37.0","title":"Load the data"},{"location":"examples/train_sizing/train_sizing/#run-the-pipeline","text":"# Initialize ATOM and prepare the data atom = ATOMClassifier(X, verbose=2, random_state=1) atom.impute(strat_num='median', strat_cat='most_frequent', min_frac_rows=0.8) atom.encode() << ================== ATOM ================== >> Algorithm task: binary classification. Applying data cleaning... Dataset stats ================= >> Shape: (142193, 22) Missing values: 292032 Categorical columns: 5 Scaled: False ---------------------------------- Size of training set: 113755 Size of test set: 28438 ---------------------------------- Class balance: No:Yes <==> 3.5:1.0 Instances in RainTomorrow per class: | | total | train_set | test_set | |:-------|---------:|-------------:|------------:| | 0: No | 110316 | 88263 | 22053 | | 1: Yes | 31877 | 25492 | 6385 | Fitting Imputer... Imputing missing values... --> Dropping 15182 rows for containing less than 80% non-missing values. --> Imputing 100 missing values with median in feature MinTemp. 
--> Imputing 57 missing values with median in feature MaxTemp. --> Imputing 640 missing values with median in feature Rainfall. --> Imputing 46535 missing values with median in feature Evaporation. --> Imputing 53034 missing values with median in feature Sunshine. --> Imputing 4381 missing values with most_frequent in feature WindGustDir. --> Imputing 4359 missing values with median in feature WindGustSpeed. --> Imputing 6624 missing values with most_frequent in feature WindDir9am. --> Imputing 612 missing values with most_frequent in feature WindDir3pm. --> Imputing 80 missing values with median in feature WindSpeed9am. --> Imputing 49 missing values with median in feature WindSpeed3pm. --> Imputing 532 missing values with median in feature Humidity9am. --> Imputing 1168 missing values with median in feature Humidity3pm. --> Imputing 1028 missing values with median in feature Pressure9am. --> Imputing 972 missing values with median in feature Pressure3pm. --> Imputing 42172 missing values with median in feature Cloud9am. --> Imputing 44251 missing values with median in feature Cloud3pm. --> Imputing 98 missing values with median in feature Temp9am. --> Imputing 702 missing values with median in feature Temp3pm. --> Imputing 640 missing values with most_frequent in feature RainToday. Fitting Encoder... Encoding categorical columns... --> Target-encoding feature Location. Contains 45 unique categories. --> Target-encoding feature WindGustDir. Contains 16 unique categories. --> Target-encoding feature WindDir9am. Contains 16 unique categories. --> Target-encoding feature WindDir3pm. Contains 16 unique categories. --> Label-encoding feature RainToday. Contains 2 unique categories. 
# We can analyze the impact of the training set's size on a LightGBM model atom.train_sizing('lgb', train_sizes=np.linspace(0.1, 1, 9), bagging=4) Running pipeline ============================= >> Models in pipeline: LGB Metric: f1 Run 0 (10% of set) ============================>> Size of training set: 11375 Size of test set: 28438 Results for LightGBM: Fitting ----------------------------------------- Score on the train set --> f1: 0.8029 Score on the test set --> f1: 0.6086 Time elapsed: 0.998s Bagging ----------------------------------------- Score --> f1: 0.5945 \u00b1 0.0073 Time elapsed: 2.229s ------------------------------------------------- Total time: 3.242s Final results ========================= >> Duration: 3.244s ------------------------------------------ LightGBM --> f1: 0.594 \u00b1 0.007 ~ Run 1 (21% of set) ============================>> Size of training set: 24172 Size of test set: 28438 Results for LightGBM: Fitting ----------------------------------------- Score on the train set --> f1: 0.7292 Score on the test set --> f1: 0.6273 Time elapsed: 1.244s Bagging ----------------------------------------- Score --> f1: 0.6166 \u00b1 0.0053 Time elapsed: 2.879s ------------------------------------------------- Total time: 4.129s Final results ========================= >> Duration: 4.131s ------------------------------------------ LightGBM --> f1: 0.617 \u00b1 0.005 Run 2 (32% of set) ============================>> Size of training set: 36970 Size of test set: 28438 Results for LightGBM: Fitting ----------------------------------------- Score on the train set --> f1: 0.6955 Score on the test set --> f1: 0.6325 Time elapsed: 1.533s Bagging ----------------------------------------- Score --> f1: 0.6199 \u00b1 0.0038 Time elapsed: 3.502s ------------------------------------------------- Total time: 5.039s Final results ========================= >> Duration: 5.042s ------------------------------------------ LightGBM --> f1: 0.620 \u00b1 0.004 Run 3 (44% of 
set) ============================>> Size of training set: 49767 Size of test set: 28438 Results for LightGBM: Fitting ----------------------------------------- Score on the train set --> f1: 0.6832 Score on the test set --> f1: 0.6386 Time elapsed: 1.825s Bagging ----------------------------------------- Score --> f1: 0.6256 \u00b1 0.0036 Time elapsed: 4.148s ------------------------------------------------- Total time: 5.979s Final results ========================= >> Duration: 5.981s ------------------------------------------ LightGBM --> f1: 0.626 \u00b1 0.004 Run 4 (55% of set) ============================>> Size of training set: 62565 Size of test set: 28438 Results for LightGBM: Fitting ----------------------------------------- Score on the train set --> f1: 0.6818 Score on the test set --> f1: 0.6391 Time elapsed: 2.152s Bagging ----------------------------------------- Score --> f1: 0.6271 \u00b1 0.0025 Time elapsed: 4.838s ------------------------------------------------- Total time: 6.996s Final results ========================= >> Duration: 6.998s ------------------------------------------ LightGBM --> f1: 0.627 \u00b1 0.002 Run 5 (66% of set) ============================>> Size of training set: 75362 Size of test set: 28438 Results for LightGBM: Fitting ----------------------------------------- Score on the train set --> f1: 0.6767 Score on the test set --> f1: 0.6399 Time elapsed: 2.418s Bagging ----------------------------------------- Score --> f1: 0.6346 \u00b1 0.0021 Time elapsed: 5.622s ------------------------------------------------- Total time: 8.045s Final results ========================= >> Duration: 8.047s ------------------------------------------ LightGBM --> f1: 0.635 \u00b1 0.002 Run 6 (77% of set) ============================>> Size of training set: 88160 Size of test set: 28438 Results for LightGBM: Fitting ----------------------------------------- Score on the train set --> f1: 0.6665 Score on the test set --> f1: 0.6384 Time 
elapsed: 2.810s Bagging ----------------------------------------- Score --> f1: 0.6342 \u00b1 0.0021 Time elapsed: 6.240s ------------------------------------------------- Total time: 9.058s Final results ========================= >> Duration: 9.060s ------------------------------------------ LightGBM --> f1: 0.634 \u00b1 0.002 Run 7 (89% of set) ============================>> Size of training set: 100957 Size of test set: 28438 Results for LightGBM: Fitting ----------------------------------------- Score on the train set --> f1: 0.6651 Score on the test set --> f1: 0.6432 Time elapsed: 3.063s Bagging ----------------------------------------- Score --> f1: 0.6372 \u00b1 0.0025 Time elapsed: 6.888s ------------------------------------------------- Total time: 9.958s Final results ========================= >> Duration: 9.960s ------------------------------------------ LightGBM --> f1: 0.637 \u00b1 0.003 Run 8 (100% of set) ===========================>> Size of training set: 113755 Size of test set: 28438 Results for LightGBM: Fitting ----------------------------------------- Score on the train set --> f1: 0.6650 Score on the test set --> f1: 0.6549 Time elapsed: 3.379s Bagging ----------------------------------------- Score --> f1: 0.6508 \u00b1 0.0026 Time elapsed: 7.621s ------------------------------------------------- Total time: 11.009s Final results ========================= >> Duration: 11.012s ------------------------------------------ LightGBM --> f1: 0.651 \u00b1 0.003","title":"Run the pipeline"},{"location":"examples/train_sizing/train_sizing/#analyze-the-results","text":"# Note that the results dataframe now is multi-index atom.results .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } name score_train score_test time_fit mean_bagging std_bagging time_bagging time run model 0 LGB LightGBM 0.802859 0.608590 0.998s 0.594472 0.007341 2.229s 3.242s 1 LGB 
LightGBM 0.729212 0.627277 1.244s 0.616583 0.005321 2.879s 4.129s 2 LGB LightGBM 0.695463 0.632544 1.533s 0.619899 0.003822 3.502s 5.039s 3 LGB LightGBM 0.683228 0.638575 1.825s 0.625589 0.003608 4.148s 5.979s 4 LGB LightGBM 0.681811 0.639062 2.152s 0.627105 0.002460 4.838s 6.996s 5 LGB LightGBM 0.676747 0.639897 2.418s 0.634642 0.002138 5.622s 8.045s 6 LGB LightGBM 0.666471 0.638376 2.810s 0.634245 0.002098 6.240s 9.058s 7 LGB LightGBM 0.665065 0.643197 3.063s 0.637232 0.002537 6.888s 9.958s 8 LGB LightGBM 0.665018 0.654904 3.379s 0.650772 0.002577 7.621s 11.009s # Plot the train sizing's results atom.plot_learning_curve()","title":"Analyze the results"}]} \ No newline at end of file +{"config":{"lang":["en"],"min_search_length":3,"prebuild_index":false,"separator":"[\\s\\-]+"},"docs":[{"location":"","text":"Automated Tool for Optimized Modelling There is no magic formula in data science that can tell us which type of machine learning algorithm will perform best for a specific use-case. Different models are better suited for different types of data and different problems. At best, you can follow some rough guide on how to approach problems with regard to which model to try on your data, but these are often more confusing than helpful. Best practices tell us to start with a simple model (e.g. linear regression) and build up to more complicated models (e.g. logistic regression -> random forest -> multilayer perceptron) if you are not satisfied with the results. Unfortunately, different models require different data cleaning steps, different type/amount of features, tuning a new set of hyperparameters, etc. Refactoring the code for this purpose can be quite boring and time consuming. Because of this, many data scientists end up just using the model best known to them and fine-tuning this particular model without ever trying different ones. 
This can result in poor performance (because the model is just not the right one for the task) or in poor time management (because you could have achieved a similar performance with a simpler/faster model). ATOM is here to help us solve these issues. With just a few lines of code, you can perform basic data cleaning steps, select relevant features and compare the performance of multiple models on a given dataset. ATOM should be able to provide quick insights on which algorithms perform best for the task at hand and provide an indication of the feasibility of the ML solution. It is important to realize that ATOM is not here to replace all the work a data scientist has to do before getting his model into production. ATOM doesn't spit out production-ready models just by tuning some parameters in its API. After helping you to determine the right model, you will most probably need to fine-tune it using use-case specific features and data cleaning steps in order to achieve maximum performance. So, this sounds a bit like AutoML, how is ATOM different than auto-sklearn or TPOT ? Well, ATOM does AutoML in the sense that it helps you find the best model for a specific task, but contrary to the aforementioned packages, it does not actively search for the best model. It simply runs all of them and let you pick the one that you think suites you best. AutoML packages are often black boxes: if you provide data, it will magically return a working model. Although it works great, they often produce complicated pipelines with low explainability, hard to sell to the business. In this, ATOM excels. Every step of the pipeline is accounted for, and using the provided plotting methods, it\u2019s easy to demonstrate why a model is better/worse than the other. Note A data scientist with domain knowledge can outperform ATOM if he applies usecase-specific feature engineering or data cleaning steps! 
Example steps taken by ATOM's pipeline: Data Cleaning Handle missing values Encode categorical features Remove outliers Balance the dataset Feature engineering Create new non-linear features Remove multi-collinear features Remove features with too low variance Select the most promising features based on a statistical test Train and validate multiple models Select hyperparameters using a Bayesian Optimization approach Train and test the models on the provided data Perform bagging to assess the robustness of the output Analyze the results Get the model scores on various metrics Make plots to compare the model performances","title":"Home"},{"location":"#automated-tool-for-optimized-modelling","text":"There is no magic formula in data science that can tell us which type of machine learning algorithm will perform best for a specific use-case. Different models are better suited for different types of data and different problems. At best, you can follow some rough guide on how to approach problems with regard to which model to try on your data, but these are often more confusing than helpful. Best practices tell us to start with a simple model (e.g. linear regression) and build up to more complicated models (e.g. logistic regression -> random forest -> multilayer perceptron) if you are not satisfied with the results. Unfortunately, different models require different data cleaning steps, different type/amount of features, tuning a new set of hyperparameters, etc. Refactoring the code for this purpose can be quite boring and time consuming. Because of this, many data scientists end up just using the model best known to them and fine-tuning this particular model without ever trying different ones. This can result in poor performance (because the model is just not the right one for the task) or in poor time management (because you could have achieved a similar performance with a simpler/faster model). ATOM is here to help us solve these issues. 
With just a few lines of code, you can perform basic data cleaning steps, select relevant features and compare the performance of multiple models on a given dataset. ATOM should be able to provide quick insights on which algorithms perform best for the task at hand and provide an indication of the feasibility of the ML solution. It is important to realize that ATOM is not here to replace all the work a data scientist has to do before getting his model into production. ATOM doesn't spit out production-ready models just by tuning some parameters in its API. After helping you to determine the right model, you will most probably need to fine-tune it using use-case specific features and data cleaning steps in order to achieve maximum performance. So, this sounds a bit like AutoML, how is ATOM different than auto-sklearn or TPOT ? Well, ATOM does AutoML in the sense that it helps you find the best model for a specific task, but contrary to the aforementioned packages, it does not actively search for the best model. It simply runs all of them and let you pick the one that you think suites you best. AutoML packages are often black boxes: if you provide data, it will magically return a working model. Although it works great, they often produce complicated pipelines with low explainability, hard to sell to the business. In this, ATOM excels. Every step of the pipeline is accounted for, and using the provided plotting methods, it\u2019s easy to demonstrate why a model is better/worse than the other. Note A data scientist with domain knowledge can outperform ATOM if he applies usecase-specific feature engineering or data cleaning steps! 
Example steps taken by ATOM's pipeline: Data Cleaning Handle missing values Encode categorical features Remove outliers Balance the dataset Feature engineering Create new non-linear features Remove multi-collinear features Remove features with too low variance Select the most promising features based on a statistical test Train and validate multiple models Select hyperparameters using a Bayesian Optimization approach Train and test the models on the provided data Perform bagging to assess the robustness of the output Analyze the results Get the model scores on various metrics Make plots to compare the model performances","title":"Automated Tool for Optimized Modelling"},{"location":"dependencies/","text":"Python As of the moment, ATOM supports Python 3.6 , 3.7 and 3.8 . Packages ATOM is built on top of several existing Python libraries. The required packages are necessary for it's correct functioning. Additionally, you can install some optional packages to use machine learning estimators not provided by sklearn. Required numpy (>=1.17.2) scipy (>=1.4.1) pandas (>=1.0.3) tqdm (>=4.35.0) joblib (>=0.16.0) typeguard (>=2.7.1) tabulate (>=0.8.6) scikit-learn (>=0.23.1) scikit-optimize (>=0.7.4) pandas-profiling (>=2.3.0) category-encoders (>=2.1.0) imbalanced-learn (>=0.5.0) featuretools (>=0.17.0) gplearn (>=0.4.1) matplotlib (>=3.3.0) seaborn (>=0.9.0) shap (>=0.36.0) Optional xgboost (>=0.90) lightgbm (>=2.3.0) catboost (>=0.19.1) Support ATOM recognizes the support from JetBrains by providing core project contributors with a set of developer tools free of charge.","title":"Dependencies"},{"location":"dependencies/#python","text":"As of the moment, ATOM supports Python 3.6 , 3.7 and 3.8 .","title":"Python"},{"location":"dependencies/#packages","text":"ATOM is built on top of several existing Python libraries. The required packages are necessary for it's correct functioning. 
Additionally, you can install some optional packages to use machine learning estimators not provided by sklearn.","title":"Packages"},{"location":"dependencies/#required","text":"numpy (>=1.17.2) scipy (>=1.4.1) pandas (>=1.0.3) tqdm (>=4.35.0) joblib (>=0.16.0) typeguard (>=2.7.1) tabulate (>=0.8.6) scikit-learn (>=0.23.1) scikit-optimize (>=0.7.4) pandas-profiling (>=2.3.0) category-encoders (>=2.1.0) imbalanced-learn (>=0.5.0) featuretools (>=0.17.0) gplearn (>=0.4.1) matplotlib (>=3.3.0) seaborn (>=0.9.0) shap (>=0.36.0)","title":"Required"},{"location":"dependencies/#optional","text":"xgboost (>=0.90) lightgbm (>=2.3.0) catboost (>=0.19.1)","title":"Optional"},{"location":"dependencies/#support","text":"ATOM recognizes the support from JetBrains by providing core project contributors with a set of developer tools free of charge.","title":"Support"},{"location":"getting_started/","text":"Installation Note Since atom was already taken, download the package under the name atom-ml ! Intall ATOM's newest release easily via pip : $ pip install -U atom-ml or via conda : $ conda install -c conda-forge atom-ml Usage Call the ATOMClassifier or ATOMRegressor class and provide the data you want to use: from sklearn.datasets import load_breast_cancer from atom import ATOMClassifier X, y = load_breast_cancer(return_X_y) atom = ATOMClassifier(X, y, logger='auto', n_jobs=2, verbose=2) ATOM has multiple data cleaning methods to help you prepare the data for modelling: atom.impute(strat_num='knn', strat_cat='most_frequent', min_frac_rows=0.1) atom.encode(strategy='Target', max_onehot=8, frac_to_other=0.05) atom.feature_selection(strategy='PCA', n_features=12) Train and evaluate the models you want to compare: atom.run(models=['LR', 'LDA', 'XGB', 'lSVM'], metric='f1', n_calls=25, n_initial_points=10, bagging=4) Make plots to analyze the results: atom.plot_bagging(figsize=(9, 6), filename='bagging_results.png') atom.LDA.plot_confusion_matrix(normalize=True, 
filename='cm.png')","title":"Getting started"},{"location":"getting_started/#installation","text":"Note Since atom was already taken, download the package under the name atom-ml ! Intall ATOM's newest release easily via pip : $ pip install -U atom-ml or via conda : $ conda install -c conda-forge atom-ml","title":"Installation"},{"location":"getting_started/#usage","text":"Call the ATOMClassifier or ATOMRegressor class and provide the data you want to use: from sklearn.datasets import load_breast_cancer from atom import ATOMClassifier X, y = load_breast_cancer(return_X_y) atom = ATOMClassifier(X, y, logger='auto', n_jobs=2, verbose=2) ATOM has multiple data cleaning methods to help you prepare the data for modelling: atom.impute(strat_num='knn', strat_cat='most_frequent', min_frac_rows=0.1) atom.encode(strategy='Target', max_onehot=8, frac_to_other=0.05) atom.feature_selection(strategy='PCA', n_features=12) Train and evaluate the models you want to compare: atom.run(models=['LR', 'LDA', 'XGB', 'lSVM'], metric='f1', n_calls=25, n_initial_points=10, bagging=4) Make plots to analyze the results: atom.plot_bagging(figsize=(9, 6), filename='bagging_results.png') atom.LDA.plot_confusion_matrix(normalize=True, filename='cm.png')","title":"Usage"},{"location":"license/","text":"MIT License Copyright (c) 2020 tvdboom Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the \"Software\"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.","title":"License"},{"location":"license/#mit-license","text":"Copyright (c) 2020 tvdboom Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the \"Software\"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.","title":"MIT License"},{"location":"user_guide/","text":"Introduction There is no magic formula in data science that can tell us which type of machine learning algorithm will perform best for a specific use-case. Different models are better suited for different types of data and different problems. 
At best, you can follow some rough guide on how to approach problems with regard to which model to try on your data, but these are often more confusing than helpful. Best practices tell us to start with a simple model (e.g. linear regression) and build up to more complicated models (e.g. logistic regression -> random forest -> multilayer perceptron) if you are not satisfied with the results. Unfortunately, different models require different data cleaning steps, different type/amount of features, tuning a new set of hyperparameters, etc. Refactoring the code for this purpose can be quite boring and time consuming. Because of this, many data scientists end up just using the model best known to them and fine-tuning this particular model without ever trying different ones. This can result in poor performance (because the model is just not the right one for the task) or in poor time management (because you could have achieved a similar performance with a simpler/faster model). ATOM is here to help us solve these issues. With just a few lines of code, you can perform basic data cleaning steps, select relevant features and compare the performance of multiple models on a given dataset. ATOM should be able to provide quick insights on which algorithms perform best for the task at hand and provide an indication of the feasibility of the ML solution. It is important to realize that ATOM is not here to replace all the work a data scientist has to do before getting his model into production. ATOM doesn't spit out production-ready models just by tuning some parameters in its API. After helping you to determine the right model, you will most probably need to fine-tune it using use-case specific features and data cleaning steps in order to achieve maximum performance. So, this sounds a bit like AutoML, how is ATOM different than auto-sklearn or TPOT ? 
Well, ATOM does AutoML in the sense that it helps you find the best model for a specific task, but contrary to the aforementioned packages, it does not actively search for the best model. It simply runs all of them and let you pick the one that you think suites you best. AutoML packages are often black boxes: if you provide data, it will magically return a working model. Although it works great, they often produce complicated pipelines with low explainability, hard to sell to the business. In this, ATOM excels. Every step of the pipeline is accounted for, and using the provided plotting methods, it\u2019s easy to demonstrate why a model is better/worse than the other. Nomenclature In this documentation we will consistently use terms to refer to certain concepts related to the ATOM package. ATOM : Refers to this package. task : Refers to one of the three supervised machine learning approaches that ATOM supports: binary classification multiclass classification regression category : Refers to one of the unique values in a column, i.e. a binary classifier has 2 categories in the target column. missing values : Refers to None , NaN and inf values. categorical columns : Refers to all columns with dtype.kind not in ifu . atom : Refers to an ATOMClassifier or ATOMRegressor instance (note that all examples use it as variable name for the instance). model : Refers to one of the model instances. estimator : Actual estimator corresponding to a model. Implemented by an external package. BO : Bayesian optimization algorithm used for hyperparameter optimization. training : Refers to an instance of one of the classes that train and evaluate the models. The classes are: ATOMClassifier ATOMRegressor TrainerClassifier TrainerRegressor SuccessiveHalvingClassifier SuccessiveHavingRegressor TrainSizingClassifier TrainSizingRegressor Note Note that atom instances are also training instances! First steps You can quickly install atom using pip or conda , see the installation guide . 
ATOM contains a variety of classes to perform data cleaning, feature engineering, model training and much more. The easiest way to use all these classes on the same dataset is through one of the main classes: ATOMClassifier for binary or multiclass classification tasks. ATOMRegressor for regression tasks. These two classes are convenient wrappers for all the possibilities this package provides. Like a Pipeline , they assemble several steps that can be cross-validated together while setting different parameters. There are some important differences with sklearn's API: atom is initialized with the data you want to manipulate. This data can be accessed at any moment through atom 's data attributes . The classes in ATOM's API are reached through atom 's methods. For example, calling the encode method, will initialize an Encoder instance, fit it on the training set and transform the whole dataset. The transformations are applied immediately after calling the method (there is no fit method). This approach gives the user a clearer overview and more control over every step in the pipeline. The pipeline does not have to end with an estimator. ATOM can be just for data cleaning or feature engineering purposes only. Let's get started with an example! First, initialize atom and provide it the data you want to use. atom = ATOMClassifier(X, y) Apply data cleaning methods through the class. For example, calling the impute method will handle all missing values in the dataset. atom.impute(strat_num='median', strat_cat='most_frequent', min_frac_rows=0.1) Select the best hyperparameters and fit a Random Forest and AdaBoost model. atom.run(['RF', 'AdaB'], metric='accuracy', n_calls=25, n_initial_points=10) Analyze the results: atom.feature_importances(show=10, filename='feature_importance_plot') atom.plot_prc(title='Precision-recall curve comparison plot') Data cleaning More often than not, you need to do some data cleaning before fitting your dataset to a model. 
Usually, this involves importing different libraries and writing many lines of code. Since ATOM is all about fast exploration and experimentation, it provides various data cleaning classes to apply the most common transformations fast and easy. Scaling the feature set Standardization of a dataset is a common requirement for many machine learning estimators: they might behave badly if the individual features do not more or less look like standard normally distributed data (e.g. Gaussian with 0 mean and unit variance). The Scaler class scales data to mean=0 and std=1. It can be accessed from atom through the scale method. Standard data cleaning There are many data cleaning steps that are useful to perform on any dataset before modelling. These are general rules that apply on every use-case and every task. The StandardCleaner class is a convenient tool to apply such steps. It is automatically called when initializing atom . Use the class' parameters to choose which transformations to perform. The available steps are: Remove columns with prohibited data types. Strip categorical features from white spaces. Remove categorical columns with maximal cardinality. Remove columns with minimum cardinality. Remove rows with missing values in the target column. Label-encode the target column. Imputing missing values For various reasons, many real world datasets contain missing values, often encoded as blanks, NaNs or other placeholders. Such datasets however are incompatible with ATOM's models which assume that all values in an array are numerical, and that all have and hold meaning. The Imputer class handles missing values in the dataset by either dropping or imputing the value. It can be accessed from atom through the impute method. Tip Use atom 's missing attribute for an overview of the missing values in the dataset. Encoding categorical features Many datasets will contain categorical features. 
Their variables are typically stored as text values which represent various traits. Some examples include color (\u201cRed\u201d, \u201cYellow\u201d, \u201cBlue\u201d), size (\u201cSmall\u201d, \u201cMedium\u201d, \u201cLarge\u201d) or geographic designations (city or country). Regardless of what the value is used for, the challenge is determining how to use this data in the analysis. ATOM's models don't support direct manipulation of this kind of data. Use the Encoder class to encode categorical features to numerical values. It can be accessed from atom through the encode method. Tip Use atom 's categorical attribute for a list of the categorical columns in the dataset. Handling outliers When modeling, it is important to clean the data sample to ensure that the observations best represent the problem. Sometimes a dataset can contain extreme values that are outside the range of what is expected and unlike the other data. These are called outliers. Often, machine learning modeling and model skill in general can be improved by understanding and even removing these outlier values. The Outliers class can drop or impute outliers in the dataset. It can be accessed from atom through the outliers method. Balancing the data One of the common issues found in datasets that are used for classification is imbalanced classes. Data imbalance usually reflects an unequal distribution of classes within a dataset. For example, in a credit card fraud detection dataset, most of the transactions are non-fraud and a very few cases are fraud. This leaves us with a very unbalanced ratio of fraud vs non-fraud cases. The Balancer class can oversample the minority category or undersample the majority category. It can be accessed from atom through the balance method. Feature engineering \"Applied machine learning\" is basically feature engineering. ~ Andrew Ng. 
Feature engineering is the process of creating new features from the existing ones, in order to capture relationships with the target column that the first set of features didn't had on their own. This process is very important to improve the performance of machine learning algorithms. Although feature engineering works best when the data scientist applies use-case specific transformations, there are ways to do this in an automated manner, without prior domain knowledge. One of the problems of creating new features without human expert intervention, is that many of the newly created features can be useless, i.e. they do not help the algorithm to make better predictions. Even worse, having useless features can drop your performance. To avoid this, we perform feature selection, a process in which we select the relevant features in the dataset. See here an example. Generating new features The FeatureGenerator class creates new non-linear features based on the original feature set. It can be accessed from atom through the feature_generation method. You can choose between two strategies: Deep Feature Synthesis and Genetic Feature Generation. Deep Feature Synthesis Deep feature synthesis (DFS) applies the selected operators on the features in the dataset. For example, if the operator is 'log', it will create the new feature LOG(old_feature) and if the operator is 'mul', it will create the new feature old_feature_1 x old_feature_2 . The operators can be chosen through the operators parameter. Available options are: add: Sum two features together. sub: Subtract two features from each other. mul: Multiply two features with each other. div: Divide two features with each other. srqt: Take the square root of a feature. log: Take the logarithm of a feature. sin: Calculate the sine of a feature. cos: Calculate the cosine of a feature. tan: Calculate the tangent of a feature. ATOM's implementation of DFS uses the featuretools package. 
Tip DFS can create many new features and not all of them will be useful. Use FeatureSelector to reduce the number of features! Warning Using the div, log or sqrt operators can return new features with inf or NaN values. Check the warnings that may pop up or use atom 's missing property. Warning When using DFS with n_jobs>1 , make sure to protect your code with if __name__ == \"__main__\" . Featuretools uses dask , which uses python multiprocessing for parallelization. The spawn method on multiprocessing starts a new python process, which requires it to import the __main__ module before it can do its task. Genetic Feature Generation Genetic feature generation (GFG) uses genetic programming , a branch of evolutionary programming, to determine which features are successful and create new ones based on those. Where DFS' method can be seen as some kind of \"brute force\" for feature engineering, GFG tries to improve its features with every generation of the algorithm. GFG uses the same operators as DFS, but instead of only applying the transformations once, it evolves them further, creating complicated non-linear combinations of features with many transformations. The new features are given the name Feature N for the N-th feature. You can access the genetic feature's fitness and description (how they are calculated) through the genetic_features attribute. ATOM uses the SymbolicTransformer class from the gplearn package for the genetic algorithm. Read more about this implementation here . Warning GFG can be slow for very large populations! Selecting useful features The FeatureSelector class provides tooling to select the relevant features from a dataset. It can be accessed from atom through the feature_selection method. The following strategies are implemented: univariate, PCA, SFM, RFE and RFECV. Univariate Univariate feature selection works by selecting the best features based on univariate statistical F-test. The test is provided via the solver parameter. 
It takes any function taking two arrays (X, y), and returning arrays (scores, p-values). Read more in sklearn's documentation . Principal Components Analysis Applying PCA will reduce the dimensionality of the dataset by maximizing the variance of each dimension. The new features will be called Component 0, Component 1, etc... The dataset will be scaled before applying the transformation (if it wasn't already). Read more in sklearn's documentation . Selection from model SFM uses an estimator with feature_importances_ or coef_ attributes to select the best features in a dataset based on importance weights. The estimator is provided through the solver parameter and can be already fitted. ATOM allows you to use one its pre-defined models , e.g. solver='RF' . If you didn't call the FeatureSeletor through atom , don't forget to indicate the estimator's task adding _class or _reg after the name, e.g. RF_class to use a random forest classifier. Read more in sklearn's documentation . Recursive feature elimination Select features by recursively considering smaller and smaller sets of features. First, the estimator is trained on the initial set of features and the importance of each feature is obtained either through a coef_ attribute or through a feature_importances_ attribute. Then, the least important features are pruned from current set of features. That procedure is recursively repeated on the pruned set until the desired number of features to select is eventually reached. Note that, since RFE needs to fit the model again every iteration, this method can be fairly slow. RFECV applies the same algorithm as RFE but uses a cross-validated metric (under the scoring parameter, see RFECV ) to assess every step's performance. Also, where RFE returns the number of features selected by n_features , RFECV returns the number of features that achieved the optimal score on the specified metric. Note that this is not always equal to the amount specified by n_features . 
Read more in sklearn's documentation . Removing features with low variance Variance is the expectation of the squared deviation of a random variable from its mean. Features with low variance have many values repeated, which means the model will not learn much from them. FeatureSelector removes all features where the same value is repeated in at least max_frac_repeated fraction of the rows. The default option is to remove a feature if all values in it are the same. Read more in sklearn's documentation . Removing features with multi-collinearity Two features that are highly correlated are redundant, i.e. two will not contribute more to the model than only one of them. FeatureSelector will drop a feature that has a Pearson correlation coefficient larger than max_correlation with another feature. A correlation of 1 means the two columns are equal. A dataframe of the removed features and their correlation values can be accessed through the collinear attribute. Tip Use the plot_feature_importance method to examine how much a specific feature contributes to the final predictions. If the model doesn't have a feature_importances_ attribute, use plot_permutation_importance instead. Warning The RFE and RFECV strategies don't work when the solver is a CatBoost model due to incompatibility of the APIs. Training The training phase is where the models are fitted and evaluated. After this, the models are attached to the training instance and you can use the plotting and predicting methods. The pipeline applies the following steps iteratively for all models: The optimal hyperparameters are selected. The model is trained on the training set and evaluated on the test set. The bagging algorithm is applied. There are three approaches to run the training. 
Direct training: TrainerClassifier TrainerRegressor Training via successive halving : SuccessiveHalvingClassifier SuccessiveHalvingRegressor Training via train sizing : TrainSizingClassifier TrainSizingRegressor The direct fashion repeats the aforementioned steps only once, while the other two approaches repeat them more than once. Every approach can be directly called from atom through the run , successive_halving and train_sizing methods respectively. Additional information: If an exception is encountered while fitting an estimator, the pipeline will automatically skip the model and jump to the next model and save the exception in the errors attribute. Note that in that case there will be no model for that estimator. When showing the final results, a !! indicates the highest score and a ~ indicates that the model is possibly overfitting (training set has a score at least 20% higher than the test set). The winning model (the one with the highest mean_bagging or metric_test ) will be attached to the winner attribute. Models ATOM provides 27 models for classification and regression tasks that can be used to fit the data in the pipeline. After fitting, every model is attached to the training instance as an attribute. Models are called through the models parameter using their corresponding acronyms, e.g. atom.run(models='RF') to run a Random forest model. Metric ATOM uses sklearn's scorers for model selection and evaluation. A scorer consists of a metric function and some parameters that define the scorer's properties such as whether it's a score or loss function or if the function needs probability estimates or rounded predictions (see make_scorer ). ATOM lets you define the scorer for the pipeline in three ways: The metric parameter is one of sklearn's predefined scorers (as string). The metric parameter is a score (or loss) function with signature metric(y, y_pred, **kwargs).
In this case, use the greater_is_better , needs_proba and needs_threshold parameters to specify the scorer's properties. The metric parameter is a scorer object. Note that all scorers follow the convention that higher return values are better than lower return values. Thus metrics which measure the distance between the model and the data (i.e. loss functions), like max_error or mean_squared_error , will return the negated value of the metric. Custom scorer acronyms Since some of sklearn's scorers have quite long names and ATOM is all about lazy fast experimentation, the package provides acronyms for some of the most commonly used ones. These acronyms are case insensitive and can be used for the metric parameter instead of the scorer's full name, e.g. atom.run('LR', metric='BA') will use balanced_accuracy . The available acronyms are: 'AP' for 'average_precision' 'BA' for 'balanced_accuracy' 'AUC' for 'roc_auc' 'EV' for 'explained_variance' 'ME' for 'max_error' 'MAE' for 'neg_mean_absolute_error' 'MSE' for 'neg_mean_squared_error' 'RMSE' for 'neg_root_mean_squared_error' 'MSLE' for 'neg_mean_squared_log_error' 'MEDAE' for 'neg_median_absolute_error' 'POISSON' for 'neg_mean_poisson_deviance' 'GAMMA' for 'neg_mean_gamma_deviance' Multi-metric runs Sometimes it is useful to measure the performance of the models in more than one way. ATOM lets you run the pipeline with multiple metrics at the same time. To do so, provide the metric parameter with a list of desired metrics, e.g. atom.run('LDA', metric=['r2', 'mse']) . If you provide metric functions, don't forget to also provide lists to the greater_is_better , needs_proba and needs_threshold parameters, where the n-th value in the list corresponds to the n-th function. If you leave them as a single value, that value will apply to every provided metric. When fitting multi-metric runs, the resulting scores will return a list of metrics.
For example, if you provided three metrics to the pipeline, atom.knn.metric_bo could return [0.8734, 0.6672, 0.9001]. It is also important to note that only the first metric of a multi-metric run is used to evaluate every step of the bayesian optimization and to select the winning model. Tip Some plots let you choose which of the metrics to show using the metric parameter. Hyperparameter optimization In order to achieve maximum performance, we need to tune an estimator's hyperparameters before training it. ATOM provides hyperparameter tuning using a bayesian optimization (BO) approach implemented by skopt . The BO is optimized on the first metric provided with the metric parameter. Each step is either computed by cross-validation on the complete training set or by randomly splitting the training set every iteration into a (sub) training set and a validation set. This process can create some data leakage but ensures maximal use of the provided data. The test set, however, does not contain any leakage and will be used to determine the final score of every model. Note that, if the dataset is relatively small, the BO's best score can consistently be lower than the final score on the test set (despite the leakage) due to the considerably fewer instances on which it is trained. There are many possibilities to tune the BO to your liking. Use n_calls and n_initial_points to determine the number of iterations that are performed randomly at the start (exploration) and the number of iterations spent optimizing (exploitation). If n_calls is equal to n_initial_points , every iteration of the BO will select its hyperparameters randomly. This means the algorithm is technically performing a random search . Note The n_calls parameter includes the iterations in n_initial_points . Calling atom.run('LR', n_calls=20, n_initial_points=10) will run 20 iterations of which the first 10 are random.
Other settings can be changed through the bo_params parameter, a dictionary where every key-value combination can be used to further customize the BO. By default, the hyperparameters and corresponding dimensions per model are predefined by ATOM. Use the dimensions key to use custom ones. Use an array for only one model and a dictionary with the model names as keys if there are multiple models in the pipeline. Note that the provided search space dimensions must be compliant with skopt's API. atom.run('LR', n_calls=10, bo_params={'dimensions': [Integer(100, 1000, name='max_iter')]}) The majority of skopt's callbacks to stop the optimizer early can be accessed through bo_params . You can include other callbacks using the callbacks key. atom.run('LR', n_calls=10, bo_params={'max_time': 1000, 'callbacks': custom_callback()}) You can also include other optimizer's parameters as key-value pairs. atom.run('LR', n_calls=10, bo_params={'acq_func': 'EI'}) Bagging After fitting the estimator, you can assess the robustness of the model using bootstrap aggregating (bagging). This technique creates several new data sets selecting random samples from the training set (with replacement) and evaluates them on the test set. This way we get a distribution of the performance of the model. The number of sets can be chosen through the bagging parameter. Tip Use the plot_bagging method to plot the bagging scores in a convenient boxplot. Early stopping XGBoost , LightGBM and CatBoost allow in-training evaluation. This means that the estimator is evaluated after every round of the training. Use the early_stopping key in bo_params to stop the training early if it didn't improve in the last early_stopping rounds. This can save the pipeline much time that would otherwise be wasted on an estimator that is unlikely to improve further. Note that this technique will be applied both during the BO and at the final fit on the complete training set.
After fitting, the model will get the evals attribute, a dictionary of the train and test performances per round (also if early stopping wasn't applied). Tip Use the plot_evals method to plot the in-training evaluation on the train and test set. Successive halving Successive halving is a bandit-based algorithm that fits N models to 1/N of the data. The best half are selected to go to the next iteration where the process is repeated. This continues until only one model remains, which is fitted on the complete dataset. Beware that a model's performance can depend greatly on the amount of data on which it is trained. For this reason, we recommend only to use this technique with similar models, e.g. only using tree-based models. Use successive halving through the SuccessiveHalvingClassifier / SuccessiveHalvingRegressor classes or from atom via the successive_halving method. After running the pipeline, the results attribute will be multi-index, where the first index indicates the iteration and the second the model's acronym. Tip Use the plot_successive_halving method to see every model's performance per iteration of the successive halving. Train sizing When training models, there is usually a trade-off between model performance and computation time that is regulated by the number of samples in the training set. Train sizing can be used to create insights in this trade-off and help determine the optimal size of the training set, fitting the models multiple times, ever increasing the number of samples in the training set. Use train sizing through the TrainSizingClassifier / TrainSizingRegressor classes or from atom via the train_sizing method. The number of iterations and the number of samples per training can be specified with the train_sizes parameter. After running the pipeline, the results attribute will be multi-index, where the first index indicates the iteration and the second the model's acronym. 
Tip Use the plot_learning_curve method to see the model's performance per size of the training set. Predicting After running a successful pipeline, it is possible you would like to apply all used transformations onto new data, or make predictions using one of the trained models. Just like a sklearn estimator, you can call the prediction methods from a fitted training instance, e.g. atom.predict(X) . Calling the method without specifying a model will use the winning model in the pipeline (under attribute winner ). To use a different model, simply call the method from a model , e.g. atom.KNN.predict(X) . If called from atom , the prediction methods will transform the provided data through all the transformers in the pipeline before making the predictions. By default, this excludes outlier handling and balancing the dataset since these steps should only be applied on the training set. Use the method's kwargs to select which transformations to use in every call. The available prediction methods are a selection of the most common methods for estimators in sklearn's API: transform Transform new data through all the pre-processing steps in the pipeline. predict Transform the data and make predictions on new data. predict_proba Transform the data and make probabilistic predictions on new data. predict_log_proba Transform the data and make logarithmic probability predictions on new data. decision_function Transform the data and evaluate the decision function on new data. score Transform the data and return the model's score on new data. Except for transform, the prediction methods can be calculated on the train and test set. You can access them through the model 's prediction attributes , e.g. atom.mnb.predict_train or atom.mnb.predict_test . Keep in mind that the results are not calculated until the attribute is called for the first time. This mechanism avoids having to calculate attributes that are never used, saving time and memory. 
Note Many of the plots use the prediction attributes. This can considerably increase the size of the class for large datasets. Use the reset_prediction_attributes method if you need to free some memory! Plots After fitting the models to the data, it's time to analyze the results. ATOM provides many plotting methods to compare the model performances. Descriptions and examples can be found in the API section. ATOM uses the packages matplotlib , seaborn and shap for plotting. The plot methods can be called from a training directly, e.g. atom.plot_roc() , or from one of the models , e.g. atom.LGB.plot_roc() . If called from training , it will make the plot for all models in the pipeline. This can be useful to compare the results of multiple models. If called from a model , it will make the plot for only that model. Use this option if you want information just for that specific model or to make a plot less crowded. Parameters Apart from the plot-specific parameters they may have, all plots have four parameters in common: The title parameter allows you to add a custom title to the plot. The figsize parameter adjust the plot's size. The filename parameter is used to save the plot. The display parameter determines whether the plot is rendered. Aesthetics The plot aesthetics can be customized using the plot attributes, e.g. atom.style = 'white' . These attributes can be called from any instance with plotting methods. Note that the plot attributes are attached to the class and not the instance. This means that changing the attribute will also change it for all other instances in the module. ATOM's default values are: style: 'darkgrid' palette: 'GnBu_r_d' title_fontsize: 20 label_fontsize: 16 tick_fontsize: 12 SHAP The SHAP (SHapley Additive exPlanations) python package uses a game theoretic approach to explain the output of any machine learning model. 
It connects optimal credit allocation with local explanations using the classic Shapley values from game theory and their related extensions. ATOM implements methods to plot 4 of shap's plotting functions directly from its API. The explainer will be chosen automatically based on the model's type. For kernelExplainer, the data used to estimate the expected values is the complete training set when <100 rows, else it's summarized with a set of 10 weighted K-means, each weighted by the number of points they represent. The four plots are: force_plot , dependence_plot , summary_plot and decision_plot . Since the plots are not made by ATOM, we can't draw multiple models in the same figure. Selecting more than one model will raise an exception. To avoid this, call the plot from a model , e.g. atom.xgb.force_plot() . Note You can recognize the SHAP plots by the fact that they end (instead of start) with plot. Available plots A list of available plots can be found below. Note that not all plots can be called from every class and that their availability can depend on the task at hand. plot_correlation Plot the data's correlation matrix. plot_pipeline Plot a diagram of every estimator in atom's pipeline. plot_pca Plot the explained variance ratio vs the number of components. plot_components Plot the explained variance ratio per components. plot_rfecv Plot the RFECV results. plot_successive_halving Plot of the models' scores per iteration of the successive halving. plot_learning_curve Plot the model's learning curve. plot_bagging Plot a boxplot of the bagging's results. plot_bo Plot the bayesian optimization scoring. plot_evals Plot evaluation curves for the train and test set. plot_roc Plot the Receiver Operating Characteristics curve. plot_prc Plot the precision-recall curve. plot_permutation_importance Plot the feature permutation importance of models. plot_feature_importance Plot a tree-based model's feature importance.
plot_partial_dependence Plot the partial dependence of features. plot_errors Plot a model's prediction errors. plot_residuals Plot a model's residuals. plot_confusion_matrix Plot a model's confusion matrix. plot_threshold Plot a metric's performance against threshold values. plot_probabilities Plot the probability distribution of the categories in the target column. plot_calibration Plot the calibration curve for a binary classifier. plot_gains Plot the cumulative gains curve. plot_lift Plot the lift curve. force_plot Plot SHAP's force plot. dependence_plot Plot SHAP's dependence plot. summary_plot Plot SHAP's summary plot. decision_plot Plot SHAP's decision plot.","title":"User guide"},{"location":"user_guide/#introduction","text":"There is no magic formula in data science that can tell us which type of machine learning algorithm will perform best for a specific use-case. Different models are better suited for different types of data and different problems. At best, you can follow some rough guide on how to approach problems with regard to which model to try on your data, but these are often more confusing than helpful. Best practices tell us to start with a simple model (e.g. linear regression) and build up to more complicated models (e.g. logistic regression -> random forest -> multilayer perceptron) if you are not satisfied with the results. Unfortunately, different models require different data cleaning steps, different type/amount of features, tuning a new set of hyperparameters, etc. Refactoring the code for this purpose can be quite boring and time consuming. Because of this, many data scientists end up just using the model best known to them and fine-tuning this particular model without ever trying different ones. This can result in poor performance (because the model is just not the right one for the task) or in poor time management (because you could have achieved a similar performance with a simpler/faster model). 
ATOM is here to help us solve these issues. With just a few lines of code, you can perform basic data cleaning steps, select relevant features and compare the performance of multiple models on a given dataset. ATOM should be able to provide quick insights on which algorithms perform best for the task at hand and provide an indication of the feasibility of the ML solution. It is important to realize that ATOM is not here to replace all the work a data scientist has to do before getting his model into production. ATOM doesn't spit out production-ready models just by tuning some parameters in its API. After helping you to determine the right model, you will most probably need to fine-tune it using use-case specific features and data cleaning steps in order to achieve maximum performance. So, this sounds a bit like AutoML, how is ATOM different than auto-sklearn or TPOT ? Well, ATOM does AutoML in the sense that it helps you find the best model for a specific task, but contrary to the aforementioned packages, it does not actively search for the best model. It simply runs all of them and lets you pick the one that you think suits you best. AutoML packages are often black boxes: if you provide data, it will magically return a working model. Although it works great, they often produce complicated pipelines with low explainability, hard to sell to the business. In this, ATOM excels. Every step of the pipeline is accounted for, and using the provided plotting methods, it\u2019s easy to demonstrate why a model is better/worse than the other.","title":"Introduction"},{"location":"user_guide/#nomenclature","text":"In this documentation we will consistently use terms to refer to certain concepts related to the ATOM package. ATOM : Refers to this package. task : Refers to one of the three supervised machine learning approaches that ATOM supports: binary classification multiclass classification regression category : Refers to one of the unique values in a column, i.e.
a binary classifier has 2 categories in the target column. missing values : Refers to None , NaN and inf values. categorical columns : Refers to all columns with dtype.kind not in ifu . atom : Refers to an ATOMClassifier or ATOMRegressor instance (note that all examples use it as variable name for the instance). model : Refers to one of the model instances. estimator : Actual estimator corresponding to a model. Implemented by an external package. BO : Bayesian optimization algorithm used for hyperparameter optimization. training : Refers to an instance of one of the classes that train and evaluate the models. The classes are: ATOMClassifier ATOMRegressor TrainerClassifier TrainerRegressor SuccessiveHalvingClassifier SuccessiveHalvingRegressor TrainSizingClassifier TrainSizingRegressor Note Note that atom instances are also training instances!","title":"Nomenclature"},{"location":"user_guide/#first-steps","text":"You can quickly install atom using pip or conda , see the installation guide . ATOM contains a variety of classes to perform data cleaning, feature engineering, model training and much more. The easiest way to use all these classes on the same dataset is through one of the main classes: ATOMClassifier for binary or multiclass classification tasks. ATOMRegressor for regression tasks. These two classes are convenient wrappers for all the possibilities this package provides. Like a Pipeline , they assemble several steps that can be cross-validated together while setting different parameters. There are some important differences with sklearn's API: atom is initialized with the data you want to manipulate. This data can be accessed at any moment through atom 's data attributes . The classes in ATOM's API are reached through atom 's methods. For example, calling the encode method, will initialize an Encoder instance, fit it on the training set and transform the whole dataset.
The transformations are applied immediately after calling the method (there is no fit method). This approach gives the user a clearer overview and more control over every step in the pipeline. The pipeline does not have to end with an estimator. ATOM can be just for data cleaning or feature engineering purposes only. Let's get started with an example! First, initialize atom and provide it the data you want to use. atom = ATOMClassifier(X, y) Apply data cleaning methods through the class. For example, calling the impute method will handle all missing values in the dataset. atom.impute(strat_num='median', strat_cat='most_frequent', min_frac_rows=0.1) Select the best hyperparameters and fit a Random Forest and AdaBoost model. atom.run(['RF', 'AdaB'], metric='accuracy', n_calls=25, n_initial_points=10) Analyze the results: atom.feature_importances(show=10, filename='feature_importance_plot') atom.plot_prc(title='Precision-recall curve comparison plot')","title":"First steps"},{"location":"user_guide/#data-cleaning","text":"More often than not, you need to do some data cleaning before fitting your dataset to a model. Usually, this involves importing different libraries and writing many lines of code. Since ATOM is all about fast exploration and experimentation, it provides various data cleaning classes to apply the most common transformations fast and easy.","title":"Data cleaning"},{"location":"user_guide/#scaling-the-feature-set","text":"Standardization of a dataset is a common requirement for many machine learning estimators: they might behave badly if the individual features do not more or less look like standard normally distributed data (e.g. Gaussian with 0 mean and unit variance). The Scaler class scales data to mean=0 and std=1. 
It can be accessed from atom through the scale method.","title":"Scaling the feature set"},{"location":"user_guide/#standard-data-cleaning","text":"There are many data cleaning steps that are useful to perform on any dataset before modelling. These are general rules that apply on every use-case and every task. The StandardCleaner class is a convenient tool to apply such steps. It is automatically called when initializing atom . Use the class' parameters to choose which transformations to perform. The available steps are: Remove columns with prohibited data types. Strip categorical features from white spaces. Remove categorical columns with maximal cardinality. Remove columns with minimum cardinality. Remove rows with missing values in the target column. Label-encode the target column.","title":"Standard data cleaning"},{"location":"user_guide/#imputing-missing-values","text":"For various reasons, many real world datasets contain missing values, often encoded as blanks, NaNs or other placeholders. Such datasets however are incompatible with ATOM's models which assume that all values in an array are numerical, and that all have and hold meaning. The Imputer class handles missing values in the dataset by either dropping or imputing the value. It can be accessed from atom through the impute method. Tip Use atom 's missing attribute for an overview of the missing values in the dataset.","title":"Imputing missing values"},{"location":"user_guide/#encoding-categorical-features","text":"Many datasets will contain categorical features. Their variables are typically stored as text values which represent various traits. Some examples include color (\u201cRed\u201d, \u201cYellow\u201d, \u201cBlue\u201d), size (\u201cSmall\u201d, \u201cMedium\u201d, \u201cLarge\u201d) or geographic designations (city or country). Regardless of what the value is used for, the challenge is determining how to use this data in the analysis. 
ATOM's models don't support direct manipulation of this kind of data. Use the Encoder class to encode categorical features to numerical values. It can be accessed from atom through the encode method. Tip Use atom 's categorical attribute for a list of the categorical columns in the dataset.","title":"Encoding categorical features"},{"location":"user_guide/#handling-outliers","text":"When modeling, it is important to clean the data sample to ensure that the observations best represent the problem. Sometimes a dataset can contain extreme values that are outside the range of what is expected and unlike the other data. These are called outliers. Often, machine learning modeling and model skill in general can be improved by understanding and even removing these outlier values. The Outliers class can drop or impute outliers in the dataset. It can be accessed from atom through the outliers method.","title":"Handling outliers"},{"location":"user_guide/#balancing-the-data","text":"One of the common issues found in datasets that are used for classification is imbalanced classes. Data imbalance usually reflects an unequal distribution of classes within a dataset. For example, in a credit card fraud detection dataset, most of the transactions are non-fraud and a very few cases are fraud. This leaves us with a very unbalanced ratio of fraud vs non-fraud cases. The Balancer class can oversample the minority category or undersample the majority category. It can be accessed from atom through the balance method.","title":"Balancing the data"},{"location":"user_guide/#feature-engineering","text":"\"Applied machine learning\" is basically feature engineering. ~ Andrew Ng. Feature engineering is the process of creating new features from the existing ones, in order to capture relationships with the target column that the first set of features didn't have on their own. This process is very important to improve the performance of machine learning algorithms.
Although feature engineering works best when the data scientist applies use-case specific transformations, there are ways to do this in an automated manner, without prior domain knowledge. One of the problems of creating new features without human expert intervention, is that many of the newly created features can be useless, i.e. they do not help the algorithm to make better predictions. Even worse, having useless features can drop your performance. To avoid this, we perform feature selection, a process in which we select the relevant features in the dataset. See here an example.","title":"Feature engineering"},{"location":"user_guide/#generating-new-features","text":"The FeatureGenerator class creates new non-linear features based on the original feature set. It can be accessed from atom through the feature_generation method. You can choose between two strategies: Deep Feature Synthesis and Genetic Feature Generation. Deep Feature Synthesis Deep feature synthesis (DFS) applies the selected operators on the features in the dataset. For example, if the operator is 'log', it will create the new feature LOG(old_feature) and if the operator is 'mul', it will create the new feature old_feature_1 x old_feature_2 . The operators can be chosen through the operators parameter. Available options are: add: Sum two features together. sub: Subtract two features from each other. mul: Multiply two features with each other. div: Divide two features with each other. sqrt: Take the square root of a feature. log: Take the logarithm of a feature. sin: Calculate the sine of a feature. cos: Calculate the cosine of a feature. tan: Calculate the tangent of a feature. ATOM's implementation of DFS uses the featuretools package. Tip DFS can create many new features and not all of them will be useful. Use FeatureSelector to reduce the number of features! Warning Using the div, log or sqrt operators can return new features with inf or NaN values.
Check the warnings that may pop up or use atom 's missing property. Warning When using DFS with n_jobs>1 , make sure to protect your code with if __name__ == \"__main__\" . Featuretools uses dask , which uses python multiprocessing for parallelization. The spawn method on multiprocessing starts a new python process, which requires it to import the __main__ module before it can do its task. Genetic Feature Generation Genetic feature generation (GFG) uses genetic programming , a branch of evolutionary programming, to determine which features are successful and create new ones based on those. Where DFS' method can be seen as some kind of \"brute force\" for feature engineering, GFG tries to improve its features with every generation of the algorithm. GFG uses the same operators as DFS, but instead of only applying the transformations once, it evolves them further, creating complicated non-linear combinations of features with many transformations. The new features are given the name Feature N for the N-th feature. You can access the genetic feature's fitness and description (how they are calculated) through the genetic_features attribute. ATOM uses the SymbolicTransformer class from the gplearn package for the genetic algorithm. Read more about this implementation here . Warning GFG can be slow for very large populations!","title":"Generating new features"},{"location":"user_guide/#selecting-useful-features","text":"The FeatureSelector class provides tooling to select the relevant features from a dataset. It can be accessed from atom through the feature_selection method. The following strategies are implemented: univariate, PCA, SFM, RFE and RFECV. Univariate Univariate feature selection works by selecting the best features based on univariate statistical F-test. The test is provided via the solver parameter. It takes any function taking two arrays (X, y), and returning arrays (scores, p-values). Read more in sklearn's documentation . 
Principal Components Analysis Applying PCA will reduce the dimensionality of the dataset by maximizing the variance of each dimension. The new features will be called Component 0, Component 1, etc... The dataset will be scaled before applying the transformation (if it wasn't already). Read more in sklearn's documentation . Selection from model SFM uses an estimator with feature_importances_ or coef_ attributes to select the best features in a dataset based on importance weights. The estimator is provided through the solver parameter and can be already fitted. ATOM allows you to use one of its pre-defined models , e.g. solver='RF' . If you didn't call the FeatureSelector through atom , don't forget to indicate the estimator's task adding _class or _reg after the name, e.g. RF_class to use a random forest classifier. Read more in sklearn's documentation . Recursive feature elimination Select features by recursively considering smaller and smaller sets of features. First, the estimator is trained on the initial set of features and the importance of each feature is obtained either through a coef_ attribute or through a feature_importances_ attribute. Then, the least important features are pruned from current set of features. That procedure is recursively repeated on the pruned set until the desired number of features to select is eventually reached. Note that, since RFE needs to fit the model again every iteration, this method can be fairly slow. RFECV applies the same algorithm as RFE but uses a cross-validated metric (under the scoring parameter, see RFECV ) to assess every step's performance. Also, where RFE returns the number of features selected by n_features , RFECV returns the number of features that achieved the optimal score on the specified metric. Note that this is not always equal to the amount specified by n_features . Read more in sklearn's documentation .
Removing features with low variance Variance is the expectation of the squared deviation of a random variable from its mean. Features with low variance have many values repeated, which means the model will not learn much from them. FeatureSelector removes all features where the same value is repeated in at least max_frac_repeated fraction of the rows. The default option is to remove a feature if all values in it are the same. Read more in sklearn's documentation . Removing features with multi-collinearity Two features that are highly correlated are redundant, i.e. two will not contribute more to the model than only one of them. FeatureSelector will drop a feature that has a Pearson correlation coefficient larger than max_correlation with another feature. A correlation of 1 means the two columns are equal. A dataframe of the removed features and their correlation values can be accessed through the collinear attribute. Tip Use the plot_feature_importance method to examine how much a specific feature contributes to the final predictions. If the model doesn't have a feature_importances_ attribute, use plot_permutation_importance instead. Warning The RFE and RFECV strategies don't work when the solver is a CatBoost model due to incompatibility of the APIs.","title":"Selecting useful features"},{"location":"user_guide/#training","text":"The training phase is where the models are fitted and evaluated. After this, the models are attached to the training instance and you can use the plotting and predicting methods. The pipeline applies the following steps iteratively for all models: The optimal hyperparameters are selected. The model is trained on the training set and evaluated on the test set. The bagging algorithm is applied. There are three approaches to run the training. 
Direct training: TrainerClassifier TrainerRegressor Training via successive halving : SuccessiveHalvingClassifier SuccessiveHalvingRegressor Training via train sizing : TrainSizingClassifier TrainSizingRegressor The direct fashion repeats the aforementioned steps only once, while the other two approaches repeat them more than once. Every approach can be directly called from atom through the run , successive_halving and train_sizing methods respectively. Additional information: If an exception is encountered while fitting an estimator, the pipeline will automatically skip the model and jump to the next model and save the exception in the errors attribute. Note that in that case there will be no model for that estimator. When showing the final results, a !! indicates the highest score and a ~ indicates that the model is possibly overfitting (training set has a score at least 20% higher than the test set). The winning model (the one with the highest mean_bagging or metric_test ) will be attached to the winner attribute.","title":"Training"},{"location":"user_guide/#models","text":"ATOM provides 27 models for classification and regression tasks that can be used to fit the data in the pipeline. After fitting, every model is attached to the training instance as an attribute. Models are called through the models parameter using their corresponding acronyms, e.g. atom.run(models='RF') to run a Random forest model.","title":"Models"},{"location":"user_guide/#metric","text":"ATOM uses sklearn's scorers for model selection and evaluation. A scorer consists of a metric function and some parameters that define the scorer's properties such as whether it's a score or loss function or whether the function needs probability estimates or rounded predictions (see make_scorer ). ATOM lets you define the scorer for the pipeline in three ways: The metric parameter is one of sklearn's predefined scorers (as string).
The metric parameter is a score (or loss) function with signature metric(y, y_pred, **kwargs). In this case, use the greater_is_better , needs_proba and needs_threshold parameters to specify the scorer's properties. The metric parameter is a scorer object. Note that all scorers follow the convention that higher return values are better than lower return values. Thus metrics which measure the distance between the model and the data (i.e. loss functions), like max_error or mean_squared_error , will return the negated value of the metric. Custom scorer acronyms Since some of sklearn's scorers have quite long names and ATOM is all about lazy fast experimentation, the package provides acronyms for some of the most commonly used ones. These acronyms are case insensitive and can be used for the metric parameter instead of the scorer's full name, e.g. atom.run('LR', metric='BA') will use balanced_accuracy . The available acronyms are: 'AP' for 'average_precision' 'BA' for 'balanced_accuracy' 'AUC' for 'roc_auc' 'EV' for 'explained_variance' 'ME' for 'max_error' 'MAE' for 'neg_mean_absolute_error' 'MSE' for 'neg_mean_squared_error' 'RMSE' for 'neg_root_mean_squared_error' 'MSLE' for 'neg_mean_squared_log_error' 'MEDAE' for 'neg_median_absolute_error' 'POISSON' for 'neg_mean_poisson_deviance' 'GAMMA' for 'neg_mean_gamma_deviance' Multi-metric runs Sometimes it is useful to measure the performance of the models in more than one way. ATOM lets you run the pipeline with multiple metrics at the same time. To do so, provide the metric parameter with a list of desired metrics, e.g. atom.run('LDA', metric=['r2', 'mse']) . If you provide metric functions, don't forget to also provide lists to the greater_is_better , needs_proba and needs_threshold parameters, where the n-th value in the list corresponds to the n-th function. If you leave them as a single value, that value will apply to every provided metric.
When fitting multi-metric runs, the resulting scores will return a list of metrics. For example, if you provided three metrics to the pipeline, atom.knn.metric_bo could return [0.8734, 0.6672, 0.9001]. It is also important to note that only the first metric of a multi-metric run is used to evaluate every step of the bayesian optimization and to select the winning model. Tip Some plots let you choose which of the metrics to show using the metric parameter.","title":"Metric"},{"location":"user_guide/#hyperparameter-optimization","text":"In order to achieve maximum performance, we need to tune an estimator's hyperparameters before training it. ATOM provides hyperparameter tuning using a bayesian optimization (BO) approach implemented by skopt . The BO is optimized on the first metric provided with the metric parameter. Each step is either computed by cross-validation on the complete training set or by randomly splitting the training set every iteration into a (sub) training set and a validation set. This process can create some data leakage but ensures maximal use of the provided data. The test set, however, does not contain any leakage and will be used to determine the final score of every model. Note that, if the dataset is relatively small, the BO's best score can consistently be lower than the final score on the test set (despite the leakage) due to the considerable fewer instances on which it is trained. There are many possibilities to tune the BO to your liking. Use n_calls and n_initial_points to determine the number of iterations that are performed randomly at the start (exploration) and the number of iterations spent optimizing (exploitation). If n_calls is equal to n_initial_points , every iteration of the BO will select its hyperparameters randomly. This means the algorithm is technically performing a random search . Note The n_calls parameter includes the iterations in n_initial_points . 
Calling atom.run('LR', n_calls=20, n_initial_points=10) will run 20 iterations of which the first 10 are random. Other settings can be changed through the bo_params parameter, a dictionary where every key-value combination can be used to further customize the BO. By default, the hyperparameters and corresponding dimensions per model are predefined by ATOM. Use the dimensions key to use custom ones. Use an array for only one model and a dictionary with the model names as keys if there are multiple models in the pipeline. Note that the provided search space dimensions must be compliant with skopt's API. atom.run('LR', n_calls=10, bo_params={'dimensions': [Integer(100, 1000, name='max_iter')]}) The majority of skopt's callbacks to stop the optimizer early can be accessed through bo_params . You can include other callbacks using the callbacks key. atom.run('LR', n_calls=10, bo_params={'max_time': 1000, 'callbacks': custom_callback()}) You can also include other optimizer's parameters as key-value pairs. atom.run('LR', n_calls=10, bo_params={'acq_func': 'EI'})","title":"Hyperparameter optimization"},{"location":"user_guide/#bagging","text":"After fitting the estimator, you can assess the robustness of the model using bootstrap aggregating (bagging). This technique creates several new data sets selecting random samples from the training set (with replacement) and evaluates them on the test set. This way we get a distribution of the performance of the model. The number of sets can be chosen through the bagging parameter. Tip Use the plot_bagging method to plot the bagging scores in a convenient boxplot.","title":"Bagging"},{"location":"user_guide/#early-stopping","text":"XGBoost , LightGBM and CatBoost allow in-training evaluation. This means that the estimator is evaluated after every round of the training. Use the early_stopping key in bo_params to stop the training early if it didn't improve in the last early_stopping rounds.
This can save the pipeline much time that would otherwise be wasted on an estimator that is unlikely to improve further. Note that this technique will be applied both during the BO and at the final fit on the complete training set. After fitting, the model will get the evals attribute, a dictionary of the train and test performances per round (also if early stopping wasn't applied). Tip Use the plot_evals method to plot the in-training evaluation on the train and test set.","title":"Early stopping"},{"location":"user_guide/#successive-halving","text":"Successive halving is a bandit-based algorithm that fits N models to 1/N of the data. The best half are selected to go to the next iteration where the process is repeated. This continues until only one model remains, which is fitted on the complete dataset. Beware that a model's performance can depend greatly on the amount of data on which it is trained. For this reason, we recommend only to use this technique with similar models, e.g. only using tree-based models. Use successive halving through the SuccessiveHalvingClassifier / SuccessiveHalvingRegressor classes or from atom via the successive_halving method. After running the pipeline, the results attribute will be multi-index, where the first index indicates the iteration and the second the model's acronym. Tip Use the plot_successive_halving method to see every model's performance per iteration of the successive halving.","title":"Successive halving"},{"location":"user_guide/#train-sizing","text":"When training models, there is usually a trade-off between model performance and computation time that is regulated by the number of samples in the training set. Train sizing can be used to create insights in this trade-off and help determine the optimal size of the training set, fitting the models multiple times, ever increasing the number of samples in the training set. 
Use train sizing through the TrainSizingClassifier / TrainSizingRegressor classes or from atom via the train_sizing method. The number of iterations and the number of samples per training can be specified with the train_sizes parameter. After running the pipeline, the results attribute will be multi-index, where the first index indicates the iteration and the second the model's acronym. Tip Use the plot_learning_curve method to see the model's performance per size of the training set.","title":"Train sizing"},{"location":"user_guide/#predicting","text":"After running a successful pipeline, it is possible you would like to apply all used transformations onto new data, or make predictions using one of the trained models. Just like a sklearn estimator, you can call the prediction methods from a fitted training instance, e.g. atom.predict(X) . Calling the method without specifying a model will use the winning model in the pipeline (under attribute winner ). To use a different model, simply call the method from a model , e.g. atom.KNN.predict(X) . If called from atom , the prediction methods will transform the provided data through all the transformers in the pipeline before making the predictions. By default, this excludes outlier handling and balancing the dataset since these steps should only be applied on the training set. Use the method's kwargs to select which transformations to use in every call. The available prediction methods are a selection of the most common methods for estimators in sklearn's API: transform Transform new data through all the pre-processing steps in the pipeline. predict Transform the data and make predictions on new data. predict_proba Transform the data and make probabilistic predictions on new data. predict_log_proba Transform the data and make logarithmic probability predictions on new data. decision_function Transform the data and evaluate the decision function on new data. 
score Transform the data and return the model's score on new data. Except for transform, the prediction methods can be calculated on the train and test set. You can access them through the model 's prediction attributes , e.g. atom.mnb.predict_train or atom.mnb.predict_test . Keep in mind that the results are not calculated until the attribute is called for the first time. This mechanism avoids having to calculate attributes that are never used, saving time and memory. Note Many of the plots use the prediction attributes. This can considerably increase the size of the class for large datasets. Use the reset_prediction_attributes method if you need to free some memory!","title":"Predicting"},{"location":"user_guide/#plots","text":"After fitting the models to the data, it's time to analyze the results. ATOM provides many plotting methods to compare the model performances. Descriptions and examples can be found in the API section. ATOM uses the packages matplotlib , seaborn and shap for plotting. The plot methods can be called from a training directly, e.g. atom.plot_roc() , or from one of the models , e.g. atom.LGB.plot_roc() . If called from training , it will make the plot for all models in the pipeline. This can be useful to compare the results of multiple models. If called from a model , it will make the plot for only that model. Use this option if you want information just for that specific model or to make a plot less crowded.","title":"Plots"},{"location":"user_guide/#parameters","text":"Apart from the plot-specific parameters they may have, all plots have four parameters in common: The title parameter allows you to add a custom title to the plot. The figsize parameter adjust the plot's size. The filename parameter is used to save the plot. The display parameter determines whether the plot is rendered.","title":"Parameters"},{"location":"user_guide/#aesthetics","text":"The plot aesthetics can be customized using the plot attributes, e.g. atom.style = 'white' . 
These attributes can be called from any instance with plotting methods. Note that the plot attributes are attached to the class and not the instance. This means that changing the attribute will also change it for all other instances in the module. ATOM's default values are: style: 'darkgrid' palette: 'GnBu_r_d' title_fontsize: 20 label_fontsize: 16 tick_fontsize: 12","title":"Aesthetics"},{"location":"user_guide/#shap","text":"The SHAP (SHapley Additive exPlanations) python package uses a game theoretic approach to explain the output of any machine learning model. It connects optimal credit allocation with local explanations using the classic Shapley values from game theory and their related extensions. ATOM implements methods to plot 4 of shap's plotting functions directly from its API. The explainer will be chosen automatically based on the model's type. For kernelExplainer, the data used to estimate the expected values is the complete training set when <100 rows, else it's summarized with a set of 10 weighted K-means, each weighted by the number of points they represent. The four plots are: force_plot , dependence_plot , summary_plot and decision_plot . Since the plots are not made by ATOM, we can't draw multiple models in the same figure. Selecting more than one model will raise an exception. To avoid this, call the plot from a model , e.g. atom.xgb.force_plot() . Note You can recognize the SHAP plots by the fact that they end (instead of start) with plot.","title":"SHAP"},{"location":"user_guide/#available-plots","text":"A list of available plots can be found hereunder. Note that not all plots can be called from every class and that their availability can depend on the task at hand. plot_correlation Plot the data's correlation matrix. plot_pipeline Plot a diagram of every estimator in atom's pipeline. plot_pca Plot the explained variance ratio vs the number of components. plot_components Plot the explained variance ratio per components.
plot_rfecv Plot the RFECV results. plot_successive_halving Plot of the models' scores per iteration of the successive halving. plot_learning_curve Plot the model's learning curve. plot_bagging Plot a boxplot of the bagging's results. plot_bo Plot the bayesian optimization scoring. plot_evals Plot evaluation curves for the train and test set. plot_roc Plot the Receiver Operating Characteristics curve. plot_prc Plot the precision-recall curve. plot_permutation_importance Plot the feature permutation importance of models. plot_feature_importance Plot a tree-based model's feature importance. plot_partial_dependence Plot the partial dependence of features. plot_errors Plot a model's prediction errors. plot_residuals Plot a model's residuals. plot_confusion_matrix Plot a model's confusion matrix. plot_threshold Plot a metric's performance against threshold values. plot_probabilities Plot the probability distribution of the categories in the target column. plot_calibration Plot the calibration curve for a binary classifier. plot_gains Plot the cumulative gains curve. plot_lift Plot the lift curve. force_plot Plot SHAP's force plot. dependence_plot Plot SHAP's dependence plot. summary_plot Plot SHAP's summary plot. decision_plot Plot SHAP's decision plot.","title":"Available plots"},{"location":"API/models/","text":"Models After fitting, every model class is attached to the training instance as an attribute. We refer to these \"subclasses\" as models (see the nomenclature ). The classes contain a variety of attributes and methods to help you understand how the underlying estimator performed. They can be accessed using the models' acronyms , e.g. atom.LGB to access LightGBM's model . 
The available models and their corresponding acronyms are: 'GP' for Gaussian Process 'GNB' for Gaussian Naive Bayes 'MNB' for Multinomial Naive Bayes 'BNB' for Bernoulli Naive Bayes 'OLS' for Ordinary Least Squares 'Ridge' for Ridge classification/regression 'Lasso' for Lasso regression 'EN' for Elastic Net regression 'BR' for Bayesian Regression 'LR' for Logistic Regression 'LDA' for Linear Discriminant Analysis 'QDA' for Quadratic Discriminant Analysis 'KNN' for K-Nearest Neighbors 'Tree' for Decision Tree 'Bag' for Bagging 'ET' for Extra-Trees 'RF' for Random Forest 'AdaB' for AdaBoost 'GBM' for Gradient Boosting Machine 'XGB' for XGBoost 'LGB' for LightGBM 'CatB' for CatBoost 'lSVM' for Linear-SVM 'kSVM' for Kernel-SVM 'PA' for Passive Aggressive 'SGD' for Stochastic Gradient Descent 'MLP' for Multilayer Perceptron Tip You can also use lowercase to call the models , e.g. atom.lgb.plot_roc() . Warning The models should not be initialized by the user! Only use them through the training instances. Attributes Data attributes You can use the same data attributes as the training instances to check the dataset that was used to fit a particular model. These can differ from each other if the model needs scaled features and the data wasn't already scaled. Note that, unlike with the training instances, the data can not be updated from the models (i.e. the data attributes have no @setter ). Attributes: dataset: pd.DataFrame Complete dataset in the pipeline. train: pd.DataFrame Training set. test: pd.DataFrame Test set. X: pd.DataFrame Feature set. y: pd.Series Target column. X_train: pd.DataFrame Training features. y_train: pd.Series Training target. X_test: pd.DataFrame Test features. y_test: pd.Series Test target. shape: tuple Dataset's shape in the form (rows x columns). columns: list List of columns in the dataset. target: str Name of the target column. categories: list Sorted list of the unique categories in the target column. 
n_categories: int Number of unique categories in the target column. Utility attributes Attributes: bo: pd.DataFrame Dataframe containing the information of every step taken by the BO. Columns include: 'params': Parameters used in the model. 'model': Model used for this iteration (fitted on last cross-validation). 'score': Score of the chosen metric. List of scores for multi-metric. 'time_iteration': Time spent on this iteration. 'time': Total time spent since the start of the BO. best_params: dict Dictionary of the best combination of hyperparameters found by the BO. estimator: class Estimator instance with the best combination of hyperparameters fitted on the complete training set. time_bo: str Time it took to run the bayesian optimization algorithm. metric_bo: float or list Best metric score(s) on the BO. time_fit: str Time it took to train the model on the complete training set and calculate the metric(s) on the test set. metric_train: float or list Metric score(s) on the training set. metric_test: float or list Metric score(s) on the test set. evals: dict Dictionary of the metric calculated during training. The metric is provided by the model's package and is different for every model and every task. Only for models that allow in-training evaluation (XGB, LGB, CatB). Available keys: 'metric': Name of the metric. 'train': List of scores calculated on the training set. 'test': List of scores calculated on the test set. metric_bagging: list Array of the bagging's results. mean_bagging: float Mean of the bagging's results. std_bagging: float Standard deviation of the bagging's results. Prediction attributes The prediction attributes are not calculated until the attribute is called for the first time. This mechanism avoids having to calculate attributes that are never used, saving time and memory. Prediction attributes: predict_train: np.ndarray Predictions of the model on the training set. predict_test: np.ndarray Predictions of the model on the test set.
predict_proba_train: np.ndarray Predicted probabilities of the model on the training set. Only for estimators with a predict_proba method. predict_proba_test: np.ndarray Predicted probabilities of the model on the test set. Only for estimators with a predict_proba method. predict_log_proba_train: np.ndarray Predicted log probabilities of the model on the training set. Only for estimators with a predict_proba method. predict_log_proba_test: np.ndarray Predicted log probabilities of the model on the test set. Only for estimators with a predict_proba method. decision_function_train: np.ndarray Decision function scores on the training set. Only for estimators with a decision_function method. decision_function_test: np.ndarray Decision function scores on the test set. Only for estimators with a decision_function method. score_train: np.float64 Model's score on the training set. score_test: np.float64 Model's score on the test set. Plot attributes Attributes: style: str Plotting style. See seaborn's documentation . palette: str Color palette. See seaborn's documentation . title_fontsize: int Fontsize for the plot's title. label_fontsize: int Fontsize for labels and legends. tick_fontsize: int Fontsize for the ticks along the plot's axes. Methods The majority of the plots and prediction methods can be called directly from the models , e.g. atom.xgb.plot_roc() or atom.xgb.predict_proba(X) . The remaining utility methods can be found hereunder: calibrate Calibrate the model. reset_prediction_attributes Clear all the prediction attributes. scoring Get the scoring of a specific metric on the test set. save_estimator Save the estimator to a pickle file. method calibrate (**kwargs) [source] Applies probability calibration on the winning model. The calibration is done using the CalibratedClassifierCV class from sklearn. The estimator will be trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. 
The new classifier will replace the estimator attribute. After calibrating, all prediction attributes will reset. Parameters: **kwargs Additional keyword arguments for the CalibratedClassifierCV instance. Using cv='prefit' will use the trained model and fit the calibrator on the test set. Note that doing this will result in data leakage in the test set. Use this only if you have another, independent set for testing. method reset_prediction_attributes () [source] Clear all the prediction attributes. Use this method to free some memory before saving the class. method scoring (metric=None, dataset='test') [source] Get the scoring of a specific metric on the test set. Parameters: metric: str or None, optional (default=None) Name of the metric to calculate. Choose from any of sklearn's SCORERS or one of the following custom metrics: 'cm' or 'confusion_matrix' for an array of the confusion matrix. 'tn' for true negatives. 'fp' for false positives. 'fn' for false negatives. 'lift' for the lift metric. 'fpr' for the false positive rate. 'tpr' for true positive rate. 'sup' for the support metric. If None, returns the final results for the model (ignores the dataset parameter). dataset: str, optional (default='test') Data set on which to calculate the metric. Options are 'train' or 'test'. method save_estimator (filename=None) [source] Save the estimator to a pickle file. Parameters: filename: str or None, optional (default=None) Name of the file to save. If None or 'auto', the default name is used.","title":"Models"},{"location":"API/models/#models","text":"After fitting, every model class is attached to the training instance as an attribute. We refer to these \"subclasses\" as models (see the nomenclature ). The classes contain a variety of attributes and methods to help you understand how the underlying estimator performed. They can be accessed using the models' acronyms , e.g. atom.LGB to access LightGBM's model . 
The available models and their corresponding acronyms are: 'GP' for Gaussian Process 'GNB' for Gaussian Naive Bayes 'MNB' for Multinomial Naive Bayes 'BNB' for Bernoulli Naive Bayes 'OLS' for Ordinary Least Squares 'Ridge' for Ridge classification/regression 'Lasso' for Lasso regression 'EN' for Elastic Net regression 'BR' for Bayesian Regression 'LR' for Logistic Regression 'LDA' for Linear Discriminant Analysis 'QDA' for Quadratic Discriminant Analysis 'KNN' for K-Nearest Neighbors 'Tree' for Decision Tree 'Bag' for Bagging 'ET' for Extra-Trees 'RF' for Random Forest 'AdaB' for AdaBoost 'GBM' for Gradient Boosting Machine 'XGB' for XGBoost 'LGB' for LightGBM 'CatB' for CatBoost 'lSVM' for Linear-SVM 'kSVM' for Kernel-SVM 'PA' for Passive Aggressive 'SGD' for Stochastic Gradient Descent 'MLP' for Multilayer Perceptron Tip You can also use lowercase to call the models , e.g. atom.lgb.plot_roc() . Warning The models should not be initialized by the user! Only use them through the training instances.","title":"Models"},{"location":"API/models/#attributes","text":"","title":"Attributes"},{"location":"API/models/#data-attributes","text":"You can use the same data attributes as the training instances to check the dataset that was used to fit a particular model. These can differ from each other if the model needs scaled features and the data wasn't already scaled. Note that, unlike with the training instances, the data can not be updated from the models (i.e. the data attributes have no @setter ). Attributes: dataset: pd.DataFrame Complete dataset in the pipeline. train: pd.DataFrame Training set. test: pd.DataFrame Test set. X: pd.DataFrame Feature set. y: pd.Series Target column. X_train: pd.DataFrame Training features. y_train: pd.Series Training target. X_test: pd.DataFrame Test features. y_test: pd.Series Test target. shape: tuple Dataset's shape in the form (rows x columns). columns: list List of columns in the dataset. target: str Name of the target column. 
categories: list Sorted list of the unique categories in the target column. n_categories: int Number of unique categories in the target column.","title":"Data attributes"},{"location":"API/models/#utility-attributes","text":"Attributes: bo: pd.DataFrame Dataframe containing the information of every step taken by the BO. Columns include: 'params': Parameters used in the model. 'model': Model used for this iteration (fitted on last cross-validation). 'score': Score of the chosen metric. List of scores for multi-metric. 'time_iteration': Time spent on this iteration. 'time': Total time spent since the start of the BO. best_params: dict Dictionary of the best combination of hyperparameters found by the BO. estimator: class Estimator instance with the best combination of hyperparameters fitted on the complete training set. time_bo: str Time it took to run the bayesian optimization algorithm. metric_bo: float or list Best metric score(s) on the BO. time_fit: str Time it took to train the model on the complete training set and calculate the metric(s) on the test set. metric_train: float or list Metric score(s) on the training set. metric_test: float or list Metric score(s) on the test set. evals: dict Dictionary of the metric calculated during training. The metric is provided by the model's package and is different for every model and every task. Only for models that allow in-training evaluation (XGB, LGB, CatB). Available keys: 'metric': Name of the metric. 'train': List of scores calculated on the training set. 'test': List of scores calculated on the test set. metric_bagging: list Array of the bagging's results. mean_bagging: float Mean of the bagging's results. std_bagging: float Standard deviation of the bagging's results.","title":"Utility attributes"},{"location":"API/models/#prediction-attributes","text":"The prediction attributes are not calculated until the attribute is called for the first time.
This mechanism avoids having to calculate attributes that are never used, saving time and memory. Prediction attributes: predict_train: np.ndarray Predictions of the model on the training set. predict_test: np.ndarray Predictions of the model on the test set. predict_proba_train: np.ndarray Predicted probabilities of the model on the training set. Only for estimators with a predict_proba method. predict_proba_test: np.ndarray Predicted probabilities of the model on the test set. Only for estimators with a predict_proba method. predict_log_proba_train: np.ndarray Predicted log probabilities of the model on the training set. Only for estimators with a predict_proba method. predict_log_proba_test: np.ndarray Predicted log probabilities of the model on the test set. Only for estimators with a predict_proba method. decision_function_train: np.ndarray Decision function scores on the training set. Only for estimators with a decision_function method. decision_function_test: np.ndarray Decision function scores on the test set. Only for estimators with a decision_function method. score_train: np.float64 Model's score on the training set. score_test: np.float64 Model's score on the test set.","title":"Prediction attributes"},{"location":"API/models/#plot-attributes","text":"Attributes: style: str Plotting style. See seaborn's documentation . palette: str Color palette. See seaborn's documentation . title_fontsize: int Fontsize for the plot's title. label_fontsize: int Fontsize for labels and legends. tick_fontsize: int Fontsize for the ticks along the plot's axes.","title":"Plot attributes"},{"location":"API/models/#methods","text":"The majority of the plots and prediction methods can be called directly from the models , e.g. atom.xgb.plot_roc() or atom.xgb.predict_proba(X) . The remaining utility methods can be found hereunder: calibrate Calibrate the model. reset_prediction_attributes Clear all the prediction attributes. 
scoring Get the scoring of a specific metric on the test set. save_estimator Save the estimator to a pickle file. method calibrate (**kwargs) [source] Applies probability calibration on the winning model. The calibration is done using the CalibratedClassifierCV class from sklearn. The estimator will be trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the estimator attribute. After calibrating, all prediction attributes will reset. Parameters: **kwargs Additional keyword arguments for the CalibratedClassifierCV instance. Using cv='prefit' will use the trained model and fit the calibrator on the test set. Note that doing this will result in data leakage in the test set. Use this only if you have another, independent set for testing. method reset_prediction_attributes () [source] Clear all the prediction attributes. Use this method to free some memory before saving the class. method scoring (metric=None, dataset='test') [source] Get the scoring of a specific metric on the test set. Parameters: metric: str or None, optional (default=None) Name of the metric to calculate. Choose from any of sklearn's SCORERS or one of the following custom metrics: 'cm' or 'confusion_matrix' for an array of the confusion matrix. 'tn' for true negatives. 'fp' for false positives. 'fn' for false negatives. 'lift' for the lift metric. 'fpr' for the false positive rate. 'tpr' for true positive rate. 'sup' for the support metric. If None, returns the final results for the model (ignores the dataset parameter). dataset: str, optional (default='test') Data set on which to calculate the metric. Options are 'train' or 'test'. method save_estimator (filename=None) [source] Save the estimator to a pickle file. Parameters: filename: str or None, optional (default=None) Name of the file to save. 
If None or 'auto', the default name is used.","title":"Methods"},{"location":"API/ATOM/atomclassifier/","text":"ATOMClassifier class atom.api. ATOMClassifier (X, y=-1, n_rows=1, test_size=0.2, logger=None, n_jobs=1, warnings=True, verbose=0, random_state=None) [source] ATOMClassifier is ATOM's wrapper for binary and multiclass classification tasks. Use this class to easily apply all data transformations and model management provided by the package on a given dataset. Note that contrary to scikit-learn's API, the ATOMClassifier object already contains the dataset on which we want to perform the analysis. Calling a method will automatically apply it on the dataset it contains. The class initializer always calls StandardCleaner with default parameters. The following data types can't (yet) be handled properly and are therefore removed: 'datetime64', 'datetime64[ns]', 'timedelta[ns]'. You can predict , plot and call any model from the ATOMClassifier instance. Read more in the user guide . Parameters: X: dict, sequence, np.array or pd.DataFrame Dataset containing the features, with shape=(n_samples, n_features). y: int, str, sequence, np.array or pd.Series, optional (default=-1) If int: Position of the target column in X. The default value selects the last column. If string: Name of the target column in X Else: Data target column with shape=(n_samples,) n_rows: int or float, optional (default=1) if <=1: Fraction of the data to use. if >1: Number of rows to use. test_size: float, optional (default=0.2) Split fraction for the training and test set. n_jobs: int, optional (default=1) Number of cores to use for parallel processing. If >0: Number of cores to use. If -1: Use all available cores. If <-1: Use available_cores - 1 + n_jobs. Beware that using multiple processes on the same machine may cause memory issues for large datasets. verbose: int, optional (default=0) Verbosity level of the class. Possible values are: 0 to not print anything. 1 to print basic information. 
2 to print detailed information. warnings: bool or str, optional (default=True) If True: Default warning action (equal to 'default' when string). If False: Suppress all warnings (equal to 'ignore' when string). If str: One of the possible actions in python's warnings environment. Note that changing this parameter will affect the PYTHONWARNINGS environment. Note that ATOM can't manage warnings that go directly from C++ code to the stdout/stderr. logger: bool, str, class or None, optional (default=None) If None: Doesn't save a logging file. If bool: True for logging file with default name. False for no logger. If str: Name of the logging file. 'auto' for default name. If class: python Logger object. Note that warnings will not be saved to the logger in any case. random_state: int or None, optional (default=None) Seed used by the random number generator. If None, the random number generator is the RandomState instance used by numpy.random . Attributes Data attributes The dataset within ATOM's pipeline can be accessed at any time through multiple properties, e.g. calling atom.train will return the training set. The data can also be changed through these properties, e.g. atom.test = atom.test.drop(0) will drop the first row from the test set. This will also update the other data attributes. Attributes: dataset: pd.DataFrame Complete dataset in the pipeline. train: pd.DataFrame Training set. test: pd.DataFrame Test set. X: pd.DataFrame Feature set. y: pd.Series Target column. X_train: pd.DataFrame Training features. y_train: pd.Series Training target. X_test: pd.DataFrame Test features. y_test: pd.Series Test target. shape: tuple Dataset's shape in the form (rows x columns). columns: list List of columns in the dataset. target: str Name of the target column. categories: list Sorted list of the unique categories in the target column. n_categories: int Number of unique categories in the target column.
mapping: dict Dictionary of the target categories mapped to their respective encoded integer. missing: pd.Series Returns columns with number of missing values. n_missing: int Number of columns with missing values. categorical: list Returns columns with categorical features. n_categorical: int Number of columns with categorical features. scaled: bool Returns whether the feature set is scaled. Utility attributes Attributes: profile: ProfileReport Profile created by pandas-profiling after calling the report method. genetic_features: pd.DataFrame Dataframe of the non-linear features created by the feature_generation method. Columns include: name: Name of the feature (automatically created). description: Operators used to create this feature. fitness: Fitness score. collinear: pd.DataFrame Dataframe of the collinear features removed by the feature_selection method. Columns include: drop_feature: name of the feature dropped by the method. correlated feature: Name of the correlated feature(s). correlation_value: Pearson correlation coefficient(s) of the feature pairs. models: list List of models in the pipeline. metric: str or list Metric(s) used to fit the models. errors: dict Dictionary of the encountered exceptions (if any). winner: model Model subclass that performed best on the test set. results: pd.DataFrame Dataframe of the training results with the model acronyms as index. Columns can include: name: Name of the model. metric_bo: Best score achieved during the BO. time_bo: Time spent on the BO. metric_train: Metric score on the training set. metric_test: Metric score on the test set. time_fit: Time spent fitting and evaluating. mean_bagging: Mean score of the bagging's results. std_bagging: Standard deviation score of the bagging's results. time_bagging: Time spent on the bagging algorithm. time: Total time spent on the whole run. Plot attributes Attributes: style: str Plotting style. See seaborn's documentation . palette: str Color palette. 
See seaborn's documentation . title_fontsize: int Fontsize for the plot's title. label_fontsize: int Fontsize for labels and legends. tick_fontsize: int Fontsize for the ticks along the plot's axes. Utility methods The ATOM class contains a variety of methods to help you handle the data and inspect the pipeline. calibrate Calibrate the winning model. clear Remove a model from the pipeline. log Save information to the logger and print to stdout. report Get an extensive profile analysis of the data. save Save the ATOMClassifier instance to a pickle file. scoring Print the scoring of the models for a specific metric. stats Print out a list of basic statistics on the dataset. method calibrate (**kwargs) [source] Applies probability calibration on the winning model. The calibration is done with the CalibratedClassifierCV class from sklearn. The model will be trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the estimator attribute. After calibrating, all prediction attributes of the winning model will reset. Parameters: **kwargs Additional keyword arguments for the CalibratedClassifierCV instance. Using cv='prefit' will use the trained model and fit the calibrator on the test set. Note that doing this will result in data leakage in the test set. Use this only if you have another, independent set for testing. method clear (models='all') [source] Removes all traces of a model in the pipeline (except for the errors attribute). If all models in the pipeline are removed, the metric is reset. Use this method to remove unwanted models from the pipeline or to clear memory before saving the instance. Parameters: models: str or sequence, optional (default='all') Model(s) to clear from the pipeline. If 'all', clear all models. method log (msg, level=0) [source] Write a message to the logger and print it to stdout. Parameters: msg: str Message to write to the logger and print to stdout. 
level: int, optional (default=0) Minimum verbosity level in order to print the message. method report (dataset='dataset', n_rows=None, filename=None) [source] Get an extensive profile analysis of the data. The report is rendered in HTML5 and CSS3 and saved to the profile attribute. Note that this method can be slow for n_rows > 10k. Parameters: dataset: str, optional (default='dataset') Name of the data set to get the profile from. n_rows: int or None, optional (default=None) Number of (randomly picked) rows to process. None for all rows. filename: str or None, optional (default=None) Name of the file when saved (as .html). None to not save anything. method save (filename=None, save_data=True) [source] Save the instance to a pickle file. Remember that the class contains the complete dataset as property, so the file can become large for big datasets! To avoid this, use save_data=False . Parameters: filename: str or None, optional (default=None) Name to save the file with. If None or 'auto', use the name of the class. save_data: bool, optional (default=True) Whether to save the data as an attribute of the instance. If False, remember to update the data immediately after loading the pickle using the dataset's @setter . method scoring (metric=None, dataset='test') [source] Print the scoring of the models for a specific metric. If a model shows a XXX , it means the metric failed for that specific model. This can happen if either the metric is unavailable for the task or if the model does not have a predict_proba method while the metric requires it. Parameters: metric: str or None, optional (default=None) Name of the metric to calculate. Choose from any of sklearn's SCORERS or one of the following custom metrics: 'cm' or 'confusion_matrix' for an array of the confusion matrix. 'tn' for true negatives. 'fp' for false positives. 'fn' for false negatives. 'lift' for the lift metric. 'fpr' for the false positive rate. 'tpr' for true positive rate. 'sup' for the support metric.
If None, returns the final results for the model (ignores the dataset parameter). dataset: str, optional (default='test') Data set on which to calculate the metric. Options are 'train' or 'test'. method stats () [source] Print out a list of basic information on the dataset. Data cleaning ATOM provides data cleaning methods to scale your features and handle missing values, categorical columns, outliers and unbalanced datasets. Calling on one of them will automatically apply the method on the dataset in the pipeline. Tip Use the report method to examine the data and help you determine suitable parameters for the data cleaning methods. scale Scale all the features to mean=0 and std=1. impute Handle missing values in the dataset. encode Encode categorical features. outliers Remove or replace outliers in the training set. balance Balance the target categories in the training set. method scale () [source] Scale the features to mean=0 and std=1. method impute (strat_num='drop', strat_cat='drop', min_frac_rows=0.5, min_frac_cols=0.5, missing=None) [source] Handle missing values according to the selected strategy. Also removes rows and columns with too many missing values. The imputer is fitted only on the training set to avoid data leakage. See Imputer for a description of the parameters. Note that since the Imputer can remove rows from both train and test set, the set's sizes may change to keep ATOM's test_size ratio. method encode (strategy='LeaveOneOut', max_onehot=10, frac_to_other=None) [source] Perform encoding of categorical features. The encoding type depends on the number of unique values in the column: If n_unique=2, use Label-encoding. If 2 < n_unique <= max_onehot, use OneHot-encoding. If n_unique > max_onehot, use strategy -encoding. Also replaces classes with low occurrences with the value 'other' in order to prevent too high cardinality. Categorical features are defined as all columns whose dtype.kind not in 'ifu'.
Will raise an error if it encounters missing values or unknown categories when transforming. The encoder is fitted only on the training set to avoid data leakage. See Encoder for a description of the parameters. method outliers (strategy='drop', max_sigma=3, include_target=False) [source] Remove or replace outliers in the training set. Outliers are defined as values that lie further than max_sigma * standard_deviation away from the mean of the column. Only outliers from the training set are removed to maintain an original sample of target values in the test set. Ignores categorical columns. See Outliers for a description of the parameters. method balance (strategy='ADASYN', **kwargs) [source] Balance the number of instances per target category in the training set. Only the training set is balanced in order to maintain the original distribution of target categories in the test set. See Balancer for a description of the parameters. Feature engineering To further pre-process the data you can create new non-linear features transforming the existing ones or, if your dataset is too large, remove features using one of the provided strategies. feature_generation Create new features from combinations of existing ones. feature_selection Remove features according to the selected strategy. method feature_generation (strategy='DFS', n_features=None, generations=20, population=500, operators=None) [source] Use Deep feature Synthesis or a genetic algorithm to create new combinations of existing features to capture the non-linear relations between the original features. See FeatureGenerator for a description of the parameters. Attributes created by the class are attached to the ATOM instance. method feature_selection (strategy=None, solver=None, n_features=None, max_frac_repeated=1., max_correlation=1., **kwargs) [source] Remove features according to the selected strategy. Ties between features with equal scores will be broken in an unspecified way. 
Also removes features with too low variance and finds pairs of collinear features based on the Pearson correlation coefficient. For each pair above the specified limit (in terms of absolute value), it removes one of the two. See FeatureSelector for a description of the parameters. Plotting methods and attributes created by the class are attached to the instance. Note When strategy='univariate' and solver=None, f_classif will be used as default solver. When strategy is one of 'SFM', 'RFE' or 'RFECV' and the solver is one of ATOM's models, the algorithm will automatically select the classifier (no need to add _class to the solver). When strategy is one of 'SFM', 'RFE' or 'RFECV' and solver=None, ATOM will use the winning model (if it exists) as solver. When strategy='RFECV', ATOM will use the metric in the pipeline (if it exists) as the scoring parameter (only if not specified manually). Training The training methods are where the models are fitted to the data and their performance is evaluated according to the selected metric. ATOMClassifier contains three methods to call the training classes from the ATOM package. All relevant attributes and methods from the training classes are attached to ATOMClassifier for convenience. These include the errors, winner and results attributes, the models , and the prediction and plotting methods. run Fit the models to the data in a direct fashion. successive_halving Fit the models to the data in a successive halving fashion. train_sizing Fit the models to the data in a train sizing fashion. method run (models, metric=None, greater_is_better=True, needs_proba=False, needs_threshold=False, n_calls=10, n_initial_points=5, bo_params={}, bagging=None) [source] Runs a TrainerClassifier instance. Using this class through atom allows subsequent runs with different models without losing previous information. 
method successive_halving (models, metric=None, greater_is_better=True, needs_proba=False, needs_threshold=False, skip_iter=0, n_calls=0, n_initial_points=5, bo_params={}, bagging=None) [source] Runs a SuccessiveHalvingClassifier instance. method train_sizing (models, metric=None, greater_is_better=True, needs_proba=False, needs_threshold=False, train_sizes=np.linspace(0.2, 1.0, 5), n_calls=0, n_initial_points=5, bo_params={}, bagging=None) [source] Runs a TrainSizingClassifier instance. Example from sklearn.datasets import load_breast_cancer from atom import ATOMClassifier X, y = load_breast_cancer(return_X_y=True) # Initialize class atom = ATOMClassifier(X, y, logger='auto', n_jobs=2, verbose=2) # Apply data cleaning methods atom.outliers(strategy='min_max', max_sigma=2) atom.balance(strategy='smote', sampling_strategy=0.7) # Fit the models to the data atom.run(models=['QDA', 'CatB'], metric='precision', n_calls=25, n_initial_points=10, bo_params={'cv': 1}, bagging=4) # Analyze the results print(f\"The winning model is: {atom.winner.name}\") print(atom.results) # Make some plots atom.palette = 'Blues' atom.plot_roc(figsize=(9, 6), filename='roc.png') atom.CatB.plot_feature_importance(filename='catboost_feature_importance.png') # Run an extra model atom.run(models='LR', metric='precision', n_calls=25, n_initial_points=10, bo_params={'cv': 1}, bagging=4) # Get the predictions for the best model on new data predictions = atom.predict(X_new)","title":"ATOMClassifier"},{"location":"API/ATOM/atomclassifier/#atomclassifier","text":"class atom.api. ATOMClassifier (X, y=-1, n_rows=1, test_size=0.2, logger=None, n_jobs=1, warnings=True, verbose=0, random_state=None) [source] ATOMClassifier is ATOM's wrapper for binary and multiclass classification tasks. Use this class to easily apply all data transformations and model management provided by the package on a given dataset. 
Note that contrary to scikit-learn's API, the ATOMClassifier object already contains the dataset on which we want to perform the analysis. Calling a method will automatically apply it on the dataset it contains. The class initializer always calls StandardCleaner with default parameters. The following data types can't (yet) be handled properly and are therefore removed: 'datetime64', 'datetime64[ns]', 'timedelta[ns]'. You can predict , plot and call any model from the ATOMClassifier instance. Read more in the user guide . Parameters: X: dict, sequence, np.array or pd.DataFrame Dataset containing the features, with shape=(n_samples, n_features). y: int, str, sequence, np.array or pd.Series, optional (default=-1) If int: Position of the target column in X. The default value selects the last column. If string: Name of the target column in X Else: Data target column with shape=(n_samples,) n_rows: int or float, optional (default=1) if <=1: Fraction of the data to use. if >1: Number of rows to use. test_size: float, optional (default=0.2) Split fraction for the training and test set. n_jobs: int, optional (default=1) Number of cores to use for parallel processing. If >0: Number of cores to use. If -1: Use all available cores. If <-1: Use available_cores - 1 + n_jobs. Beware that using multiple processes on the same machine may cause memory issues for large datasets. verbose: int, optional (default=0) Verbosity level of the class. Possible values are: 0 to not print anything. 1 to print basic information. 2 to print detailed information. warnings: bool or str, optional (default=True) If True: Default warning action (equal to 'default' when string). If False: Suppress all warnings (equal to 'ignore' when string). If str: One of the possible actions in python's warnings environment. Note that changing this parameter will affect the PYTHONWARNINGS environment. Note that ATOM can't manage warnings that go directly from C++ code to the stdout/stderr. 
logger: bool, str, class or None, optional (default=None) If None: Doesn't save a logging file. If bool: True for logging file with default name. False for no logger. If str: Name of the logging file. 'auto' for default name. If class: python Logger object. Note that warnings will not be saved to the logger in any case. random_state: int or None, optional (default=None) Seed used by the random number generator. If None, the random number generator is the RandomState instance used by numpy.random .","title":"ATOMClassifier"},{"location":"API/ATOM/atomclassifier/#attributes","text":"","title":"Attributes"},{"location":"API/ATOM/atomclassifier/#data-attributes","text":"The dataset within ATOM's pipeline can be accessed at any time through multiple properties, e.g. calling atom.train will return the training set. The data can also be changed through these properties, e.g. atom.test = atom.test.drop(0) will drop the first row from the test set. This will also update the other data attributes. Attributes: dataset: pd.DataFrame Complete dataset in the pipeline. train: pd.DataFrame Training set. test: pd.DataFrame Test set. X: pd.DataFrame Feature set. y: pd.Series Target column. X_train: pd.DataFrame Training features. y_train: pd.Series Training target. X_test: pd.DataFrame Test features. y_test: pd.Series Test target. shape: tuple Dataset's shape in the form (rows x columns). columns: list List of columns in the dataset. target: str Name of the target column. categories: list Sorted list of the unique categories in the target column. n_categories: int Number of unique categories in the target column. mapping: dict Dictionary of the target categories mapped to their respective encoded integer. missing: pd.Series Returns columns with number of missing values. n_missing: int Number of columns with missing values. categorical: list Returns columns with categorical features. n_categorical: int Number of columns with categorical features.
scaled: bool Returns whether the feature set is scaled.","title":"Data attributes"},{"location":"API/ATOM/atomclassifier/#utility-attributes","text":"Attributes: profile: ProfileReport Profile created by pandas-profiling after calling the report method. genetic_features: pd.DataFrame Dataframe of the non-linear features created by the feature_generation method. Columns include: name: Name of the feature (automatically created). description: Operators used to create this feature. fitness: Fitness score. collinear: pd.DataFrame Dataframe of the collinear features removed by the feature_selection method. Columns include: drop_feature: name of the feature dropped by the method. correlated feature: Name of the correlated feature(s). correlation_value: Pearson correlation coefficient(s) of the feature pairs. models: list List of models in the pipeline. metric: str or list Metric(s) used to fit the models. errors: dict Dictionary of the encountered exceptions (if any). winner: model Model subclass that performed best on the test set. results: pd.DataFrame Dataframe of the training results with the model acronyms as index. Columns can include: name: Name of the model. metric_bo: Best score achieved during the BO. time_bo: Time spent on the BO. metric_train: Metric score on the training set. metric_test: Metric score on the test set. time_fit: Time spent fitting and evaluating. mean_bagging: Mean score of the bagging's results. std_bagging: Standard deviation score of the bagging's results. time_bagging: Time spent on the bagging algorithm. time: Total time spent on the whole run.","title":"Utility attributes"},{"location":"API/ATOM/atomclassifier/#plot-attributes","text":"Attributes: style: str Plotting style. See seaborn's documentation . palette: str Color palette. See seaborn's documentation . title_fontsize: int Fontsize for the plot's title. label_fontsize: int Fontsize for labels and legends. 
tick_fontsize: int Fontsize for the ticks along the plot's axes.","title":"Plot attributes"},{"location":"API/ATOM/atomclassifier/#utility-methods","text":"The ATOM class contains a variety of methods to help you handle the data and inspect the pipeline. calibrate Calibrate the winning model. clear Remove a model from the pipeline. log Save information to the logger and print to stdout. report Get an extensive profile analysis of the data. save Save the ATOMClassifier instance to a pickle file. scoring Print the scoring of the models for a specific metric. stats Print out a list of basic statistics on the dataset. method calibrate (**kwargs) [source] Applies probability calibration on the winning model. The calibration is done with the CalibratedClassifierCV class from sklearn. The model will be trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the estimator attribute. After calibrating, all prediction attributes of the winning model will reset. Parameters: **kwargs Additional keyword arguments for the CalibratedClassifierCV instance. Using cv='prefit' will use the trained model and fit the calibrator on the test set. Note that doing this will result in data leakage in the test set. Use this only if you have another, independent set for testing. method clear (models='all') [source] Removes all traces of a model in the pipeline (except for the errors attribute). If all models in the pipeline are removed, the metric is reset. Use this method to remove unwanted models from the pipeline or to clear memory before saving the instance. Parameters: models: str or sequence, optional (default='all') Model(s) to clear from the pipeline. If 'all', clear all models. method log (msg, level=0) [source] Write a message to the logger and print it to stdout. Parameters: msg: str Message to write to the logger and print to stdout. 
level: int, optional (default=0) Minimum verbosity level in order to print the message. method report (dataset='dataset', n_rows=None, filename=None) [source] Get an extensive profile analysis of the data. The report is rendered in HTML5 and CSS3 and saved to the profile attribute. Note that this method can be slow for n_rows > 10k. Parameters: dataset: str, optional (default='dataset') Name of the data set to get the profile from. n_rows: int or None, optional (default=None) Number of (randomly picked) rows to process. None for all rows. filename: str or None, optional (default=None) Name of the file when saved (as .html). None to not save anything. method save (filename=None, save_data=True) [source] Save the instance to a pickle file. Remember that the class contains the complete dataset as property, so the file can become large for big datasets! To avoid this, use save_data=False . Parameters: filename: str or None, optional (default=None) Name to save the file with. If None or 'auto', use the name of the class. save_data: bool, optional (default=True) Whether to save the data as an attribute of the instance. If False, remember to update the data immediately after loading the pickle using the dataset's @setter . method scoring (metric=None, dataset='test') [source] Print the scoring of the models for a specific metric. If a model shows a XXX , it means the metric failed for that specific model. This can happen if either the metric is unavailable for the task or if the model does not have a predict_proba method while the metric requires it. Parameters: metric: str or None, optional (default=None) Name of the metric to calculate. Choose from any of sklearn's SCORERS or one of the following custom metrics: 'cm' or 'confusion_matrix' for an array of the confusion matrix. 'tn' for true negatives. 'fp' for false positives. 'fn' for false negatives. 'lift' for the lift metric. 'fpr' for the false positive rate. 'tpr' for true positive rate. 'sup' for the support metric.
If None, returns the final results for the model (ignores the dataset parameter). dataset: str, optional (default='test') Data set on which to calculate the metric. Options are 'train' or 'test'. method stats () [source] Print out a list of basic information on the dataset.","title":"Utility methods"},{"location":"API/ATOM/atomclassifier/#data-cleaning","text":"ATOM provides data cleaning methods to scale your features and handle missing values, categorical columns, outliers and unbalanced datasets. Calling on one of them will automatically apply the method on the dataset in the pipeline. Tip Use the report method to examine the data and help you determine suitable parameters for the data cleaning methods. scale Scale all the features to mean=0 and std=1. impute Handle missing values in the dataset. encode Encode categorical features. outliers Remove or replace outliers in the training set. balance Balance the target categories in the training set. method scale () [source] Scale the features to mean=0 and std=1. method impute (strat_num='drop', strat_cat='drop', min_frac_rows=0.5, min_frac_cols=0.5, missing=None) [source] Handle missing values according to the selected strategy. Also removes rows and columns with too many missing values. The imputer is fitted only on the training set to avoid data leakage. See Imputer for a description of the parameters. Note that since the Imputer can remove rows from both train and test set, the set's sizes may change to keep ATOM's test_size ratio. method encode (strategy='LeaveOneOut', max_onehot=10, frac_to_other=None) [source] Perform encoding of categorical features. The encoding type depends on the number of unique values in the column: If n_unique=2, use Label-encoding. If 2 < n_unique <= max_onehot, use OneHot-encoding. If n_unique > max_onehot, use strategy -encoding. Also replaces classes with low occurrences with the value 'other' in order to prevent too high cardinality.
Categorical features are defined as all columns whose dtype.kind not in 'ifu'. Will raise an error if it encounters missing values or unknown categories when transforming. The encoder is fitted only on the training set to avoid data leakage. See Encoder for a description of the parameters. method outliers (strategy='drop', max_sigma=3, include_target=False) [source] Remove or replace outliers in the training set. Outliers are defined as values that lie further than max_sigma * standard_deviation away from the mean of the column. Only outliers from the training set are removed to maintain an original sample of target values in the test set. Ignores categorical columns. See Outliers for a description of the parameters. method balance (strategy='ADASYN', **kwargs) [source] Balance the number of instances per target category in the training set. Only the training set is balanced in order to maintain the original distribution of target categories in the test set. See Balancer for a description of the parameters.","title":"Data cleaning"},{"location":"API/ATOM/atomclassifier/#feature-engineering","text":"To further pre-process the data you can create new non-linear features transforming the existing ones or, if your dataset is too large, remove features using one of the provided strategies. feature_generation Create new features from combinations of existing ones. feature_selection Remove features according to the selected strategy. method feature_generation (strategy='DFS', n_features=None, generations=20, population=500, operators=None) [source] Use Deep feature Synthesis or a genetic algorithm to create new combinations of existing features to capture the non-linear relations between the original features. See FeatureGenerator for a description of the parameters. Attributes created by the class are attached to the ATOM instance. 
method feature_selection (strategy=None, solver=None, n_features=None, max_frac_repeated=1., max_correlation=1., **kwargs) [source] Remove features according to the selected strategy. Ties between features with equal scores will be broken in an unspecified way. Also removes features with too low variance and finds pairs of collinear features based on the Pearson correlation coefficient. For each pair above the specified limit (in terms of absolute value), it removes one of the two. See FeatureSelector for a description of the parameters. Plotting methods and attributes created by the class are attached to the instance. Note When strategy='univariate' and solver=None, f_classif will be used as default solver. When strategy is one of 'SFM', 'RFE' or 'RFECV' and the solver is one of ATOM's models, the algorithm will automatically select the classifier (no need to add _class to the solver). When strategy is one of 'SFM', 'RFE' or 'RFECV' and solver=None, ATOM will use the winning model (if it exists) as solver. When strategy='RFECV', ATOM will use the metric in the pipeline (if it exists) as the scoring parameter (only if not specified manually).","title":"Feature engineering"},{"location":"API/ATOM/atomclassifier/#training","text":"The training methods are where the models are fitted to the data and their performance is evaluated according to the selected metric. ATOMClassifier contains three methods to call the training classes from the ATOM package. All relevant attributes and methods from the training classes are attached to ATOMClassifier for convenience. These include the errors, winner and results attributes, the models , and the prediction and plotting methods. run Fit the models to the data in a direct fashion. successive_halving Fit the models to the data in a successive halving fashion. train_sizing Fit the models to the data in a train sizing fashion. 
method run (models, metric=None, greater_is_better=True, needs_proba=False, needs_threshold=False, n_calls=10, n_initial_points=5, bo_params={}, bagging=None) [source] Runs a TrainerClassifier instance. Using this class through atom allows subsequent runs with different models without losing previous information. method successive_halving (models, metric=None, greater_is_better=True, needs_proba=False, needs_threshold=False, skip_iter=0, n_calls=0, n_initial_points=5, bo_params={}, bagging=None) [source] Runs a SuccessiveHalvingClassifier instance. method train_sizing (models, metric=None, greater_is_better=True, needs_proba=False, needs_threshold=False, train_sizes=np.linspace(0.2, 1.0, 5), n_calls=0, n_initial_points=5, bo_params={}, bagging=None) [source] Runs a TrainSizingClassifier instance.","title":"Training"},{"location":"API/ATOM/atomclassifier/#example","text":"from sklearn.datasets import load_breast_cancer from atom import ATOMClassifier X, y = load_breast_cancer(return_X_y=True) # Initialize class atom = ATOMClassifier(X, y, logger='auto', n_jobs=2, verbose=2) # Apply data cleaning methods atom.outliers(strategy='min_max', max_sigma=2) atom.balance(strategy='smote', sampling_strategy=0.7) # Fit the models to the data atom.run(models=['QDA', 'CatB'], metric='precision', n_calls=25, n_initial_points=10, bo_params={'cv': 1}, bagging=4) # Analyze the results print(f\"The winning model is: {atom.winner.name}\") print(atom.results) # Make some plots atom.palette = 'Blues' atom.plot_roc(figsize=(9, 6), filename='roc.png') atom.CatB.plot_feature_importance(filename='catboost_feature_importance.png') # Run an extra model atom.run(models='LR', metric='precision', n_calls=25, n_initial_points=10, bo_params={'cv': 1}, bagging=4) # Get the predictions for the best model on new data predictions = atom.predict(X_new)","title":"Example"},{"location":"API/ATOM/atomloader/","text":"ATOMLoader function ATOMLoader (filename=None, X=None, y=-1, transform_data=True, 
verbose=None) [source] Load a class instance from a pickle file. If the file is a training instance that was saved using save_data=False , you can load new data into it. If the file is an atom instance, you can also apply all data transformations in the pipeline to the provided data. Parameters: filename: str Name of the pickle file to load. X: dict, sequence, np.array, pd.DataFrame or None, optional (default=None) Data containing the features, with shape=(n_samples, n_features). Only use this parameter if the file is a training instance that was saved using save_data=False . See the save method. y: int, str, sequence, np.array or pd.Series, optional (default=-1) If int: Index of the target column in X. If string: Name of the target column in X. Else: Target column with shape=(n_samples,). This parameter is ignored if X=None. transform_data: bool, optional (default=True) Whether to transform the provided data through all the steps in the instance's pipeline. This parameter is ignored if the loaded file is not an atom instance. verbose: int or None, optional (default=None) Verbosity level of the transformations applied on the new data. If None, use the verbosity from the loaded instance. This parameter is ignored if the loaded file is not an atom instance. Example from atom import ATOMClassifier, ATOMLoader # Save an atom instance to a pickle file atom = ATOMClassifier(X, y) atom.encode(strategy='Helmert', max_onehot=12) atom.run('LR', metric='AP', n_calls=25, n_initial_points=10) atom.save('atom_lr', save_data=False) # Load the class and add the transformed data to the new instance atom_2 = ATOMLoader('atom_lr', X, y, verbose=0)","title":"ATOMLoader"},{"location":"API/ATOM/atomloader/#atomloader","text":"function ATOMLoader (filename=None, X=None, y=-1, transform_data=True, verbose=None) [source] Load a class instance from a pickle file. If the file is a training instance that was saved using save_data=False , you can load new data into it. 
If the file is an atom instance, you can also apply all data transformations in the pipeline to the provided data. Parameters: filename: str Name of the pickle file to load. X: dict, sequence, np.array, pd.DataFrame or None, optional (default=None) Data containing the features, with shape=(n_samples, n_features). Only use this parameter if the file is a training instance that was saved using save_data=False . See the save method. y: int, str, sequence, np.array or pd.Series, optional (default=-1) If int: Index of the target column in X. If string: Name of the target column in X. Else: Target column with shape=(n_samples,). This parameter is ignored if X=None. transform_data: bool, optional (default=True) Whether to transform the provided data through all the steps in the instance's pipeline. This parameter is ignored if the loaded file is not an atom instance. verbose: int or None, optional (default=None) Verbosity level of the transformations applied on the new data. If None, use the verbosity from the loaded instance. This parameter is ignored if the loaded file is not an atom instance.","title":"ATOMLoader"},{"location":"API/ATOM/atomloader/#example","text":"from atom import ATOMClassifier, ATOMLoader # Save an atom instance to a pickle file atom = ATOMClassifier(X, y) atom.encode(strategy='Helmert', max_onehot=12) atom.run('LR', metric='AP', n_calls=25, n_initial_points=10) atom.save('atom_lr', save_data=False) # Load the class and add the transformed data to the new instance atom_2 = ATOMLoader('atom_lr', X, y, verbose=0)","title":"Example"},{"location":"API/ATOM/atomregressor/","text":"ATOMRegressor class atom.api. ATOMRegressor (X, y=-1, n_rows=1, test_size=0.2, logger=None, n_jobs=1, warnings=True, verbose=0, random_state=None) [source] ATOMRegressor is ATOM's wrapper for regression tasks. Use this class to easily apply all data transformations and model management provided by the package on a given dataset. 
Note that contrary to scikit-learn's API, the ATOMRegressor object already contains the dataset on which we want to perform the analysis. Calling a method will automatically apply it on the dataset it contains. The class initializer always calls StandardCleaner with default parameters. The following data types can't (yet) be handled properly and are therefore removed: 'datetime64', 'datetime64[ns]', 'timedelta[ns]'. You can predict , plot and call any model from the ATOMRegressor instance. Read more in the user guide . Parameters: X: dict, sequence, np.array or pd.DataFrame Dataset containing the features, with shape=(n_samples, n_features). y: int, str, sequence, np.array or pd.Series, optional (default=-1) If int: Position of the target column in X. The default value selects the last column. If string: Name of the target column in X Else: Data target column with shape=(n_samples,) n_rows: int or float, optional (default=1) if <=1: Fraction of the data to use. if >1: Number of rows to use. test_size: float, optional (default=0.2) Split fraction for the training and test set. n_jobs: int, optional (default=1) Number of cores to use for parallel processing. If >0: Number of cores to use. If -1: Use all available cores. If <-1: Use available_cores - 1 + n_jobs. Beware that using multiple processes on the same machine may cause memory issues for large datasets. verbose: int, optional (default=0) Verbosity level of the class. Possible values are: 0 to not print anything. 1 to print basic information. 2 to print detailed information. warnings: bool or str, optional (default=True) If True: Default warning action (equal to 'default' when string). If False: Suppress all warnings (equal to 'ignore' when string). If str: One of the possible actions in python's warnings environment. Note that changing this parameter will affect the PYTHONWARNINGS environment. Note that ATOM can't manage warnings that go directly from C++ code to the stdout/stderr. 
logger: bool, str, class or None, optional (default=None) If None: Doesn't save a logging file. If bool: True for logging file with default name. False for no logger. If str: Name of the logging file. 'auto' for default name. If class: Python Logger object. Note that warnings will not be saved to the logger in any case. random_state: int or None, optional (default=None) Seed used by the random number generator. If None, the random number generator is the RandomState instance used by numpy.random . Attributes Data attributes The dataset within ATOM's pipeline can be accessed at any time through multiple properties, e.g. calling atom.train will return the training set. The data can also be changed through these properties, e.g. atom.test = atom.test.drop(0) will drop the first row from the test set. This will also update the other data attributes. Attributes: dataset: pd.DataFrame Complete dataset in the pipeline. train: pd.DataFrame Training set. test: pd.DataFrame Test set. X: pd.DataFrame Feature set. y: pd.Series Target column. X_train: pd.DataFrame Training features. y_train: pd.Series Training target. X_test: pd.DataFrame Test features. y_test: pd.Series Test target. shape: tuple Dataset's shape in the form (rows x columns). columns: list List of columns in the dataset. target: str Name of the target column. Utility attributes Attributes: profile: ProfileReport Profile created by pandas-profiling after calling the report method. genetic_features: pd.DataFrame Dataframe of the non-linear features created by the feature_generation method. Columns include: name: Name of the feature (automatically created). description: Operators used to create this feature. fitness: Fitness score. collinear: pd.DataFrame Dataframe of the collinear features removed by the feature_selection method. Columns include: drop_feature: name of the feature dropped by the method. correlated feature: Name of the correlated feature(s).
correlation_value: Pearson correlation coefficient(s) of the feature pairs. models: list List of models in the pipeline. metric: str or list Metric(s) used to fit the models. errors: dict Dictionary of the encountered exceptions (if any). winner: model Model subclass that performed best on the test set. results: pd.DataFrame Dataframe of the training results with the model acronyms as index. Columns can include: name: Name of the model. metric_bo: Best score achieved during the BO. time_bo: Time spent on the BO. metric_train: Metric score on the training set. metric_test: Metric score on the test set. time_fit: Time spent fitting and evaluating. mean_bagging: Mean score of the bagging's results. std_bagging: Standard deviation score of the bagging's results. time_bagging: Time spent on the bagging algorithm. time: Total time spent on the whole run. Plot attributes Attributes: style: str Plotting style. See seaborn's documentation . palette: str Color palette. See seaborn's documentation . title_fontsize: int Fontsize for the plot's title. label_fontsize: int Fontsize for labels and legends. tick_fontsize: int Fontsize for the ticks along the plot's axes. Utility methods The ATOM class contains a variety of methods to help you handle the data and inspect the pipeline. clear Remove a model from the pipeline. log Save information to the logger and print to stdout. report Get an extensive profile analysis of the data. save Save the ATOMRegressor instance to a pickle file. scoring Print the scoring of the models for a specific metric. stats Print out a list of basic statistics on the dataset. method clear (models='all') [source] Removes all traces of a model in the pipeline (except for the errors attribute). If all models in the pipeline are removed, the metric is reset. Use this method to remove unwanted models from the pipeline or to clear memory before saving the instance. 
Parameters: models: str or sequence, optional (default='all') Model(s) to clear from the pipeline. If 'all', clear all models. method log (msg, level=0) [source] Write a message to the logger and print it to stdout. Parameters: msg: str Message to write to the logger and print to stdout. level: int, optional (default=0) Minimum verbosity level in order to print the message. method report (dataset='dataset', n_rows=None, filename=None) [source] Get an extensive profile analysis of the data. The report is rendered in HTML5 and CSS3 and saved to the profile attribute. Note that this method can be slow for n_rows > 10k. Parameters: dataset: str, optional (default='dataset') Name of the data set to get the profile from. n_rows: int or None, optional (default=None) Number of (randomly picked) rows to process. None for all rows. filename: str or None, optional (default=None) Name of the file when saved (as .html). None to not save anything. method save (filename=None, save_data=True) [source] Save the instance to a pickle file. Remember that the class contains the complete dataset as property, so the file can become large for big datasets! To avoid this, use save_data=False . Parameters: filename: str or None, optional (default=None) Name to save the file with. If None or 'auto', use the name of the class. save_data: bool, optional (default=True) Whether to save the data as an attribute of the instance. If False, remember to update the data immediately after loading the pickle using the dataset's @setter . method scoring (metric=None, dataset='test') [source] Print the scoring of the models for a specific metric. If a model shows a XXX , it means the metric failed for that specific model. This can happen if either the metric is unavailable for the task or if the model does not have a predict_proba method while the metric requires it. Parameters: metric: str or None, optional (default=None) Name of the metric to calculate. Choose from any of sklearn's SCORERS .
If None, returns the final results for the model (ignores the dataset parameter). dataset: str, optional (default='test') Data set on which to calculate the metric. Options are 'train' or 'test'. method stats () [source] Print out a list of basic information on the dataset. Data cleaning ATOM provides data cleaning methods to scale your features and handle missing values, categorical columns and outliers. Calling on one of them will automatically apply the method on the dataset in the pipeline. Tip Use the report method to examine the data and help you determine suitable parameters for the data cleaning methods. scale Scale all the features to mean=0 and std=1. impute Handle missing values in the dataset. encode Encode categorical features. outliers Remove or replace outliers in the training set. method scale () [source] Scale the features to mean=0 and std=1. method impute (strat_num='drop', strat_cat='drop', min_frac_rows=0.5, min_frac_cols=0.5, missing=None) [source] Handle missing values according to the selected strategy. Also removes rows and columns with too many missing values. The imputer is fitted only on the training set to avoid data leakage. See Imputer for a description of the parameters. Note that since the Imputer can remove rows from both train and test set, the set's sizes may change to keep ATOM's test_size ratio. method encode (strategy='LeaveOneOut', max_onehot=10, frac_to_other=None) [source] Perform encoding of categorical features. The encoding type depends on the number of unique values in the column: If n_unique=2, use Label-encoding. If 2 < n_unique <= max_onehot, use OneHot-encoding. If n_unique > max_onehot, use strategy -encoding. Also replaces classes with low occurrences with the value 'other' in order to prevent too high cardinality. Categorical features are defined as all columns whose dtype.kind not in 'ifu'. Will raise an error if it encounters missing values or unknown categories when transforming.
The encoder is fitted only on the training set to avoid data leakage. See Encoder for a description of the parameters. method outliers (strategy='drop', max_sigma=3, include_target=False) [source] Remove or replace outliers in the training set. Outliers are defined as values that lie further than max_sigma * standard_deviation away from the mean of the column. Only outliers from the training set are removed to maintain an original sample of target values in the test set. Ignores categorical columns. See Outliers for a description of the parameters. Feature engineering To further pre-process the data you can create new non-linear features transforming the existing ones or, if your dataset is too large, remove features using one of the provided strategies. feature_generation Create new features from combinations of existing ones. feature_selection Remove features according to the selected strategy. method feature_generation (strategy='DFS', n_features=None, generations=20, population=500, operators=None) [source] Use Deep feature Synthesis or a genetic algorithm to create new combinations of existing features to capture the non-linear relations between the original features. See FeatureGenerator for a description of the parameters. Attributes created by the class are attached to the ATOM instance. method feature_selection (strategy=None, solver=None, n_features=None, max_frac_repeated=1., max_correlation=1., **kwargs) [source] Remove features according to the selected strategy. Ties between features with equal scores will be broken in an unspecified way. Also removes features with too low variance and finds pairs of collinear features based on the Pearson correlation coefficient. For each pair above the specified limit (in terms of absolute value), it removes one of the two. See FeatureSelector for a description of the parameters. Plotting methods and attributes created by the class are attached to the instance. 
Note When strategy='univariate' and solver=None, f_regression will be used as default solver. When strategy is one of 'SFM', 'RFE' or 'RFECV' and the solver is one of ATOM's models, the algorithm will automatically select the regressor (no need to add _reg to the solver). When strategy is one of 'SFM', 'RFE' or 'RFECV' and solver=None, ATOM will use the winning model (if it exists) as solver. When strategy='RFECV', ATOM will use the metric in the pipeline (if it exists) as the scoring parameter (only if not specified manually). Training The training methods are where the models are fitted to the data and their performance is evaluated according to the selected metric. ATOMRegressor contains three methods to call the training classes from the ATOM package. All relevant attributes and methods from the training classes are attached to ATOMRegressor for convenience. These include the errors, winner and results attributes, the models , and the prediction and plotting methods. run Fit the models to the data in a direct fashion. successive_halving Fit the models to the data in a successive halving fashion. train_sizing Fit the models to the data in a train sizing fashion. method run (models, metric=None, greater_is_better=True, needs_proba=False, needs_threshold=False, n_calls=10, n_initial_points=5, bo_params={}, bagging=None) [source] Runs a TrainerRegressor instance. Using this class through atom allows subsequent runs with different models without losing previous information. method successive_halving (models, metric=None, greater_is_better=True, needs_proba=False, needs_threshold=False, skip_iter=0, n_calls=0, n_initial_points=5, bo_params={}, bagging=None) [source] Runs a SuccessiveHalvingRegressor instance. method train_sizing (models, metric=None, greater_is_better=True, needs_proba=False, needs_threshold=False, train_sizes=np.linspace(0.2, 1.0, 5), n_calls=0, n_initial_points=5, bo_params={}, bagging=None) [source] Runs a TrainSizingRegressor instance.
Example from sklearn.datasets import load_boston from atom import ATOMRegressor X, y = load_boston(return_X_y=True) # Initialize class atom = ATOMRegressor(X, y, logger='auto', n_jobs=2, verbose=2) # Apply data cleaning methods atom.outliers(strategy='min_max', max_sigma=2, include_target=True) # Fit the models to the data atom.run(models=['OLS', 'BR', 'CatB'], metric='MSE', n_calls=25, n_initial_points=10, bo_params={'cv': 1}, bagging=4) # Analyze the results print(f\"The winning model is: {atom.winner.name}\") print(atom.results) # Make some plots atom.palette = 'Blues' atom.plot_errors(figsize=(9, 6), filename='errors.png') atom.CatB.plot_feature_importance(filename='catboost_feature_importance.png') # Run an extra model atom.run(models='MLP', metric='MSE', n_calls=25, n_initial_points=10, bo_params={'cv': 1}, bagging=4) # Get the predictions for the best model on new data predictions = atom.predict(X_new)","title":"ATOMRegressor"},{"location":"API/ATOM/atomregressor/#atomregressor","text":"class atom.api. ATOMRegressor (X, y=-1, n_rows=1, test_size=0.2, logger=None, n_jobs=1, warnings=True, verbose=0, random_state=None) [source] ATOMRegressor is ATOM's wrapper for regression tasks. Use this class to easily apply all data transformations and model management provided by the package on a given dataset. Note that contrary to scikit-learn's API, the ATOMRegressor object already contains the dataset on which we want to perform the analysis. Calling a method will automatically apply it on the dataset it contains. The class initializer always calls StandardCleaner with default parameters. The following data types can't (yet) be handled properly and are therefore removed: 'datetime64', 'datetime64[ns]', 'timedelta[ns]'. You can predict , plot and call any model from the ATOMRegressor instance. Read more in the user guide . Parameters: X: dict, sequence, np.array or pd.DataFrame Dataset containing the features, with shape=(n_samples, n_features). 
y: int, str, sequence, np.array or pd.Series, optional (default=-1) If int: Position of the target column in X. The default value selects the last column. If string: Name of the target column in X Else: Data target column with shape=(n_samples,) n_rows: int or float, optional (default=1) if <=1: Fraction of the data to use. if >1: Number of rows to use. test_size: float, optional (default=0.2) Split fraction for the training and test set. n_jobs: int, optional (default=1) Number of cores to use for parallel processing. If >0: Number of cores to use. If -1: Use all available cores. If <-1: Use available_cores - 1 + n_jobs. Beware that using multiple processes on the same machine may cause memory issues for large datasets. verbose: int, optional (default=0) Verbosity level of the class. Possible values are: 0 to not print anything. 1 to print basic information. 2 to print detailed information. warnings: bool or str, optional (default=True) If True: Default warning action (equal to 'default' when string). If False: Suppress all warnings (equal to 'ignore' when string). If str: One of the possible actions in python's warnings environment. Note that changing this parameter will affect the PYTHONWARNINGS environment. Note that ATOM can't manage warnings that go directly from C++ code to the stdout/stderr. logger: bool, str, class or None, optional (default=None) If None: Doesn't save a logging file. If bool: True for logging file with default name. False for no logger. If str: Name of the logging file. 'auto' for default name. If class: Python Logger object. Note that warnings will not be saved to the logger in any case. random_state: int or None, optional (default=None) Seed used by the random number generator.
If None, the random number generator is the RandomState instance used by numpy.random .","title":"ATOMRegressor"},{"location":"API/ATOM/atomregressor/#attributes","text":"","title":"Attributes"},{"location":"API/ATOM/atomregressor/#data-attributes","text":"The dataset within ATOM's pipeline can be accessed at any time through multiple properties, e.g. calling atom.train will return the training set. The data can also be changed through these properties, e.g. atom.test = atom.test.drop(0) will drop the first row from the test set. This will also update the other data attributes. Attributes: dataset: pd.DataFrame Complete dataset in the pipeline. train: pd.DataFrame Training set. test: pd.DataFrame Test set. X: pd.DataFrame Feature set. y: pd.Series Target column. X_train: pd.DataFrame Training features. y_train: pd.Series Training target. X_test: pd.DataFrame Test features. y_test: pd.Series Test target. shape: tuple Dataset's shape in the form (rows x columns). columns: list List of columns in the dataset. target: str Name of the target column.","title":"Data attributes"},{"location":"API/ATOM/atomregressor/#utility-attributes","text":"Attributes: profile: ProfileReport Profile created by pandas-profiling after calling the report method. genetic_features: pd.DataFrame Dataframe of the non-linear features created by the feature_generation method. Columns include: name: Name of the feature (automatically created). description: Operators used to create this feature. fitness: Fitness score. collinear: pd.DataFrame Dataframe of the collinear features removed by the feature_selection method. Columns include: drop_feature: name of the feature dropped by the method. correlated feature: Name of the correlated feature(s). correlation_value: Pearson correlation coefficient(s) of the feature pairs. models: list List of models in the pipeline. metric: str or list Metric(s) used to fit the models. errors: dict Dictionary of the encountered exceptions (if any). 
winner: model Model subclass that performed best on the test set. results: pd.DataFrame Dataframe of the training results with the model acronyms as index. Columns can include: name: Name of the model. metric_bo: Best score achieved during the BO. time_bo: Time spent on the BO. metric_train: Metric score on the training set. metric_test: Metric score on the test set. time_fit: Time spent fitting and evaluating. mean_bagging: Mean score of the bagging's results. std_bagging: Standard deviation score of the bagging's results. time_bagging: Time spent on the bagging algorithm. time: Total time spent on the whole run.","title":"Utility attributes"},{"location":"API/ATOM/atomregressor/#plot-attributes","text":"Attributes: style: str Plotting style. See seaborn's documentation . palette: str Color palette. See seaborn's documentation . title_fontsize: int Fontsize for the plot's title. label_fontsize: int Fontsize for labels and legends. tick_fontsize: int Fontsize for the ticks along the plot's axes.","title":"Plot attributes"},{"location":"API/ATOM/atomregressor/#utility-methods","text":"The ATOM class contains a variety of methods to help you handle the data and inspect the pipeline. clear Remove a model from the pipeline. log Save information to the logger and print to stdout. report Get an extensive profile analysis of the data. save Save the ATOMRegressor instance to a pickle file. scoring Print the scoring of the models for a specific metric. stats Print out a list of basic statistics on the dataset. method clear (models='all') [source] Removes all traces of a model in the pipeline (except for the errors attribute). If all models in the pipeline are removed, the metric is reset. Use this method to remove unwanted models from the pipeline or to clear memory before saving the instance. Parameters: models: str or sequence, optional (default='all') Model(s) to clear from the pipeline. If 'all', clear all models. 
method log (msg, level=0) [source] Write a message to the logger and print it to stdout. Parameters: msg: str Message to write to the logger and print to stdout. level: int, optional (default=0) Minimum verbosity level in order to print the message. method report (dataset='dataset', n_rows=None, filename=None) [source] Get an extensive profile analysis of the data. The report is rendered in HTML5 and CSS3 and saved to the profile attribute. Note that this method can be slow for n_rows > 10k. Parameters: dataset: str, optional (default='dataset') Name of the data set to get the profile from. n_rows: int or None, optional (default=None) Number of (randomly picked) rows to process. None for all rows. filename: str or None, optional (default=None) Name of the file when saved (as .html). None to not save anything. method save (filename=None, save_data=True) [source] Save the instance to a pickle file. Remember that the class contains the complete dataset as property, so the file can become large for big datasets! To avoid this, use save_data=False . Parameters: filename: str or None, optional (default=None) Name to save the file with. If None or 'auto', use the name of the class. save_data: bool, optional (default=True) Whether to save the data as an attribute of the instance. If False, remember to update the data immediately after loading the pickle using the dataset's @setter . method scoring (metric=None, dataset='test') [source] Print the scoring of the models for a specific metric. If a model shows a XXX , it means the metric failed for that specific model. This can happen if either the metric is unavailable for the task or if the model does not have a predict_proba method while the metric requires it. Parameters: metric: str or None, optional (default=None) Name of the metric to calculate. Choose from any of sklearn's SCORERS . If None, returns the final results for the model (ignores the dataset parameter).
dataset: str, optional (default='test') Data set on which to calculate the metric. Options are 'train' or 'test'. method stats () [source] Print out a list of basic information on the dataset.","title":"Utility methods"},{"location":"API/ATOM/atomregressor/#data-cleaning","text":"ATOM provides data cleaning methods to scale your features and handle missing values, categorical columns and outliers. Calling on one of them will automatically apply the method on the dataset in the pipeline. Tip Use the report method to examine the data and help you determine suitable parameters for the data cleaning methods. scale Scale all the features to mean=0 and std=1. impute Handle missing values in the dataset. encode Encode categorical features. outliers Remove or replace outliers in the training set. method scale () [source] Scale the features to mean=0 and std=1. method impute (strat_num='drop', strat_cat='drop', min_frac_rows=0.5, min_frac_cols=0.5, missing=None) [source] Handle missing values according to the selected strategy. Also removes rows and columns with too many missing values. The imputer is fitted only on the training set to avoid data leakage. See Imputer for a description of the parameters. Note that since the Imputer can remove rows from both train and test set, the set's sizes may change to keep ATOM's test_size ratio. method encode (strategy='LeaveOneOut', max_onehot=10, frac_to_other=None) [source] Perform encoding of categorical features. The encoding type depends on the number of unique values in the column: If n_unique=2, use Label-encoding. If 2 < n_unique <= max_onehot, use OneHot-encoding. If n_unique > max_onehot, use strategy -encoding. Also replaces classes with low occurrences with the value 'other' in order to prevent too high cardinality. Categorical features are defined as all columns whose dtype.kind not in 'ifu'. Will raise an error if it encounters missing values or unknown categories when transforming.
The encoder is fitted only on the training set to avoid data leakage. See Encoder for a description of the parameters. method outliers (strategy='drop', max_sigma=3, include_target=False) [source] Remove or replace outliers in the training set. Outliers are defined as values that lie further than max_sigma * standard_deviation away from the mean of the column. Only outliers from the training set are removed to maintain an original sample of target values in the test set. Ignores categorical columns. See Outliers for a description of the parameters.","title":"Data cleaning"},{"location":"API/ATOM/atomregressor/#feature-engineering","text":"To further pre-process the data you can create new non-linear features transforming the existing ones or, if your dataset is too large, remove features using one of the provided strategies. feature_generation Create new features from combinations of existing ones. feature_selection Remove features according to the selected strategy. method feature_generation (strategy='DFS', n_features=None, generations=20, population=500, operators=None) [source] Use Deep feature Synthesis or a genetic algorithm to create new combinations of existing features to capture the non-linear relations between the original features. See FeatureGenerator for a description of the parameters. Attributes created by the class are attached to the ATOM instance. method feature_selection (strategy=None, solver=None, n_features=None, max_frac_repeated=1., max_correlation=1., **kwargs) [source] Remove features according to the selected strategy. Ties between features with equal scores will be broken in an unspecified way. Also removes features with too low variance and finds pairs of collinear features based on the Pearson correlation coefficient. For each pair above the specified limit (in terms of absolute value), it removes one of the two. See FeatureSelector for a description of the parameters. 
Plotting methods and attributes created by the class are attached to the instance. Note When strategy='univariate' and solver=None, f_regression will be used as default solver. When strategy is one of 'SFM', 'RFE' or 'RFECV' and the solver is one of ATOM's models, the algorithm will automatically select the regressor (no need to add _reg to the solver). When strategy is one of 'SFM', 'RFE' or 'RFECV' and solver=None, ATOM will use the winning model (if it exists) as solver. When strategy='RFECV', ATOM will use the metric in the pipeline (if it exists) as the scoring parameter (only if not specified manually).","title":"Feature engineering"},{"location":"API/ATOM/atomregressor/#training","text":"The training methods are where the models are fitted to the data and their performance is evaluated according to the selected metric. ATOMRegressor contains three methods to call the training classes from the ATOM package. All relevant attributes and methods from the training classes are attached to ATOMRegressor for convenience. These include the errors, winner and results attributes, the models , and the prediction and plotting methods. run Fit the models to the data in a direct fashion. successive_halving Fit the models to the data in a successive halving fashion. train_sizing Fit the models to the data in a train sizing fashion. method run (models, metric=None, greater_is_better=True, needs_proba=False, needs_threshold=False, n_calls=10, n_initial_points=5, bo_params={}, bagging=None) [source] Runs a TrainerRegressor instance. Using this class through atom allows subsequent runs with different models without losing previous information. method successive_halving (models, metric=None, greater_is_better=True, needs_proba=False, needs_threshold=False, skip_iter=0, n_calls=0, n_initial_points=5, bo_params={}, bagging=None) [source] Runs a SuccessiveHalvingRegressor instance.
method train_sizing (models, metric=None, greater_is_better=True, needs_proba=False, needs_threshold=False, train_sizes=np.linspace(0.2, 1.0, 5), n_calls=0, n_initial_points=5, bo_params={}, bagging=None) [source] Runs a TrainSizingRegressor instance.","title":"Training"},{"location":"API/ATOM/atomregressor/#example","text":"from sklearn.datasets import load_boston from atom import ATOMRegressor X, y = load_boston(return_X_y=True) # Initialize class atom = ATOMRegressor(X, y, logger='auto', n_jobs=2, verbose=2) # Apply data cleaning methods atom.outliers(strategy='min_max', max_sigma=2, include_target=True) # Fit the models to the data atom.run(models=['OLS', 'BR', 'CatB'], metric='MSE', n_calls=25, n_initial_points=10, bo_params={'cv': 1}, bagging=4) # Analyze the results print(f\"The winning model is: {atom.winner.name}\") print(atom.results) # Make some plots atom.palette = 'Blues' atom.plot_errors(figsize=(9, 6), filename='errors.png') atom.CatB.plot_feature_importance(filename='catboost_feature_importance.png') # Run an extra model atom.run(models='MLP', metric='MSE', n_calls=25, n_initial_points=10, bo_params={'cv': 1}, bagging=4) # Get the predictions for the best model on new data predictions = atom.predict(X_new)","title":"Example"},{"location":"API/data_cleaning/balancer/","text":"Balancer class atom.data_cleaning. Balancer (strategy='ADASYN', n_jobs=1, verbose=0, logger=None, random_state=None, **kwargs) [source] Balance the number of rows per target category. Use only for classification tasks. This class can be accessed from atom through the balance method. Read more in the user guide . Parameters: strategy: str, optional (default='ADASYN') Type of algorithm to use for oversampling or undersampling. Choose from one of the estimators available in the imbalanced-learn package. n_jobs: int, optional (default=1) Number of cores to use for parallel processing. If >0: Number of cores to use. If -1: Use all available cores. 
If <-1: Use available_cores - 1 + n_jobs. Beware that using multiple processes on the same machine may cause memory issues for large datasets. verbose: int, optional (default=0) Verbosity level of the class. Possible values are: 0 to not print anything. 1 to print basic information. 2 to print detailed information. logger: bool, str, class or None, optional (default=None) If None: Doesn't save a logging file. If bool: True for logging file with default name. False for no logger. If str: Name of the logging file. 'auto' to create an automatic name. If class: python Logger object. random_state: int or None, optional (default=None) Seed used by the random number generator. If None, the random number generator is the RandomState instance used by numpy.random . **kwargs Additional keyword arguments passed to the strategy estimator. Attributes Attributes: : class Estimator instance (attribute name in all lowercase) used to oversample/undersample the data, e.g. balancer.adasyn for the default option. mapping: dict Dictionary of the target values mapped to their respective encoded integer. Methods fit_transform Same as transform. get_params Get parameters for this estimator. log Write information to the logger and print to stdout. save Save the instance to a pickle file. set_params Set the parameters of this estimator. transform Transform the data. method fit_transform (X, y) [source] Oversample or undersample the data. Parameters: X: dict, sequence, np.array or pd.DataFrame Data containing the features, with shape=(n_samples, n_features). y: int, str, sequence, np.array or pd.Series If int: Index of the target column in X. If string: Name of the target column in X. Else: Target column with shape=(n_samples,). Returns: X: pd.DataFrame Transformed feature set. y: pd.Series Transformed target column. method get_params (deep=True) [source] Get parameters for this estimator.
Parameters: deep: bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators. Returns: params: dict Dictionary of the parameter names mapped to their values. method log (msg, level=0) [source] Write a message to the logger and print it to stdout. Parameters: msg: str Message to write to the logger and print to stdout. level: int, optional (default=0) Minimum verbosity level in order to print the message. method save (filename=None) [source] Save the instance to a pickle file. Parameters: filename: str or None, optional (default=None) Name to save the file with. None to save with default name. method set_params (**params) [source] Set the parameters of this estimator. Parameters: **params: dict Estimator parameters. Returns: self: Balancer Estimator instance. method transform (X, y) [source] Oversample or undersample the data. Parameters: X: dict, sequence, np.array or pd.DataFrame Data containing the features, with shape=(n_samples, n_features). y: int, str, sequence, np.array or pd.Series If int: Index of the target column in X. If string: Name of the target column in X. Else: Target column with shape=(n_samples,). Returns: X: pd.DataFrame Transformed feature set. y: pd.Series Transformed target column. Example from atom import ATOMClassifier atom = ATOMClassifier(X, y) atom.balance(strategy='NearMiss', sampling_strategy=0.7, n_neighbors=10) or from atom.data_cleaning import Balancer balancer = Balancer(strategy='NearMiss', sampling_strategy=0.7, n_neighbors=10) X_train, y_train = balancer.transform(X_train, y_train)","title":"Balancer"},{"location":"API/data_cleaning/balancer/#balancer","text":"class atom.data_cleaning. Balancer (strategy='ADASYN', n_jobs=1, verbose=0, logger=None, random_state=None, **kwargs) [source] Balance the number of rows per target category. Use only for classification tasks. This class can be accessed from atom through the balance method. Read more in the user guide .
Parameters: strategy: str, optional (default='ADASYN') Type of algorithm to use for oversampling or undersampling. Choose from one of the estimators available in the imbalanced-learn package. n_jobs: int, optional (default=1) Number of cores to use for parallel processing. If >0: Number of cores to use. If -1: Use all available cores. If <-1: Use available_cores - 1 + n_jobs. Beware that using multiple processes on the same machine may cause memory issues for large datasets. verbose: int, optional (default=0) Verbosity level of the class. Possible values are: 0 to not print anything. 1 to print basic information. 2 to print detailed information. logger: bool, str, class or None, optional (default=None) If None: Doesn't save a logging file. If bool: True for logging file with default name. False for no logger. If str: Name of the logging file. 'auto' to create an automatic name. If class: python Logger object. random_state: int or None, optional (default=None) Seed used by the random number generator. If None, the random number generator is the RandomState instance used by numpy.random . **kwargs Additional keyword arguments passed to the strategy estimator.","title":"Balancer"},{"location":"API/data_cleaning/balancer/#attributes","text":"Attributes: : class Estimator instance (attribute name in all lowercase) used to oversample/undersample the data, e.g. balancer.adasyn for the default option. mapping: dict Dictionary of the target values mapped to their respective encoded integer.","title":"Attributes"},{"location":"API/data_cleaning/balancer/#methods","text":"fit_transform Same as transform. get_params Get parameters for this estimator. log Write information to the logger and print to stdout. save Save the instance to a pickle file. set_params Set the parameters of this estimator. transform Transform the data. method fit_transform (X, y) [source] Oversample or undersample the data. 
Parameters: X: dict, sequence, np.array or pd.DataFrame Data containing the features, with shape=(n_samples, n_features). y: int, str, sequence, np.array or pd.Series If int: Index of the target column in X. If string: Name of the target column in X. Else: Target column with shape=(n_samples,). Returns: X: pd.DataFrame Transformed feature set. y: pd.Series Transformed target column. method get_params (deep=True) [source] Get parameters for this estimator. Parameters: deep: bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators. Returns: params: dict Dictionary of the parameter names mapped to their values. method log (msg, level=0) [source] Write a message to the logger and print it to stdout. Parameters: msg: str Message to write to the logger and print to stdout. level: int, optional (default=0) Minimum verbosity level in order to print the message. method save (filename=None) [source] Save the instance to a pickle file. Parameters: filename: str or None, optional (default=None) Name to save the file with. None to save with default name. method set_params (**params) [source] Set the parameters of this estimator. Parameters: **params: dict Estimator parameters. Returns: self: Balancer Estimator instance. method transform (X, y) [source] Oversample or undersample the data. Parameters: X: dict, sequence, np.array or pd.DataFrame Data containing the features, with shape=(n_samples, n_features). y: int, str, sequence, np.array or pd.Series If int: Index of the target column in X. If string: Name of the target column in X. Else: Target column with shape=(n_samples,). Returns: X: pd.DataFrame Transformed feature set.
y: pd.Series Transformed target column.","title":"Methods"},{"location":"API/data_cleaning/balancer/#example","text":"from atom import ATOMClassifier atom = ATOMClassifier(X, y) atom.balance(strategy='NearMiss', sampling_strategy=0.7, n_neighbors=10) or from atom.data_cleaning import Balancer balancer = Balancer(strategy='NearMiss', sampling_strategy=0.7, n_neighbors=10) X_train, y_train = balancer.transform(X_train, y_train)","title":"Example"},{"location":"API/data_cleaning/encoder/","text":"Encoder class atom.data_cleaning. Encoder (strategy='LeaveOneOut', max_onehot=10, frac_to_other=None, verbose=0, logger=None, **kwargs) [source] Perform encoding of categorical features. The encoding type depends on the number of unique values in the column: If n_unique=2, use Label-encoding. If 2 < n_unique <= max_onehot, use OneHot-encoding. If n_unique > max_onehot, use strategy -encoding. Also replaces classes with low occurrences with the value other in order to prevent too high cardinality. Categorical features are defined as all columns whose dtype.kind not in ifu . Will raise an error if it encounters missing values or unknown categories when transforming. This class can be accessed from atom through the encode method. Read more in the user guide . Parameters: strategy: str, optional (default='LeaveOneOut') Type of encoding to use for high cardinality features. Choose from one of the estimators available in the category-encoders package except for: OneHotEncoder: Use the max_onehot parameter. HashingEncoder: Incompatibility of APIs. max_onehot: int or None, optional (default=10) Maximum number of unique values in a feature to perform one-hot-encoding. If None, it will always use strategy when n_unique > 2. frac_to_other: float, optional (default=None) Categories with fewer occurrences than n_rows * frac_to_other are replaced with the string other . If None, skip this step. verbose: int, optional (default=0) Verbosity level of the class.
Possible values are: 0 to not print anything. 1 to print basic information. 2 to print detailed information. logger: bool, str, class or None, optional (default=None) If None: Doesn't save a logging file. If bool: True for logging file with default name. False for no logger. If str: Name of the logging file. 'auto' to create an automatic name. If class: python Logger object. **kwargs Additional keyword arguments passed to the strategy estimator. Tip Use atom 's categorical attribute for a list of the categorical columns in the dataset. Methods fit Fit the class. fit_transform Fit the class and return the transformed data. get_params Get parameters for this estimator. log Write information to the logger and print to stdout. save Save the instance to a pickle file. set_params Set the parameters of this estimator. transform Transform the data. method fit (X, y) [source] Fit the class. Parameters: X: dict, sequence, np.array or pd.DataFrame Data containing the features, with shape=(n_samples, n_features). y: int, str, sequence, np.array or pd.Series If int: Index of the target column in X. If string: Name of the target column in X. Else: Target column with shape=(n_samples,). Returns: self: Encoder Fitted instance of self. method fit_transform (X, y) [source] Fit the Encoder and return the encoded data. Parameters: X: dict, sequence, np.array or pd.DataFrame Data containing the features, with shape=(n_samples, n_features). y: int, str, sequence, np.array, pd.Series If int: Index of the target column in X. If string: Name of the target column in X. Else: Target column with shape=(n_samples,). Returns: X: pd.DataFrame Transformed feature set. method get_params (deep=True) [source] Get parameters for this estimator. Parameters: deep: bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators. Returns: params: dict Dictionary of the parameter names mapped to their values. 
method log (msg, level=0) [source] Write a message to the logger and print it to stdout. Parameters: msg: str Message to write to the logger and print to stdout. level: int, optional (default=0) Minimum verbosity level in order to print the message. method save (filename=None) [source] Save the instance to a pickle file. Parameters: filename: str or None, optional (default=None) Name to save the file with. None to save with default name. method set_params (**params) [source] Set the parameters of this estimator. Parameters: **params: dict Estimator parameters. Returns: self: Encoder Estimator instance. method transform (X, y=None) [source] Encode the data. Parameters: X: dict, sequence, np.array or pd.DataFrame Data containing the features, with shape=(n_samples, n_features). y: int, str, sequence, np.array, pd.Series or None, optional (default=None) Does nothing. Implemented for continuity of the API. Returns: X: pd.DataFrame Transformed feature set. Example from atom import ATOMClassifier atom = ATOMClassifier(X, y) atom.encode(strategy='CatBoost', max_onehot=5) or from atom.data_cleaning import Encoder encoder = Encoder(strategy='CatBoost', max_onehot=5) encoder.fit(X_train, y_train) X = encoder.transform(X)","title":"Encoder"},{"location":"API/data_cleaning/encoder/#encoder","text":"class atom.data_cleaning. Encoder (strategy='LeaveOneOut', max_onehot=10, frac_to_other=None, verbose=0, logger=None, **kwargs) [source] Perform encoding of categorical features. The encoding type depends on the number of unique values in the column: If n_unique=2, use Label-encoding. If 2 < n_unique <= max_onehot, use OneHot-encoding. If n_unique > max_onehot, use strategy -encoding. Also replaces classes with low occurrences with the value other in order to prevent too high cardinality. Categorical features are defined as all columns whose dtype.kind not in ifu . Will raise an error if it encounters missing values or unknown categories when transforming. 
This class can be accessed from atom through the encode method. Read more in the user guide . Parameters: strategy: str, optional (default='LeaveOneOut') Type of encoding to use for high cardinality features. Choose from one of the estimators available in the category-encoders package except for: OneHotEncoder: Use the max_onehot parameter. HashingEncoder: Incompatibility of APIs. max_onehot: int or None, optional (default=10) Maximum number of unique values in a feature to perform one-hot-encoding. If None, it will always use strategy when n_unique > 2. frac_to_other: float, optional (default=None) Categories with fewer occurrences than n_rows * frac_to_other are replaced with the string other . If None, skip this step. verbose: int, optional (default=0) Verbosity level of the class. Possible values are: 0 to not print anything. 1 to print basic information. 2 to print detailed information. logger: bool, str, class or None, optional (default=None) If None: Doesn't save a logging file. If bool: True for logging file with default name. False for no logger. If str: Name of the logging file. 'auto' to create an automatic name. If class: python Logger object. **kwargs Additional keyword arguments passed to the strategy estimator. Tip Use atom 's categorical attribute for a list of the categorical columns in the dataset.","title":"Encoder"},{"location":"API/data_cleaning/encoder/#methods","text":"fit Fit the class. fit_transform Fit the class and return the transformed data. get_params Get parameters for this estimator. log Write information to the logger and print to stdout. save Save the instance to a pickle file. set_params Set the parameters of this estimator. transform Transform the data. method fit (X, y) [source] Fit the class. Parameters: X: dict, sequence, np.array or pd.DataFrame Data containing the features, with shape=(n_samples, n_features). y: int, str, sequence, np.array or pd.Series If int: Index of the target column in X.
If string: Name of the target column in X. Else: Target column with shape=(n_samples,). Returns: self: Encoder Fitted instance of self. method fit_transform (X, y) [source] Fit the Encoder and return the encoded data. Parameters: X: dict, sequence, np.array or pd.DataFrame Data containing the features, with shape=(n_samples, n_features). y: int, str, sequence, np.array, pd.Series If int: Index of the target column in X. If string: Name of the target column in X. Else: Target column with shape=(n_samples,). Returns: X: pd.DataFrame Transformed feature set. method get_params (deep=True) [source] Get parameters for this estimator. Parameters: deep: bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators. Returns: params: dict Dictionary of the parameter names mapped to their values. method log (msg, level=0) [source] Write a message to the logger and print it to stdout. Parameters: msg: str Message to write to the logger and print to stdout. level: int, optional (default=0) Minimum verbosity level in order to print the message. method save (filename=None) [source] Save the instance to a pickle file. Parameters: filename: str or None, optional (default=None) Name to save the file with. None to save with default name. method set_params (**params) [source] Set the parameters of this estimator. Parameters: **params: dict Estimator parameters. Returns: self: Encoder Estimator instance. method transform (X, y=None) [source] Encode the data. Parameters: X: dict, sequence, np.array or pd.DataFrame Data containing the features, with shape=(n_samples, n_features). y: int, str, sequence, np.array, pd.Series or None, optional (default=None) Does nothing. Implemented for continuity of the API. 
Returns: X: pd.DataFrame Transformed feature set.","title":"Methods"},{"location":"API/data_cleaning/encoder/#example","text":"from atom import ATOMClassifier atom = ATOMClassifier(X, y) atom.encode(strategy='CatBoost', max_onehot=5) or from atom.data_cleaning import Encoder encoder = Encoder(strategy='CatBoost', max_onehot=5) encoder.fit(X_train, y_train) X = encoder.transform(X)","title":"Example"},{"location":"API/data_cleaning/imputer/","text":"Imputer class atom.data_cleaning. Imputer (strat_num='drop', strat_cat='drop', min_frac_rows=0.5, min_frac_cols=0.5, missing=None, verbose=0, logger=None) [source] Impute or remove missing values according to the selected strategy. Also removes rows and columns with too many missing values. This class can be accessed from atom through the impute method. Read more in the user guide . Parameters: strat_num: str, int or float, optional (default='drop') Imputing strategy for numerical columns. Choose from: 'drop': Drop rows containing missing values. 'mean': Impute with mean of column. 'median': Impute with median of column. 'knn': Impute using a K-Nearest Neighbors approach. 'most_frequent': Impute with most frequent value. int or float: Impute with provided numerical value. strat_cat: str, optional (default='drop') Imputing strategy for categorical columns. Choose from: 'drop': Drop rows containing missing values. 'most_frequent': Impute with most frequent value. str: Impute with provided string. min_frac_rows: float, optional (default=0.5) Minimum fraction of non-missing values in a row. If less, the row is removed. min_frac_cols: float, optional (default=0.5) Minimum fraction of non-missing values in a column. If less, the column is removed. missing: int, float or list, optional (default=None) List of values to treat as 'missing'. None to use the default values: [None, np.NaN, np.inf, -np.inf, '', '?', 'NA', 'nan', 'None', 'inf']. 
Note that np.NaN , None , np.inf and -np.inf will always be imputed since they are incompatible with most estimators. verbose: int, optional (default=0) Verbosity level of the class. Possible values are: 0 to not print anything. 1 to print basic information. 2 to print detailed information. logger: bool, str, class or None, optional (default=None) If None: Doesn't save a logging file. If bool: True for logging file with default name. False for no logger. If str: Name of the logging file. 'auto' to create an automatic name. If class: python Logger object. Tip Use atom 's missing attribute for an overview of the missing values in the dataset. Methods fit Fit the class. fit_transform Fit the class and return the transformed data. get_params Get parameters for this estimator. log Write information to the logger and print to stdout. save Save the instance to a pickle file. set_params Set the parameters of this estimator. transform Transform the data. method fit (X, y=None) [source] Fit the class. Parameters: X: dict, sequence, np.array or pd.DataFrame Data containing the features, with shape=(n_samples, n_features). y: int, str, sequence, np.array, pd.Series or None, optional (default=None) Does nothing. Implemented for continuity of the API. Returns: self: Imputer Fitted instance of self. method fit_transform (X, y=None) [source] Fit the Imputer and return the imputed data. Warning Leaving y=None can lead to inconsistencies in data length between X and y if rows are dropped during the transformation. Parameters: X: dict, sequence, np.array or pd.DataFrame Data containing the features, with shape=(n_samples, n_features). y: int, str, sequence, np.array or pd.Series If None: y is ignored in the transformation. If int: Index of the target column in X. If string: Name of the target column in X. Else: Target column with shape=(n_samples,). Returns: X: pd.DataFrame Transformed feature set. y: pd.Series Transformed target column. Only returned if provided. 
method get_params (deep=True) [source] Get parameters for this estimator. Parameters: deep: bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators. Returns: params: dict Dictionary of the parameter names mapped to their values. method log (msg, level=0) [source] Write a message to the logger and print it to stdout. Parameters: msg: str Message to write to the logger and print to stdout. level: int, optional (default=0) Minimum verbosity level in order to print the message. method save (filename=None) [source] Save the instance to a pickle file. Parameters: filename: str or None, optional (default=None) Name to save the file with. None to save with default name. method set_params (**params) [source] Set the parameters of this estimator. Parameters: **params: dict Estimator parameters. Returns: self: Imputer Estimator instance. method transform (X, y=None) [source] Impute the data. Warning Leaving y=None can lead to inconsistencies in data length between X and y if rows are dropped during the transformation. Parameters: X: dict, sequence, np.array or pd.DataFrame Data containing the features, with shape=(n_samples, n_features). y: int, str, sequence, np.array or pd.Series If None: y is ignored in the transformation. If int: Index of the target column in X. If string: Name of the target column in X. Else: Target column with shape=(n_samples,). Returns: X: pd.DataFrame Transformed feature set. y: pd.Series Transformed target column. Only returned if provided. Example from atom import ATOMClassifier atom = ATOMClassifier(X, y) atom.impute(strat_num='knn', strat_cat='drop', min_frac_cols=0.8) or from atom.data_cleaning import Imputer imputer = Imputer(strat_num='knn', strat_cat='drop', min_frac_cols=0.8) imputer.fit(X_train, y_train) X = imputer.transform(X)","title":"Imputer"},{"location":"API/data_cleaning/imputer/#imputer","text":"class atom.data_cleaning.
Imputer (strat_num='drop', strat_cat='drop', min_frac_rows=0.5, min_frac_cols=0.5, missing=None, verbose=0, logger=None) [source] Impute or remove missing values according to the selected strategy. Also removes rows and columns with too many missing values. This class can be accessed from atom through the impute method. Read more in the user guide . Parameters: strat_num: str, int or float, optional (default='drop') Imputing strategy for numerical columns. Choose from: 'drop': Drop rows containing missing values. 'mean': Impute with mean of column. 'median': Impute with median of column. 'knn': Impute using a K-Nearest Neighbors approach. 'most_frequent': Impute with most frequent value. int or float: Impute with provided numerical value. strat_cat: str, optional (default='drop') Imputing strategy for categorical columns. Choose from: 'drop': Drop rows containing missing values. 'most_frequent': Impute with most frequent value. str: Impute with provided string. min_frac_rows: float, optional (default=0.5) Minimum fraction of non-missing values in a row. If less, the row is removed. min_frac_cols: float, optional (default=0.5) Minimum fraction of non-missing values in a column. If less, the column is removed. missing: int, float or list, optional (default=None) List of values to treat as 'missing'. None to use the default values: [None, np.NaN, np.inf, -np.inf, '', '?', 'NA', 'nan', 'None', 'inf']. Note that np.NaN , None , np.inf and -np.inf will always be imputed since they are incompatible with most estimators. verbose: int, optional (default=0) Verbosity level of the class. Possible values are: 0 to not print anything. 1 to print basic information. 2 to print detailed information. logger: bool, str, class or None, optional (default=None) If None: Doesn't save a logging file. If bool: True for logging file with default name. False for no logger. If str: Name of the logging file. 'auto' to create an automatic name. If class: python Logger object. 
Tip Use atom 's missing attribute for an overview of the missing values in the dataset.","title":"Imputer"},{"location":"API/data_cleaning/imputer/#methods","text":"fit Fit the class. fit_transform Fit the class and return the transformed data. get_params Get parameters for this estimator. log Write information to the logger and print to stdout. save Save the instance to a pickle file. set_params Set the parameters of this estimator. transform Transform the data. method fit (X, y=None) [source] Fit the class. Parameters: X: dict, sequence, np.array or pd.DataFrame Data containing the features, with shape=(n_samples, n_features). y: int, str, sequence, np.array, pd.Series or None, optional (default=None) Does nothing. Implemented for continuity of the API. Returns: self: Imputer Fitted instance of self. method fit_transform (X, y=None) [source] Fit the Imputer and return the imputed data. Warning Leaving y=None can lead to inconsistencies in data length between X and y if rows are dropped during the transformation. Parameters: X: dict, sequence, np.array or pd.DataFrame Data containing the features, with shape=(n_samples, n_features). y: int, str, sequence, np.array or pd.Series If None: y is ignored in the transformation. If int: Index of the target column in X. If string: Name of the target column in X. Else: Target column with shape=(n_samples,). Returns: X: pd.DataFrame Transformed feature set. y: pd.Series Transformed target column. Only returned if provided. method get_params (deep=True) [source] Get parameters for this estimator. Parameters: deep: bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators. Returns: params: dict Dictionary of the parameter names mapped to their values. method log (msg, level=0) [source] Write a message to the logger and print it to stdout. Parameters: msg: str Message to write to the logger and print to stdout. 
level: int, optional (default=0) Minimum verbosity level in order to print the message. method save (filename=None) [source] Save the instance to a pickle file. Parameters: filename: str or None, optional (default=None) Name to save the file with. None to save with default name. method set_params (**params) [source] Set the parameters of this estimator. Parameters: **params: dict Estimator parameters. Returns: self: imputer Estimator instance. method transform (X, y=None) [source] Impute the data. Warning Leaving y=None can lead to inconsistencies in data length between X and y if rows are dropped during the transformation. Parameters: X: dict, sequence, np.array or pd.DataFrame Data containing the features, with shape=(n_samples, n_features). y: int, str, sequence, np.array or pd.Series If None: y is ignored in the transformation. If int: Index of the target column in X. If string: Name of the target column in X. Else: Target column with shape=(n_samples,) Returns: X: pd.DataFrame Transformed feature set. y: pd.Series Transformed target column. Only returned if provided.","title":"Methods"},{"location":"API/data_cleaning/imputer/#example","text":"from atom import ATOMClassifier atom = ATOMClassifier(X, y) atom.impute(strat_num='knn', strat_cat='drop', min_frac_cols=0.8) or from atom.data_cleaning import Imputer imputer = Imputer(strat_num='knn', strat_cat='drop', min_frac_cols=0.8) imputer.fit(X_train, y_train) X = imputer.transform(X)","title":"Example"},{"location":"API/data_cleaning/outliers/","text":"Outliers class atom.data_cleaning. Outliers (strategy='drop', max_sigma=3, include_target=False, verbose=0, logger=None) [source] Remove or replace outliers in the data. Outliers are defined as values that lie further than max_sigma * standard_deviation away from the mean of the column. Ignores categorical columns. This class can be accessed from atom through the outliers method. Read more in the user guide . 
Parameters: strategy: int, float or str, optional (default='drop') Strategy to apply on the outliers. Choose from: 'drop': Drop any row with outliers. 'min_max': Replace the outlier with the min or max of the column. Any numerical value with which to replace the outliers. max_sigma: int or float, optional (default=3) Maximum allowed standard deviations from the mean of the column. If more, it is considered an outlier. include_target: bool, optional (default=False) Whether to include the target column in the transformation. This can be useful for regression tasks. verbose: int, optional (default=0) Verbosity level of the class. Possible values are: 0 to not print anything. 1 to print basic information. 2 to print detailed information. logger: bool, str, class or None, optional (default=None) If None: Doesn't save a logging file. If bool: True for logging file with default name. False for no logger. If str: Name of the logging file. 'auto' to create an automatic name. If class: python Logger object. Methods fit_transform Same as transform. get_params Get parameters for this estimator. log Write information to the logger and print to stdout. save Save the instance to a pickle file. set_params Set the parameters of this estimator. transform Transform the data. method fit_transform (X, y=None) [source] Apply the outlier strategy on the data. Parameters: X: dict, sequence, np.array or pd.DataFrame Data containing the features, with shape=(n_samples, n_features). y: int, str, sequence, np.array, pd.Series or None, optional (default=None) If None: y is ignored in the transformation. If int: Index of the target column in X. If string: Name of the target column in X. Else: Target column with shape=(n_samples,). Returns: X: pd.DataFrame Transformed feature set. X: pd.Series Transformed target column. Only returned if provided. method get_params (deep=True) [source] Get parameters for this estimator. 
Parameters: deep: bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators. Returns: params: dict Dictionary of the parameter names mapped to their values. method log (msg, level=0) [source] Write a message to the logger and print it to stdout. Parameters: msg: str Message to write to the logger and print to stdout. level: int, optional (default=0) Minimum verbosity level in order to print the message. method save (filename=None) [source] Save the instance to a pickle file. Parameters: filename: str or None, optional (default=None) Name to save the file with. None to save with default name. method set_params (**params) [source] Set the parameters of this estimator. Parameters: **params: dict Estimator parameters. Returns: self: Outliers Estimator instance. method transform (X, y=None) [source] Apply the outlier strategy on the data. Parameters: X: dict, sequence, np.array or pd.DataFrame Data containing the features, with shape=(n_samples, n_features). y: int, str, sequence, np.array, pd.Series or None, optional (default=None) If None: y is ignored in the transformation. If int: Index of the target column in X. If string: Name of the target column in X. Else: Target column with shape=(n_samples,). Returns: X: pd.DataFrame Transformed feature set. X: pd.Series Transformed target column. Only returned if provided. Example from atom import ATOMRegressor atom = ATOMRegressor(X, y) atom.outliers(strategy='min_max', max_sigma=2, include_target=True) or from atom.data_cleaning import Outliers outliers = Outliers(strategy='min_max', max_sigma=2, include_target=True) X_train, y_train = outliers.transform(X_train, y_train)","title":"Outliers"},{"location":"API/data_cleaning/outliers/#outliers","text":"class atom.data_cleaning. Outliers (strategy='drop', max_sigma=3, include_target=False, verbose=0, logger=None) [source] Remove or replace outliers in the data. 
Outliers are defined as values that lie further than max_sigma * standard_deviation away from the mean of the column. Ignores categorical columns. This class can be accessed from atom through the outliers method. Read more in the user guide . Parameters: strategy: int, float or str, optional (default='drop') Strategy to apply on the outliers. Choose from: 'drop': Drop any row with outliers. 'min_max': Replace the outlier with the min or max of the column. Any numerical value with which to replace the outliers. max_sigma: int or float, optional (default=3) Maximum allowed standard deviations from the mean of the column. If more, it is considered an outlier. include_target: bool, optional (default=False) Whether to include the target column in the transformation. This can be useful for regression tasks. verbose: int, optional (default=0) Verbosity level of the class. Possible values are: 0 to not print anything. 1 to print basic information. 2 to print detailed information. logger: bool, str, class or None, optional (default=None) If None: Doesn't save a logging file. If bool: True for logging file with default name. False for no logger. If str: Name of the logging file. 'auto' to create an automatic name. If class: python Logger object.","title":"Outliers"},{"location":"API/data_cleaning/outliers/#methods","text":"fit_transform Same as transform. get_params Get parameters for this estimator. log Write information to the logger and print to stdout. save Save the instance to a pickle file. set_params Set the parameters of this estimator. transform Transform the data. method fit_transform (X, y=None) [source] Apply the outlier strategy on the data. Parameters: X: dict, sequence, np.array or pd.DataFrame Data containing the features, with shape=(n_samples, n_features). y: int, str, sequence, np.array, pd.Series or None, optional (default=None) If None: y is ignored in the transformation. If int: Index of the target column in X. If string: Name of the target column in X. 
Else: Target column with shape=(n_samples,). Returns: X: pd.DataFrame Transformed feature set. X: pd.Series Transformed target column. Only returned if provided. method get_params (deep=True) [source] Get parameters for this estimator. Parameters: deep: bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators. Returns: params: dict Dictionary of the parameter names mapped to their values. method log (msg, level=0) [source] Write a message to the logger and print it to stdout. Parameters: msg: str Message to write to the logger and print to stdout. level: int, optional (default=0) Minimum verbosity level in order to print the message. method save (filename=None) [source] Save the instance to a pickle file. Parameters: filename: str or None, optional (default=None) Name to save the file with. None to save with default name. method set_params (**params) [source] Set the parameters of this estimator. Parameters: **params: dict Estimator parameters. Returns: self: Outliers Estimator instance. method transform (X, y=None) [source] Apply the outlier strategy on the data. Parameters: X: dict, sequence, np.array or pd.DataFrame Data containing the features, with shape=(n_samples, n_features). y: int, str, sequence, np.array, pd.Series or None, optional (default=None) If None: y is ignored in the transformation. If int: Index of the target column in X. If string: Name of the target column in X. Else: Target column with shape=(n_samples,). Returns: X: pd.DataFrame Transformed feature set. X: pd.Series Transformed target column. 
Only returned if provided.","title":"Methods"},{"location":"API/data_cleaning/outliers/#example","text":"from atom import ATOMRegressor atom = ATOMRegressor(X, y) atom.outliers(strategy='min_max', max_sigma=2, include_target=True) or from atom.data_cleaning import Outliers outliers = Outliers(strategy='min_max', max_sigma=2, include_target=True) X_train, y_train = outliers.transform(X_train, y_train)","title":"Example"},{"location":"API/data_cleaning/scaler/","text":"Scaler class atom.data_cleaning. Scaler (verbose=0, logger=None) [source] Scales data to mean=0 and std=1. This method is equal to sklearn's StandardScaler except that it returns a dataframe when provided. This class can be accessed from atom through the scale method. Read more in the user guide . Parameters: verbose: int, optional (default=0) Verbosity level of the class. Possible values are: 0 to not print anything. 1 to print basic information. 2 to print detailed information. logger: bool, str, class or None, optional (default=None) If None: Doesn't save a logging file. If bool: True for logging file with default name. False for no logger. If str: Name of the logging file. 'auto' to create an automatic name. If class: python Logger object. Methods fit Fit the class. fit_transform Fit the class and return the transformed data. get_params Get parameters for this estimator. log Write information to the logger and print to stdout. save Save the instance to a pickle file. set_params Set the parameters of this estimator. transform Transform the data. method fit (X, y=None) [source] Fit the class. Parameters: X: dict, sequence, np.array or pd.DataFrame Data containing the features, with shape=(n_samples, n_features). y: int, str, sequence, np.array or pd.Series, optional (default=None) Does nothing. Implemented for continuity of the API. Returns: self: Scaler Fitted instance of self. method fit_transform (X, y=None) [source] Fit the Scaler and return the scaled data. 
Parameters: X: dict, sequence, np.array or pd.DataFrame Data containing the features, with shape=(n_samples, n_features). y: int, str, sequence, np.array or pd.Series, optional (default=None) Does nothing. Implemented for continuity of the API. Returns: X: pd.DataFrame Scaled feature set. method get_params (deep=True) [source] Get parameters for this estimator. Parameters: deep: bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators. Returns: params: dict Dictionary of the parameter names mapped to their values. method log (msg, level=0) [source] Write a message to the logger and print it to stdout. Parameters: msg: str Message to write to the logger and print to stdout. level: int, optional (default=0) Minimum verbosity level in order to print the message. method save (filename=None) [source] Save the instance to a pickle file. Parameters: filename: str or None, optional (default=None) Name to save the file with. None to save with default name. method set_params (**params) [source] Set the parameters of this estimator. Parameters: **params: dict Estimator parameters. Returns: self: Scaler Estimator instance. method transform (X, y=None) [source] Scale the data. Parameters: X: dict, sequence, np.array or pd.DataFrame Data containing the features, with shape=(n_samples, n_features). y: int, str, sequence, np.array or pd.Series, optional (default=None) Does nothing. Implemented for continuity of the API. Returns: X: pd.DataFrame Scaled feature set. Example from atom import ATOMRegressor atom = ATOMRegressor(X, y) atom.scale() or from atom.data_cleaning import Scaler scaler = Scaler() scaler.fit(X_train) X = scaler.transform(X)","title":"Scaler"},{"location":"API/data_cleaning/scaler/#scaler","text":"class atom.data_cleaning. Scaler (verbose=0, logger=None) [source] Scales data to mean=0 and std=1. This method is equal to sklearn's StandardScaler except that it returns a dataframe when provided. 
This class can be accessed from atom through the scale method. Read more in the user guide . Parameters: verbose: int, optional (default=0) Verbosity level of the class. Possible values are: 0 to not print anything. 1 to print basic information. 2 to print detailed information. logger: bool, str, class or None, optional (default=None) If None: Doesn't save a logging file. If bool: True for logging file with default name. False for no logger. If str: Name of the logging file. 'auto' to create an automatic name. If class: python Logger object.","title":"Scaler"},{"location":"API/data_cleaning/scaler/#methods","text":"fit Fit the class. fit_transform Fit the class and return the transformed data. get_params Get parameters for this estimator. log Write information to the logger and print to stdout. save Save the instance to a pickle file. set_params Set the parameters of this estimator. transform Transform the data. method fit (X, y=None) [source] Fit the class. Parameters: X: dict, sequence, np.array or pd.DataFrame Data containing the features, with shape=(n_samples, n_features). y: int, str, sequence, np.array or pd.Series, optional (default=None) Does nothing. Implemented for continuity of the API. Returns: self: Scaler Fitted instance of self. method fit_transform (X, y=None) [source] Fit the Scaler and return the scaled data. Parameters: X: dict, sequence, np.array or pd.DataFrame Data containing the features, with shape=(n_samples, n_features). y: int, str, sequence, np.array or pd.Series, optional (default=None) Does nothing. Implemented for continuity of the API. Returns: X: pd.DataFrame Scaled feature set. method get_params (deep=True) [source] Get parameters for this estimator. Parameters: deep: bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators. Returns: params: dict Dictionary of the parameter names mapped to their values. 
method log (msg, level=0) [source] Write a message to the logger and print it to stdout. Parameters: msg: str Message to write to the logger and print to stdout. level: int, optional (default=0) Minimum verbosity level in order to print the message. method save (filename=None) [source] Save the instance to a pickle file. Parameters: filename: str or None, optional (default=None) Name to save the file with. None to save with default name. method set_params (**params) [source] Set the parameters of this estimator. Parameters: **params: dict Estimator parameters. Returns: self: Scaler Estimator instance. method transform (X, y=None) [source] Scale the data. Parameters: X: dict, sequence, np.array or pd.DataFrame Data containing the features, with shape=(n_samples, n_features). y: int, str, sequence, np.array or pd.Series, optional (default=None) Does nothing. Implemented for continuity of the API. Returns: X: pd.DataFrame Scaled feature set.","title":"Methods"},{"location":"API/data_cleaning/scaler/#example","text":"from atom import ATOMRegressor atom = ATOMRegressor(X, y) atom.scale() or from atom.data_cleaning import Scaler scaler = Scaler() scaler.fit(X_train) X = scaler.transform(X)","title":"Example"},{"location":"API/data_cleaning/standard_cleaner/","text":"StandardCleaner class atom.data_cleaning. StandardCleaner (prohibited_types=[], strip_categorical=True, maximum_cardinality=True, minimum_cardinality=True, missing_target=True, map_target=None, verbose=0, logger=None) [source] Performs standard data cleaning steps on a dataset. Use the parameters to choose which transformations to perform. The available steps are: Remove columns with prohibited data types. Strip categorical features from white spaces. Remove categorical columns with maximal cardinality. Remove columns with minimum cardinality. Remove rows with missing values in the target column. Label-encode the target column. This class is automatically called when initializing atom . 
Read more in the user guide . Parameters: prohibited_types: str or sequence, optional (default=[]) Columns with any of these types will be removed from the dataset. strip_categorical: bool, optional (default=True) Whether to strip the spaces from values in the categorical columns. maximum_cardinality: bool, optional (default=True) Whether to remove categorical columns with maximum cardinality, i.e. the number of unique values is equal to the number of instances. Usually the case for names, IDs, etc... minimum_cardinality: bool, optional (default=True) Whether to remove columns with minimum cardinality, i.e. all values in the column are the same. missing_target: bool, optional (default=True) Whether to remove rows with missing values in the target column. Ignored if y is not provided. map_target: bool or None, optional (default=None) Whether to map the target column to numerical values. Should only be used for classification tasks. If None, infer task from the provided target column and set to True if it is classification. Ignored if y is not provided or if it already consists of ordered integers. verbose: int, optional (default=0) Verbosity level of the class. Possible values are: 0 to not print anything. 1 to print basic information. 2 to print detailed information. logger: bool, str, class or None, optional (default=None) If None: Doesn't save a logging file. If bool: True for logging file with default name. False for no logger. If str: Name of the logging file. 'auto' to create an automatic name. If class: python Logger object. Attributes Attributes: mapping: dict Dictionary of the target values mapped to their respective encoded integer. Only available if map_target was performed. Methods fit_transform Same as transform. get_params Get parameters for this estimator. log Write information to the logger and print to stdout. save Save the instance to a pickle file. set_params Set the parameters of this estimator. transform Transform the data. 
method fit_transform (X, y=None) [source] Apply the data cleaning steps on the data. Parameters: X: dict, sequence, np.array or pd.DataFrame Data containing the features, with shape=(n_samples, n_features). y: int, str, sequence, np.array, pd.Series or None, optional (default=None) If None: y is ignored in the transformation. If int: Index of the target column in X. If string: Name of the target column in X. Else: Target column with shape=(n_samples,). Returns: X: pd.DataFrame Transformed feature set. y: pd.Series Transformed target column. Only returned if provided. method get_params (deep=True) [source] Get parameters for this estimator. Parameters: deep: bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators. Returns: params: dict Dictionary of the parameter names mapped to their values. method log (msg, level=0) [source] Write a message to the logger and print it to stdout. Parameters: msg: str Message to write to the logger and print to stdout. level: int, optional (default=0) Minimum verbosity level in order to print the message. method save (filename=None) [source] Save the instance to a pickle file. Parameters: filename: str or None, optional (default=None) Name to save the file with. None to save with default name. method set_params (**params) [source] Set the parameters of this estimator. Parameters: **params: dict Estimator parameters. Returns: self: StandardCleaner Estimator instance. method transform (X, y=None) [source] Apply the data cleaning steps on the data. Parameters: X: dict, sequence, np.array or pd.DataFrame Data containing the features, with shape=(n_samples, n_features). y: int, str, sequence, np.array or pd.Series, optional (default=None) If None: y is ignored in the transformation. If int: Index of the target column in X. If string: Name of the target column in X. Else: Target column with shape=(n_samples,). Returns: X: pd.DataFrame Transformed feature set. 
y: pd.Series Transformed target column. Only returned if provided. Example from atom import ATOMClassifier # ATOM's initializer calls StandardCleaner automatically atom = ATOMClassifier(X, y) or from atom.data_cleaning import StandardCleaner cleaner = StandardCleaner(prohibited_types=['str'], missing_target=True) X, y = cleaner.transform(X, y)","title":"StandardCleaner"},{"location":"API/data_cleaning/standard_cleaner/#standardcleaner","text":"class atom.data_cleaning. StandardCleaner (prohibited_types=[], strip_categorical=True, maximum_cardinality=True, minimum_cardinality=True, missing_target=True, map_target=None, verbose=0, logger=None) [source] Performs standard data cleaning steps on a dataset. Use the parameters to choose which transformations to perform. The available steps are: Remove columns with prohibited data types. Strip categorical features from white spaces. Remove categorical columns with maximal cardinality. Remove columns with minimum cardinality. Remove rows with missing values in the target column. Label-encode the target column. This class is automatically called when initializing atom . Read more in the user guide . Parameters: prohibited_types: str or sequence, optional (default=[]) Columns with any of these types will be removed from the dataset. strip_categorical: bool, optional (default=True) Whether to strip the spaces from values in the categorical columns. maximum_cardinality: bool, optional (default=True) Whether to remove categorical columns with maximum cardinality, i.e. the number of unique values is equal to the number of instances. Usually the case for names, IDs, etc... minimum_cardinality: bool, optional (default=True) Whether to remove columns with minimum cardinality, i.e. all values in the column are the same. missing_target: bool, optional (default=True) Whether to remove rows with missing values in the target column. Ignored if y is not provided. 
map_target: bool or None, optional (default=None) Whether to map the target column to numerical values. Should only be used for classification tasks. If None, infer task from the provided target column and set to True if it is classification. Ignored if y is not provided or if it already consists of ordered integers. verbose: int, optional (default=0) Verbosity level of the class. Possible values are: 0 to not print anything. 1 to print basic information. 2 to print detailed information. logger: bool, str, class or None, optional (default=None) If None: Doesn't save a logging file. If bool: True for logging file with default name. False for no logger. If str: Name of the logging file. 'auto' to create an automatic name. If class: python Logger object.","title":"StandardCleaner"},{"location":"API/data_cleaning/standard_cleaner/#attributes","text":"Attributes: mapping: dict Dictionary of the target values mapped to their respective encoded integer. Only available if map_target was performed.","title":"Attributes"},{"location":"API/data_cleaning/standard_cleaner/#methods","text":"fit_transform Same as transform. get_params Get parameters for this estimator. log Write information to the logger and print to stdout. save Save the instance to a pickle file. set_params Set the parameters of this estimator. transform Transform the data. method fit_transform (X, y=None) [source] Apply the data cleaning steps on the data. Parameters: X: dict, sequence, np.array or pd.DataFrame Data containing the features, with shape=(n_samples, n_features). y: int, str, sequence, np.array, pd.Series or None, optional (default=None) If None: y is ignored in the transformation. If int: Index of the target column in X. If string: Name of the target column in X. Else: Target column with shape=(n_samples,). Returns: X: pd.DataFrame Transformed feature set. y: pd.Series Transformed target column. Only returned if provided. method get_params (deep=True) [source] Get parameters for this estimator. 
Parameters: deep: bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators. Returns: params: dict Dictionary of the parameter names mapped to their values. method log (msg, level=0) [source] Write a message to the logger and print it to stdout. Parameters: msg: str Message to write to the logger and print to stdout. level: int, optional (default=0) Minimum verbosity level in order to print the message. method save (filename=None) [source] Save the instance to a pickle file. Parameters: filename: str or None, optional (default=None) Name to save the file with. None to save with default name. method set_params (**params) [source] Set the parameters of this estimator. Parameters: **params: dict Estimator parameters. Returns: self: StandardCleaner Estimator instance. method transform (X, y=None) [source] Apply the data cleaning steps on the data. Parameters: X: dict, sequence, np.array or pd.DataFrame Data containing the features, with shape=(n_samples, n_features). y: int, str, sequence, np.array or pd.Series, optional (default=None) If None: y is ignored in the transformation. If int: Index of the target column in X. If string: Name of the target column in X. Else: Target column with shape=(n_samples,). Returns: X: pd.DataFrame Transformed feature set. y: pd.Series Transformed target column. Only returned if provided.","title":"Methods"},{"location":"API/data_cleaning/standard_cleaner/#example","text":"from atom import ATOMClassifier # ATOM's initializer calls StandardCleaner automatically atom = ATOMClassifier(X, y) or from atom.data_cleaning import StandardCleaner cleaner = StandardCleaner(prohibited_types=['str'], missing_target=True) X, y = cleaner.transform(X, y)","title":"Example"},{"location":"API/feature_engineering/feature_generator/","text":"FeatureGenerator class atom.feature_engineering. 
FeatureGenerator (strategy='DFS', n_features=None, generations=20, population=500, operators=None, n_jobs=1, verbose=0, logger=None, random_state=None) [source] Use Deep feature Synthesis or a genetic algorithm to create new combinations of existing features to capture the non-linear relations between the original features. This class can be accessed from atom through the feature_generation method. Read more in the user guide . Parameters: strategy: str, optional (default='DFS') Strategy to crate new features. Choose from: 'DFS' to use Deep Feature Synthesis. 'GFG' or 'genetic' to use Genetic Feature Generation. n_features: int or None, optional (default=None) Number of newly generated features to add to the dataset (if strategy='genetic', no more than 1% of the population). If None, select all created. generations: int, optional (default=20) Number of generations to evolve. Only if strategy='genetic'. population: int, optional (default=500) Number of programs in each generation. Only if strategy='genetic'. operators: str, sequence or None, optional (default=None) Name of the operators to be used on the features (for both strategies). None to use all. Valid options are: 'add', 'sub', 'mul', 'div', 'sqrt', 'log', 'sin', 'cos', 'tan'. n_jobs: int, optional (default=1) Number of cores to use for parallel processing. If >0: Number of cores to use. If -1: Use all available cores. If <-1: Use available_cores - 1 + n_jobs. Beware that using multiple processes on the same machine may cause memory issues for large datasets. verbose: int, optional (default=0) Verbosity level of the class. Possible values are: 0 to not print anything. 1 to print basic information. 2 to print detailed information. logger: bool, str, class or None, optional (default=None) If None: Doesn't save a logging file. If bool: True for logging file with default name. False for no logger. If str: Name of the logging file. 'auto' to create an automatic name. If class: python Logger object. 
random_state: int or None, optional (default=None) Seed used by the random number generator. If None, the random number generator is the RandomState instance used by numpy.random . Tip DFS can create many new features and not all of them will be useful. Use FeatureSelector to reduce the number of features! Warning Using the div, log or sqrt operators can return new features with inf or NaN values. Check the warnings that may pop up or use atom 's missing property. Warning When using DFS with n_jobs>1 , make sure to protect your code with if __name__ == \"__main__\" . Featuretools uses dask , which uses python multiprocessing for parallelization. The spawn method on multiprocessing starts a new python process, which requires it to import the __main__ module before it can do its task. Attributes Attributes: symbolic_transformer: class Instance used to calculate the genetic features, from SymbolicTransformer . Only if strategy='genetic'. genetic_features: pd.DataFrame Dataframe of the newly created non-linear features. Only if strategy='genetic'. Columns include: name: Name of the feature (automatically created). description: Operators used to create this feature. fitness: Fitness score. Methods fit Fit the class. fit_transform Fit the class and return the transformed data. get_params Get parameters for this estimator. log Write information to the logger and print to stdout. save Save the instance to a pickle file. set_params Set the parameters of this estimator. transform Transform the data. method fit (X, y) [source] Fit the class. Parameters: X: dict, sequence, np.array or pd.DataFrame Data containing the features, with shape=(n_samples, n_features). y: int, str, sequence, np.array or pd.Series If int: Index of the target column in X. If string: Name of the target column in X. Else: Target column with shape=(n_samples,). Returns: self: FeatureGenerator Fitted instance of self. 
method fit_transform (X, y) [source] Fit the FeatureGenerator and return the transformed data. Parameters: X: dict, sequence, np.array or pd.DataFrame Data containing the features, with shape=(n_samples, n_features). y: int, str, sequence, np.array, pd.Series If int: Index of the target column in X. If string: Name of the target column in X. Else: Target column with shape=(n_samples,). Returns: X: pd.DataFrame Feature set with the newly generated features. method get_params (deep=True) [source] Get parameters for this estimator. Parameters: deep: bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators. Returns: params: dict Dictionary of the parameter names mapped to their values. method log (msg, level=0) [source] Write a message to the logger and print it to stdout. Parameters: msg: str Message to write to the logger and print to stdout. level: int, optional (default=0) Minimum verbosity level in order to print the message. method save (filename=None) [source] Save the instance to a pickle file. Parameters: filename: str or None, optional (default=None) Name to save the file with. None to save with default name. method set_params (**params) [source] Set the parameters of this estimator. Parameters: **params: dict Estimator parameters. Returns: self: FeatureGenerator Estimator instance. method transform (X, y=None) [source] Generate new features. Parameters: X: dict, sequence, np.array or pd.DataFrame Data containing the features, with shape=(n_samples, n_features). y: int, str, sequence, np.array, pd.Series or None, optional (default=None) Does nothing. Implemented for continuity of the API. Returns: X: pd.DataFrame Feature set with the newly generated features. 
Example from atom import ATOMClassifier atom = ATOMClassifier(X, y) atom.feature_generation(strategy='genetic', n_features=3, generations=30, population=400) or from atom.feature_engineering import FeatureGenerator feature_generator = FeatureGenerator(strategy='genetic', n_features=3, generations=30, population=400) feature_generator.fit(X_train, y_train) X = feature_generator.transform(X)","title":"FeatureGenerator"},{"location":"API/feature_engineering/feature_generator/#featuregenerator","text":"class atom.feature_engineering. FeatureGenerator (strategy='DFS', n_features=None, generations=20, population=500, operators=None, n_jobs=1, verbose=0, logger=None, random_state=None) [source] Use Deep Feature Synthesis or a genetic algorithm to create new combinations of existing features to capture the non-linear relations between the original features. This class can be accessed from atom through the feature_generation method. Read more in the user guide . Parameters: strategy: str, optional (default='DFS') Strategy to create new features. Choose from: 'DFS' to use Deep Feature Synthesis. 'GFG' or 'genetic' to use Genetic Feature Generation. n_features: int or None, optional (default=None) Number of newly generated features to add to the dataset (if strategy='genetic', no more than 1% of the population). If None, select all created. generations: int, optional (default=20) Number of generations to evolve. Only if strategy='genetic'. population: int, optional (default=500) Number of programs in each generation. Only if strategy='genetic'. operators: str, sequence or None, optional (default=None) Name of the operators to be used on the features (for both strategies). None to use all. Valid options are: 'add', 'sub', 'mul', 'div', 'sqrt', 'log', 'sin', 'cos', 'tan'. n_jobs: int, optional (default=1) Number of cores to use for parallel processing. If >0: Number of cores to use. If -1: Use all available cores. If <-1: Use available_cores - 1 + n_jobs.
Beware that using multiple processes on the same machine may cause memory issues for large datasets. verbose: int, optional (default=0) Verbosity level of the class. Possible values are: 0 to not print anything. 1 to print basic information. 2 to print detailed information. logger: bool, str, class or None, optional (default=None) If None: Doesn't save a logging file. If bool: True for logging file with default name. False for no logger. If str: Name of the logging file. 'auto' to create an automatic name. If class: python Logger object. random_state: int or None, optional (default=None) Seed used by the random number generator. If None, the random number generator is the RandomState instance used by numpy.random . Tip DFS can create many new features and not all of them will be useful. Use FeatureSelector to reduce the number of features! Warning Using the div, log or sqrt operators can return new features with inf or NaN values. Check the warnings that may pop up or use atom 's missing property. Warning When using DFS with n_jobs>1 , make sure to protect your code with if __name__ == \"__main__\" . Featuretools uses dask , which uses python multiprocessing for parallelization. The spawn method on multiprocessing starts a new python process, which requires it to import the __main__ module before it can do its task.","title":"FeatureGenerator"},{"location":"API/feature_engineering/feature_generator/#attributes","text":"Attributes: symbolic_transformer: class Instance used to calculate the genetic features, from SymbolicTransformer . Only if strategy='genetic'. genetic_features: pd.DataFrame Dataframe of the newly created non-linear features. Only if strategy='genetic'. Columns include: name: Name of the feature (automatically created). description: Operators used to create this feature. fitness: Fitness score.","title":"Attributes"},{"location":"API/feature_engineering/feature_generator/#methods","text":"fit Fit the class. 
fit_transform Fit the class and return the transformed data. get_params Get parameters for this estimator. log Write information to the logger and print to stdout. save Save the instance to a pickle file. set_params Set the parameters of this estimator. transform Transform the data. method fit (X, y) [source] Fit the class. Parameters: X: dict, sequence, np.array or pd.DataFrame Data containing the features, with shape=(n_samples, n_features). y: int, str, sequence, np.array or pd.Series If int: Index of the target column in X. If string: Name of the target column in X. Else: Target column with shape=(n_samples,). Returns: self: FeatureGenerator Fitted instance of self. method fit_transform (X, y) [source] Fit the FeatureGenerator and return the transformed data. Parameters: X: dict, sequence, np.array or pd.DataFrame Data containing the features, with shape=(n_samples, n_features). y: int, str, sequence, np.array, pd.Series If int: Index of the target column in X. If string: Name of the target column in X. Else: Target column with shape=(n_samples,). Returns: X: pd.DataFrame Feature set with the newly generated features. method get_params (deep=True) [source] Get parameters for this estimator. Parameters: deep: bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators. Returns: params: dict Dictionary of the parameter names mapped to their values. method log (msg, level=0) [source] Write a message to the logger and print it to stdout. Parameters: msg: str Message to write to the logger and print to stdout. level: int, optional (default=0) Minimum verbosity level in order to print the message. method save (filename=None) [source] Save the instance to a pickle file. Parameters: filename: str or None, optional (default=None) Name to save the file with. None to save with default name. method set_params (**params) [source] Set the parameters of this estimator. Parameters: **params: dict Estimator parameters. 
Returns: self: FeatureGenerator Estimator instance. method transform (X, y=None) [source] Generate new features. Parameters: X: dict, sequence, np.array or pd.DataFrame Data containing the features, with shape=(n_samples, n_features). y: int, str, sequence, np.array, pd.Series or None, optional (default=None) Does nothing. Implemented for continuity of the API. Returns: X: pd.DataFrame Feature set with the newly generated features.","title":"Methods"},{"location":"API/feature_engineering/feature_generator/#example","text":"from atom import ATOMClassifier atom = ATOMClassifier(X, y) atom.feature_generation(strategy='genetic', n_features=3, generations=30, population=400) or from atom.feature_engineering import FeatureGenerator feature_generator = FeatureGenerator(strategy='genetic', n_features=3, generations=30, population=400) feature_generator.fit(X_train, y_train) X = feature_generator.transform(X)","title":"Example"},{"location":"API/feature_engineering/feature_selector/","text":"FeatureSelector class atom.feature_engineering. FeatureSelector (strategy=None, solver=None, n_features=None, max_frac_repeated=1., max_correlation=1., n_jobs=1, verbose=0, logger=None, random_state=None, **kwargs) [source] Remove features according to the selected strategy. Ties between features with equal scores will be broken in an unspecified way. Additionally, removes features with too low variance and finds pairs of collinear features based on the Pearson correlation coefficient. For each pair above the specified limit (in terms of absolute value), it removes one of the two. This class can be accessed from atom through the feature_selection method. Read more in the user guide . Parameters: strategy: string or None, optional (default=None) Feature selection strategy to use. Choose from: None: Do not perform any feature selection algorithm. 'univariate': Select best features according to a univariate F-test. 'PCA': Perform principal component analysis. 
'SFM': Select best features according to a model. 'RFE': Perform recursive feature elimination. 'RFECV': Perform RFE with cross-validated selection. solver: string, callable or None, optional (default=None) Solver or model to use for the feature selection strategy. See the sklearn documentation for an extended description of the choices. Select None for the default option per strategy (not applicable for SFM, RFE and RFECV). for 'univariate', choose from: 'f_classif' 'f_regression' 'mutual_info_classif' 'mutual_info_regression' 'chi2' Any function taking two arrays (X, y), and returning arrays (scores, p-values). See the sklearn documentation . for 'PCA', choose from: 'auto' (default) 'full' 'arpack' 'randomized' for 'SFM', 'RFE' and 'RFECV': Estimator with either a feature_importances_ or coef_ attribute after fitting. You can use one of ATOM's pre-defined models . Add _class or _reg after the model's name to specify a classification or regression task, e.g. solver='LGB_reg' (not necessary if called from an atom instance). No default option. n_features: int, float or None, optional (default=None) Number of features to select. Choose from: if None: Select all features. if < 1: Fraction of the total features to select. if >= 1: Number of features to select. If strategy='SFM' and the threshold parameter is not specified, the threshold will be set to -np.inf in order to make this parameter the number of features to select. If strategy='RFECV', it's the minimum number of features to select. max_frac_repeated: float or None, optional (default=1.) Remove features with the same value in at least this fraction of the total rows. The default is to keep all features with non-zero variance, i.e. remove the features that have the same value in all samples. None to skip this step. max_correlation: float or None, optional (default=1.) Minimum value of the Pearson correlation coefficient to identify correlated features. A value of 1 removes one of 2 equal columns.
A dataframe of the removed features and their correlation values can be accessed through the collinear attribute. None to skip this step. n_jobs: int, optional (default=1) Number of cores to use for parallel processing. If >0: Number of cores to use. If -1: Use all available cores. If <-1: Use available_cores - 1 + n_jobs. Beware that using multiple processes on the same machine may cause memory issues for large datasets. verbose: int, optional (default=0) Verbosity level of the class. Possible values are: 0 to not print anything. 1 to print basic information. 2 to print detailed information. logger: bool, str, class or None, optional (default=None) If None: Doesn't save a logging file. If bool: True for logging file with default name. False for no logger. If str: Name of the logging file. 'auto' to create an automatic name. If class: python Logger object. random_state: int or None, optional (default=None) Seed used by the random number generator. If None, the random number generator is the RandomState instance used by numpy.random . **kwargs Any extra keyword argument for the PCA, SFM, RFE or RFECV estimators. See the corresponding sklearn documentation for the available options. Tip Use the plot_feature_importance method to examine how much a specific feature contributes to the final predictions. If the model doesn't have a feature_importances_ attribute, use plot_permutation_importance instead. Warning The RFE and RFECV strategies don't work when the solver is a CatBoost model due to incompatibility of the APIs. Attributes Utility attributes Attributes: collinear: pd.DataFrame Dataframe of the removed collinear features. Columns include: drop_feature: name of the feature dropped by the method. correlated feature: Name of the correlated feature(s). correlation_value: Pearson correlation coefficient(s) of the feature pairs. feature_importance: list Remaining features ordered by importance. Only if strategy in ['univariate', 'SFM', 'RFE', 'RFECV'].
For RFE and RFECV, the importance is extracted from the external estimator fitted on the reduced set. univariate: class SelectKBest instance used to fit the estimator. Only if strategy='univariate'. scaler: class Scaler instance used to scale the data. Only if strategy='PCA' and the data was not already scaled. pca: class PCA instance used to fit the estimator. Only if strategy='PCA'. sfm: class SelectFromModel instance used to fit the estimator. Only if strategy='SFM'. rfe: class RFE instance used to fit the estimator. Only if strategy='RFE'. rfecv: class RFECV instance used to fit the estimator. Only if strategy='RFECV'. Plot attributes Attributes: style: str Plotting style. See seaborn's documentation . palette: str Color palette. See seaborn's documentation . title_fontsize: int Fontsize for the plot's title. label_fontsize: int Fontsize for labels and legends. tick_fontsize: int Fontsize for the ticks along the plot's axes. Methods fit Fit the class. fit_transform Fit the class and return the transformed data. get_params Get parameters for this estimator. log Write information to the logger and print to stdout. plot_pca Plot the explained variance ratio vs the number of components. plot_components Plot the explained variance ratio per component. plot_rfecv Plot the scores obtained by the estimator on the RFECV. save Save the instance to a pickle file. set_params Set the parameters of this estimator. transform Transform the data. method fit (X, y=None) [source] Fit the class. Note that the univariate, sfm (when model is not fitted), rfe and rfecv strategies all need a target column. Leaving it None will raise an exception. Parameters: X: dict, sequence, np.array or pd.DataFrame Data containing the features, with shape=(n_samples, n_features). y: int, str, sequence, np.array, pd.Series or None, optional (default=None) If None: y is ignored in the transformation. If int: Index of the target column in X. If string: Name of the target column in X. 
Else: Target column with shape=(n_samples,). Returns: self: FeatureSelector Fitted instance of self. method fit_transform (X, y) [source] Fit the FeatureSelector and return the transformed feature set. Note that the univariate, sfm (when model is not fitted), rfe and rfecv strategies need a target column. Leaving it None will raise an exception. Parameters: X: dict, sequence, np.array or pd.DataFrame Data containing the features, with shape=(n_samples, n_features). y: int, str, sequence, np.array, pd.Series or None, optional (default=None) If None: y is ignored in the transformation. If int: Index of the target column in X. If string: Name of the target column in X. Else: Target column with shape=(n_samples,). Returns: X: pd.DataFrame Transformed feature set. method get_params (deep=True) [source] Get parameters for this estimator. Parameters: deep: bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators. Returns: params: dict Dictionary of the parameter names mapped to their values. method log (msg, level=0) [source] Write a message to the logger and print it to stdout. Parameters: msg: str Message to write to the logger and print to stdout. level: int, optional (default=0) Minimum verbosity level in order to print the message. method plot_pca (title=None, figsize=(10, 6), filename=None, display=True) [source] Plot the explained variance ratio vs the number of components. See plot_pca for a description of the parameters. method plot_components (show=None, title=None, figsize=None, filename=None, display=True) [source] Plot the explained variance ratio per components. See plot_components for a description of the parameters. method plot_rfecv (title=None, figsize=(10, 6), filename=None, display=True) [source] Plot the scores obtained by the estimator fitted on every subset of the data. See plot_rfecv for a description of the parameters. 
method save (filename=None) [source] Save the instance to a pickle file. Parameters: filename: str or None, optional (default=None) Name to save the file with. None to save with default name. method set_params (**params) [source] Set the parameters of this estimator. Parameters: **params: dict Estimator parameters. Returns: self: FeatureSelector Estimator instance. method transform (X, y=None) [source] Transform the feature set. Parameters: X: dict, sequence, np.array or pd.DataFrame Data containing the features, with shape=(n_samples, n_features). y: int, str, sequence, np.array, pd.Series or None, optional (default=None) Does nothing. Implemented for continuity of the API. Returns: X: pd.DataFrame Transformed feature set. Example from atom import ATOMClassifier atom = ATOMClassifier(X, y) atom.feature_selection(strategy='pca', n_features=12, whiten=True, max_correlation=0.96) atom.plot_pca(filename='pca', figsize=(8, 5)) or from atom.feature_engineering import FeatureSelector feature_selector = FeatureSelector(strategy='pca', n_features=12, whiten=True, max_correlation=0.96) feature_selector.fit(X_train, y_train) X = feature_selector.transform(X, y) feature_selector.plot_pca(filename='pca', figsize=(8, 5))","title":"FeatureSelector"},{"location":"API/feature_engineering/feature_selector/#featureselector","text":"class atom.feature_engineering. FeatureSelector (strategy=None, solver=None, n_features=None, max_frac_repeated=1., max_correlation=1., n_jobs=1, verbose=0, logger=None, random_state=None, **kwargs) [source] Remove features according to the selected strategy. Ties between features with equal scores will be broken in an unspecified way. Additionally, removes features with too low variance and finds pairs of collinear features based on the Pearson correlation coefficient. For each pair above the specified limit (in terms of absolute value), it removes one of the two. This class can be accessed from atom through the feature_selection method.
Read more in the user guide . Parameters: strategy: string or None, optional (default=None) Feature selection strategy to use. Choose from: None: Do not perform any feature selection algorithm. 'univariate': Select best features according to a univariate F-test. 'PCA': Perform principal component analysis. 'SFM': Select best features according to a model. 'RFE': Perform recursive feature elimination. 'RFECV': Perform RFE with cross-validated selection. solver: string, callable or None, optional (default=None) Solver or model to use for the feature selection strategy. See the sklearn documentation for an extended description of the choices. Select None for the default option per strategy (not applicable for SFM, RFE and RFECV). for 'univariate', choose from: 'f_classif' 'f_regression' 'mutual_info_classif' 'mutual_info_regression' 'chi2' Any function taking two arrays (X, y), and returning arrays (scores, p-values). See the sklearn documentation . for 'PCA', choose from: 'auto' (default) 'full' 'arpack' 'randomized' for 'SFM', 'RFE' and 'RFECV': Estimator with either a feature_importances_ or coef_ attribute after fitting. You can use one of ATOM's pre-defined models . Add _class or _reg after the model's name to specify a classification or regression task, e.g. solver='LGB_reg' (not necessary if called from an atom instance). No default option. n_features: int, float or None, optional (default=None) Number of features to select. Choose from: if None: Select all features. if < 1: Fraction of the total features to select. if >= 1: Number of features to select. If strategy='SFM' and the threshold parameter is not specified, the threshold will be set to -np.inf in order to make this parameter the number of features to select. If strategy='RFECV', it's the minimum number of features to select. max_frac_repeated: float or None, optional (default=1.) Remove features with the same value in at least this fraction of the total rows.
The default is to keep all features with non-zero variance, i.e. remove the features that have the same value in all samples. None to skip this step. max_correlation: float or None, optional (default=1.) Minimum value of the Pearson correlation coefficient to identify correlated features. A value of 1 removes one of 2 equal columns. A dataframe of the removed features and their correlation values can be accessed through the collinear attribute. None to skip this step. n_jobs: int, optional (default=1) Number of cores to use for parallel processing. If >0: Number of cores to use. If -1: Use all available cores. If <-1: Use available_cores - 1 + n_jobs. Beware that using multiple processes on the same machine may cause memory issues for large datasets. verbose: int, optional (default=0) Verbosity level of the class. Possible values are: 0 to not print anything. 1 to print basic information. 2 to print detailed information. logger: bool, str, class or None, optional (default=None) If None: Doesn't save a logging file. If bool: True for logging file with default name. False for no logger. If str: Name of the logging file. 'auto' to create an automatic name. If class: python Logger object. random_state: int or None, optional (default=None) Seed used by the random number generator. If None, the random number generator is the RandomState instance used by numpy.random . **kwargs Any extra keyword argument for the PCA, SFM, RFE or RFECV estimators. See the corresponding sklearn documentation for the available options. Tip Use the plot_feature_importance method to examine how much a specific feature contributes to the final predictions. If the model doesn't have a feature_importances_ attribute, use plot_permutation_importance instead.
Warning The RFE and RFECV strategies don't work when the solver is a CatBoost model due to incompatibility of the APIs.","title":"FeatureSelector"},{"location":"API/feature_engineering/feature_selector/#attributes","text":"","title":"Attributes"},{"location":"API/feature_engineering/feature_selector/#utility-attributes","text":"Attributes: collinear: pd.DataFrame Dataframe of the removed collinear features. Columns include: drop_feature: name of the feature dropped by the method. correlated feature: Name of the correlated feature(s). correlation_value: Pearson correlation coefficient(s) of the feature pairs. feature_importance: list Remaining features ordered by importance. Only if strategy in ['univariate', 'SFM', 'RFE', 'RFECV']. For RFE and RFECV, the importance is extracted from the external estimator fitted on the reduced set. univariate: class SelectKBest instance used to fit the estimator. Only if strategy='univariate'. scaler: class Scaler instance used to scale the data. Only if strategy='PCA' and the data was not already scaled. pca: class PCA instance used to fit the estimator. Only if strategy='PCA'. sfm: class SelectFromModel instance used to fit the estimator. Only if strategy='SFM'. rfe: class RFE instance used to fit the estimator. Only if strategy='RFE'. rfecv: class RFECV instance used to fit the estimator. Only if strategy='RFECV'.","title":"Utility attributes"},{"location":"API/feature_engineering/feature_selector/#plot-attributes","text":"Attributes: style: str Plotting style. See seaborn's documentation . palette: str Color palette. See seaborn's documentation . title_fontsize: int Fontsize for the plot's title. label_fontsize: int Fontsize for labels and legends. tick_fontsize: int Fontsize for the ticks along the plot's axes.","title":"Plot attributes"},{"location":"API/feature_engineering/feature_selector/#methods","text":"fit Fit the class. fit_transform Fit the class and return the transformed data.
get_params Get parameters for this estimator. log Write information to the logger and print to stdout. plot_pca Plot the explained variance ratio vs the number of components. plot_components Plot the explained variance ratio per component. plot_rfecv Plot the scores obtained by the estimator on the RFECV. save Save the instance to a pickle file. set_params Set the parameters of this estimator. transform Transform the data. method fit (X, y=None) [source] Fit the class. Note that the univariate, sfm (when model is not fitted), rfe and rfecv strategies all need a target column. Leaving it None will raise an exception. Parameters: X: dict, sequence, np.array or pd.DataFrame Data containing the features, with shape=(n_samples, n_features). y: int, str, sequence, np.array, pd.Series or None, optional (default=None) If None: y is ignored in the transformation. If int: Index of the target column in X. If string: Name of the target column in X. Else: Target column with shape=(n_samples,). Returns: self: FeatureSelector Fitted instance of self. method fit_transform (X, y) [source] Fit the FeatureSelector and return the transformed feature set. Note that the univariate, sfm (when model is not fitted), rfe and rfecv strategies need a target column. Leaving it None will raise an exception. Parameters: X: dict, sequence, np.array or pd.DataFrame Data containing the features, with shape=(n_samples, n_features). y: int, str, sequence, np.array, pd.Series or None, optional (default=None) If None: y is ignored in the transformation. If int: Index of the target column in X. If string: Name of the target column in X. Else: Target column with shape=(n_samples,). Returns: X: pd.DataFrame Transformed feature set. method get_params (deep=True) [source] Get parameters for this estimator. Parameters: deep: bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators. 
Returns: params: dict Dictionary of the parameter names mapped to their values. method log (msg, level=0) [source] Write a message to the logger and print it to stdout. Parameters: msg: str Message to write to the logger and print to stdout. level: int, optional (default=0) Minimum verbosity level in order to print the message. method plot_pca (title=None, figsize=(10, 6), filename=None, display=True) [source] Plot the explained variance ratio vs the number of components. See plot_pca for a description of the parameters. method plot_components (show=None, title=None, figsize=None, filename=None, display=True) [source] Plot the explained variance ratio per components. See plot_components for a description of the parameters. method plot_rfecv (title=None, figsize=(10, 6), filename=None, display=True) [source] Plot the scores obtained by the estimator fitted on every subset of the data. See plot_rfecv for a description of the parameters. method save (filename=None) [source] Save the instance to a pickle file. Parameters: filename: str or None, optional (default=None) Name to save the file with. None to save with default name. method set_params (**params) [source] Set the parameters of this estimator. Parameters: **params: dict Estimator parameters. Returns: self: FeatureSelector Estimator instance. method transform (X, y=None) [source] Transform the feature set. Parameters: X: dict, sequence, np.array or pd.DataFrame Data containing the features, with shape=(n_samples, n_features). y: int, str, sequence, np.array, pd.Series or None, optional (default=None) Does nothing. Implemented for continuity of the API. 
Returns: X: pd.DataFrame Transformed feature set.","title":"Methods"},{"location":"API/feature_engineering/feature_selector/#example","text":"from atom import ATOMClassifier atom = ATOMClassifier(X, y) atom.feature_selection(strategy='pca', n_features=12, whiten=True, max_correlation=0.96) atom.plot_pca(filename='pca', figsize=(8, 5)) or from atom.feature_engineering import FeatureSelector feature_selector = FeatureSelector(strategy='pca', n_features=12, whiten=True, max_correlation=0.96) feature_selector.fit(X_train, y_train) X = feature_selector.transform(X, y) feature_selector.plot_pca(filename='pca', figsize=(8, 5))","title":"Example"},{"location":"API/plots/decision_plot/","text":"decision_plot method decision_plot (models=None, index=None, show=None, target=1, title=None, figsize=None, filename=None, display=True, **kwargs) [source] Plot SHAP's decision plot. Visualize model decisions using cumulative SHAP values. Each plotted line explains a single model prediction. If a single prediction is plotted, feature values will be printed in the plot (if supplied). If multiple predictions are plotted together, feature values will not be printed. Plotting too many predictions together will make the plot unintelligible. The explainer will be chosen automatically based on the model's type. Read more about SHAP plots in the user guide . Parameters: models: str, sequence or None, optional (default=None) Name of the models to plot. If None, all models in the pipeline are selected. Note that selecting multiple models will raise an exception. To avoid this, call the plot from a model . index: int, sequence or None, optional (default=None) Indices of the rows in the dataset to plot. If tuple (n, m), select rows n until m. If None, select all rows in the test set. show: int or None, optional (default=None) Number of features (ordered by importance) to show in the plot. None to show all.
target: int or str, optional (default=1) Category to look at in the target class as index or name. Only for multi-class classification tasks. title: str or None, optional (default=None) Plot's title. If None, the default option is used. figsize: tuple or None, optional (default=None) Figure's size, format as (x, y). If None, adapts size to the number of features. filename: str or None, optional (default=None) Name of the file (to save). If None, the figure is not saved. display: bool, optional (default=True) Whether to render the plot. **kwargs Additional keyword arguments for shap's decision_plot. Example from atom import ATOMRegressor atom = ATOMRegressor(X, y) atom.run('RF') atom.decision_plot(index=(120, 140)) atom.decision_plot(index=120)","title":"decision_plot"},{"location":"API/plots/decision_plot/#decision_plot","text":"method decision_plot (models=None, index=None, show=None, target=1, title=None, figsize=None, filename=None, display=True, **kwargs) [source] Plot SHAP's decision plot. Visualize model decisions using cumulative SHAP values. Each plotted line explains a single model prediction. If a single prediction is plotted, feature values will be printed in the plot (if supplied). If multiple predictions are plotted together, feature values will not be printed. Plotting too many predictions together will make the plot unintelligible. The explainer will be chosen automatically based on the model's type. Read more about SHAP plots in the user guide . Parameters: models: str, sequence or None, optional (default=None) Name of the models to plot. If None, all models in the pipeline are selected. Note that selecting multiple models will raise an exception. To avoid this, call the plot from a model . index: int, sequence or None, optional (default=None) Indices of the rows in the dataset to plot. If tuple (n, m), select rows n until m. If None, select all rows in the test set. 
show: int or None, optional (default=None) Number of features (ordered by importance) to show in the plot. None to show all. target: int or str, optional (default=1) Category to look at in the target class as index or name. Only for multi-class classification tasks. title: str or None, optional (default=None) Plot's title. If None, the default option is used. figsize: tuple or None, optional (default=None) Figure's size, format as (x, y). If None, adapts size to the number of features. filename: str or None, optional (default=None) Name of the file (to save). If None, the figure is not saved. display: bool, optional (default=True) Whether to render the plot. **kwargs Additional keyword arguments for shap's decision_plot.","title":"decision_plot"},{"location":"API/plots/decision_plot/#example","text":"from atom import ATOMRegressor atom = ATOMRegressor(X, y) atom.run('RF') atom.decision_plot(index=(120, 140)) atom.decision_plot(index=120)","title":"Example"},{"location":"API/plots/dependence_plot/","text":"dependence_plot method dependence_plot (models=None, index='rank(1)', target=1, title=None, figsize=(10, 6), filename=None, display=True, **kwargs) [source] Plot SHAP's dependence plot. Plots the value of the feature on the x-axis and the SHAP value of the same feature on the y-axis. This shows how the model depends on the given feature, and is like a richer extension of the classical partial dependence plots. Vertical dispersion of the data points represents interaction effects. Grey ticks along the y-axis are data points where the feature's value was NaN. The explainer will be chosen automatically based on the model's type. Read more about SHAP plots in the user guide . Parameters: models: str, sequence or None, optional (default=None) Name of the models to plot. If None, all models in the pipeline are selected. Note that selecting multiple models will raise an exception. To avoid this, call the plot from a model . 
index: int, sequence or None, optional (default='rank(1)') If this is an int, it is the index of the feature to plot. If this is a string it is either the name of the feature to plot, or it can have the form 'rank(int)' to specify the feature with that rank (ordered by mean absolute SHAP value over all the samples). target: int or str, optional (default=1) Category to look at in the target class as index or name. Only for multi-class classification tasks. title: str or None, optional (default=None) Plot's title. If None, the default option is used. figsize: tuple, optional (default=(10, 6)) Figure's size, format as (x, y). filename: str or None, optional (default=None) Name of the file (to save). If None, the figure is not saved. display: bool, optional (default=True) Whether to render the plot. **kwargs Additional keyword arguments for shap's dependence_plot. Example from atom import ATOMRegressor atom = ATOMRegressor(X, y) atom.run('RF') atom.dependence_plot(index='rank(3)')","title":"dependence_plot"},{"location":"API/plots/dependence_plot/#dependence_plot","text":"method dependence_plot (models=None, index='rank(1)', target=1, title=None, figsize=(10, 6), filename=None, display=True, **kwargs) [source] Plot SHAP's dependence plot. Plots the value of the feature on the x-axis and the SHAP value of the same feature on the y-axis. This shows how the model depends on the given feature, and is like a richer extension of the classical partial dependence plots. Vertical dispersion of the data points represents interaction effects. Grey ticks along the y-axis are data points where the feature's value was NaN. The explainer will be chosen automatically based on the model's type. Read more about SHAP plots in the user guide . Parameters: models: str, sequence or None, optional (default=None) Name of the models to plot. If None, all models in the pipeline are selected. Note that selecting multiple models will raise an exception. To avoid this, call the plot from a model . 
index: int, sequence or None, optional (default='rank(1)') If this is an int, it is the index of the feature to plot. If this is a string it is either the name of the feature to plot, or it can have the form 'rank(int)' to specify the feature with that rank (ordered by mean absolute SHAP value over all the samples). target: int or str, optional (default=1) Category to look at in the target class as index or name. Only for multi-class classification tasks. title: str or None, optional (default=None) Plot's title. If None, the default option is used. figsize: tuple, optional (default=(10, 6)) Figure's size, format as (x, y). filename: str or None, optional (default=None) Name of the file (to save). If None, the figure is not saved. display: bool, optional (default=True) Whether to render the plot. **kwargs Additional keyword arguments for shap's dependence_plot.","title":"dependence_plot"},{"location":"API/plots/dependence_plot/#example","text":"from atom import ATOMRegressor atom = ATOMRegressor(X, y) atom.run('RF') atom.dependence_plot(index='rank(3)')","title":"Example"},{"location":"API/plots/force_plot/","text":"force_plot method force_plot (models=None, index=None, target=1, title=None, figsize=(14, 6), filename=None, display=True, **kwargs) [source] Plot SHAP's force plot. Visualize the given SHAP values with an additive force layout. The explainer will be chosen automatically based on the model's type. Note that by default this plot will render using javascript. For a regular figure use matplotlib=True (this option is only available when only 1 row is selected through the index parameter). Read more about SHAP plots in the user guide . Parameters: models: str, sequence or None, optional (default=None) Name of the models to plot. If None, all models in the pipeline are selected. Note that selecting multiple models will raise an exception. To avoid this, call the plot from a model . 
index: int, sequence or None, optional (default=None) Indices of the rows in the dataset to plot. If tuple (n, m), select rows n until m. If None, select all rows in the test set. target: int or str, optional (default=1) Category to look at in the target class as index or name. Only for multi-class classification tasks. title: str or None, optional (default=None) Plot's title. If None, the default option is used. figsize: tuple, optional (default=(14, 6)) Figure's size, format as (x, y). filename: str or None, optional (default=None) Name of the file (to save). If matplotlib=False, the figure will be saved as an html file. If None, the figure is not saved. display: bool, optional (default=True) Whether to render the plot. **kwargs Additional keyword arguments for shap's force_plot. Example from atom import ATOMClassifier atom = ATOMClassifier(X, y) atom.run('lr') atom.force_plot(index=atom.X_test.index[0], matplotlib=True, filename='force_plot')","title":"force_plot"},{"location":"API/plots/force_plot/#force_plot","text":"method force_plot (models=None, index=None, target=1, title=None, figsize=(14, 6), filename=None, display=True, **kwargs) [source] Plot SHAP's force plot. Visualize the given SHAP values with an additive force layout. The explainer will be chosen automatically based on the model's type. Note that by default this plot will render using javascript. For a regular figure use matplotlib=True (this option is only available when only 1 row is selected through the index parameter). Read more about SHAP plots in the user guide . Parameters: models: str, sequence or None, optional (default=None) Name of the models to plot. If None, all models in the pipeline are selected. Note that selecting multiple models will raise an exception. To avoid this, call the plot from a model . index: int, sequence or None, optional (default=None) Indices of the rows in the dataset to plot. If tuple (n, m), select rows n until m. If None, select all rows in the test set. 
target: int or str, optional (default=1) Category to look at in the target class as index or name. Only for multi-class classification tasks. title: str or None, optional (default=None) Plot's title. If None, the default option is used. figsize: tuple, optional (default=(14, 6)) Figure's size, format as (x, y). filename: str or None, optional (default=None) Name of the file (to save). If matplotlib=False, the figure will be saved as an html file. If None, the figure is not saved. display: bool, optional (default=True) Whether to render the plot. **kwargs Additional keyword arguments for shap's force_plot.","title":"force_plot"},{"location":"API/plots/force_plot/#example","text":"from atom import ATOMClassifier atom = ATOMClassifier(X, y) atom.run('lr') atom.force_plot(index=atom.X_test.index[0], matplotlib=True, filename='force_plot')","title":"Example"},{"location":"API/plots/plot_bagging/","text":"plot_bagging method plot_bagging (models=None, metric=0, title=None, figsize=None, filename=None, display=True) [source] Plot a boxplot of the bagging's results. Only available for models fitted using bagging . Parameters: models: str, sequence or None, optional (default=None) Name of the models to plot. If None, all models in the pipeline that used bagging are selected. metric: int or str, optional (default=0) Index or name of the metric to plot. Only for multi-metric runs. title: str or None, optional (default=None) Plot's title. If None, the default option is used. figsize: tuple, optional (default=None) Figure's size, format as (x, y). If None, adapts size to the number of models. filename: str or None, optional (default=None) Name of the file (to save). If None, the figure is not saved. display: bool, optional (default=True) Whether to render the plot.
Example from atom import ATOMClassifier atom = ATOMClassifier(X, y) atom.run(['LR', 'Tree', 'LGB', 'MLP'], metric='accuracy', bagging=5) atom.plot_bagging()","title":"plot_bagging"},{"location":"API/plots/plot_bagging/#plot_bagging","text":"method plot_bagging (models=None, metric=0, title=None, figsize=None, filename=None, display=True) [source] Plot a boxplot of the bagging's results. Only available for models fitted using bagging . Parameters: models: str, sequence or None, optional (default=None) Name of the models to plot. If None, all models in the pipeline that used bagging are selected. metric: int or str, optional (default=0) Index or name of the metric to plot. Only for multi-metric runs. title: str or None, optional (default=None) Plot's title. If None, the default option is used. figsize: tuple, optional (default=None) Figure's size, format as (x, y). If None, adapts size to the number of models. filename: str or None, optional (default=None) Name of the file (to save). If None, the figure is not saved. display: bool, optional (default=True) Whether to render the plot.","title":"plot_bagging"},{"location":"API/plots/plot_bagging/#example","text":"from atom import ATOMClassifier atom = ATOMClassifier(X, y) atom.run(['LR', 'Tree', 'LGB', 'MLP'], metric='accuracy', bagging=5) atom.plot_bagging()","title":"Example"},{"location":"API/plots/plot_bo/","text":"plot_bo method plot_bo (models=None, metric=0, title=None, figsize=(10, 8), filename=None, display=True) [source] Plot the bayesian optimization scoring. Only for models that ran the hyperparameter optimization. This is the same plot as the one produced by bo_params={'plot_bo': True} while running the optimization. Creates a canvas with two plots: the first plot shows the score of every trial and the second shows the distance between the last consecutive steps. Parameters: models: str, sequence or None, optional (default=None) Name of the models to plot.
If None, all models in the pipeline that used bayesian optimization are selected. metric: int or str, optional (default=0) Index or name of the metric to plot. Only for multi-metric runs. title: str or None, optional (default=None) Plot's title. If None, the default option is used. figsize: tuple, optional (default=(10, 8)) Figure's size, format as (x, y). filename: str or None, optional (default=None) Name of the file (to save). If None, the figure is not saved. display: bool, optional (default=True) Whether to render the plot. Example from atom import ATOMClassifier atom = ATOMClassifier(X, y) atom.run(['LDA', 'LGB'], metric='f1', n_calls=24, n_initial_points=10) atom.plot_bo()","title":"plot_bo"},{"location":"API/plots/plot_bo/#plot_bo","text":"method plot_bo (models=None, metric=0, title=None, figsize=(10, 8), filename=None, display=True) [source] Plot the bayesian optimization scoring. Only for models that ran the hyperparameter optimization. This is the same plot as the one produced by bo_params={'plot_bo': True} while running the optimization. Creates a canvas with two plots: the first plot shows the score of every trial and the second shows the distance between the last consecutive steps. Parameters: models: str, sequence or None, optional (default=None) Name of the models to plot. If None, all models in the pipeline that used bayesian optimization are selected. metric: int or str, optional (default=0) Index or name of the metric to plot. Only for multi-metric runs. title: str or None, optional (default=None) Plot's title. If None, the default option is used. figsize: tuple, optional (default=(10, 8)) Figure's size, format as (x, y). filename: str or None, optional (default=None) Name of the file (to save). If None, the figure is not saved. 
display: bool, optional (default=True) Whether to render the plot.","title":"plot_bo"},{"location":"API/plots/plot_bo/#example","text":"from atom import ATOMClassifier atom = ATOMClassifier(X, y) atom.run(['LDA', 'LGB'], metric='f1', n_calls=24, n_initial_points=10) atom.plot_bo()","title":"Example"},{"location":"API/plots/plot_calibration/","text":"plot_calibration method plot_calibration (models=None, n_bins=10, title=None, figsize=(10, 10), filename=None, display=True) [source] Plot the calibration curve for a binary classifier. Well calibrated classifiers are probabilistic classifiers for which the output of the predict_proba method can be directly interpreted as a confidence level. For instance a well calibrated (binary) classifier should classify the samples such that among the samples to which it gave a predict_proba value close to 0.8, approx. 80% actually belong to the positive class. Read more in sklearn's documentation . This figure shows two plots: the calibration curve, where the x-axis represents the average predicted probability in each bin and the y-axis is the fraction of positives, i.e. the proportion of samples whose class is the positive class (in each bin); and a distribution of all predicted probabilities of the classifier. Parameters: models: str, sequence or None, optional (default=None) Name of the models to plot. If None, all models in the pipeline are selected. n_bins: int, optional (default=10) Number of bins for the calibration calculation and the histogram. Minimum of 5 required. title: str or None, optional (default=None) Plot's title. If None, the default option is used. figsize: tuple, optional (default=(10, 10)) Figure's size, format as (x, y). filename: str or None, optional (default=None) Name of the file (to save). If None, the figure is not saved. display: bool, optional (default=True) Whether to render the plot. 
Example from atom import ATOMClassifier atom = ATOMClassifier(X) atom.run(['GNB', 'LR', 'LGB'], metric='average_precision') atom.plot_calibration()","title":"plot_calibration"},{"location":"API/plots/plot_calibration/#plot_calibration","text":"method plot_calibration (models=None, n_bins=10, title=None, figsize=(10, 10), filename=None, display=True) [source] Plot the calibration curve for a binary classifier. Well calibrated classifiers are probabilistic classifiers for which the output of the predict_proba method can be directly interpreted as a confidence level. For instance a well calibrated (binary) classifier should classify the samples such that among the samples to which it gave a predict_proba value close to 0.8, approx. 80% actually belong to the positive class. Read more in sklearn's documentation . This figure shows two plots: the calibration curve, where the x-axis represents the average predicted probability in each bin and the y-axis is the fraction of positives, i.e. the proportion of samples whose class is the positive class (in each bin); and a distribution of all predicted probabilities of the classifier. Parameters: models: str, sequence or None, optional (default=None) Name of the models to plot. If None, all models in the pipeline are selected. n_bins: int, optional (default=10) Number of bins for the calibration calculation and the histogram. Minimum of 5 required. title: str or None, optional (default=None) Plot's title. If None, the default option is used. figsize: tuple, optional (default=(10, 10)) Figure's size, format as (x, y). filename: str or None, optional (default=None) Name of the file (to save). If None, the figure is not saved. 
display: bool, optional (default=True) Whether to render the plot.","title":"plot_calibration"},{"location":"API/plots/plot_calibration/#example","text":"from atom import ATOMClassifier atom = ATOMClassifier(X) atom.run(['GNB', 'LR', 'LGB'], metric='average_precision') atom.plot_calibration()","title":"Example"},{"location":"API/plots/plot_components/","text":"plot_components method plot_components (show=None, title=None, figsize=None, filename=None, display=True) [source] Plot the explained variance ratio per components. Only available if PCA was applied on the data. Parameters: show: int or None, optional (default=None) Number of components to show. If None, the number of components in the data are plotted. title: str or None, optional (default=None) Plot's title. If None, the default option is used. figsize: tuple or None, optional (default=None) Figure's size, format as (x, y). If None, adapts size to show parameter. filename: str or None, optional (default=None) Name of the file (to save). If None, the figure is not saved. display: bool, optional (default=True) Whether to render the plot. Example from atom import ATOMClassifier atom = ATOMClassifier(X, y) atom.feature_selection(strategy='PCA', n_features=11) atom.plot_components()","title":"plot_components"},{"location":"API/plots/plot_components/#plot_components","text":"method plot_components (show=None, title=None, figsize=None, filename=None, display=True) [source] Plot the explained variance ratio per components. Only available if PCA was applied on the data. Parameters: show: int or None, optional (default=None) Number of components to show. If None, the number of components in the data are plotted. title: str or None, optional (default=None) Plot's title. If None, the default option is used. figsize: tuple or None, optional (default=None) Figure's size, format as (x, y). If None, adapts size to show parameter. filename: str or None, optional (default=None) Name of the file (to save). 
If None, the figure is not saved. display: bool, optional (default=True) Whether to render the plot.","title":"plot_components"},{"location":"API/plots/plot_components/#example","text":"from atom import ATOMClassifier atom = ATOMClassifier(X, y) atom.feature_selection(strategy='PCA', n_features=11) atom.plot_components()","title":"Example"},{"location":"API/plots/plot_confusion_matrix/","text":"plot_confusion_matrix method plot_confusion_matrix (models=None, dataset='test', normalize=False, title=None, figsize=None, filename=None, display=True) [source] Plot a model's confusion matrix. Only for classification tasks. For 1 model: plot the confusion matrix in a heatmap. For multiple models: compare TP, FP, FN and TN in a barplot (not implemented for multiclass classification tasks). Parameters: models: str, sequence or None, optional (default=None) Name of the models to plot. If None, all models in the pipeline are selected. dataset: str, optional (default='test') Data set on which to calculate the confusion matrix. Options are 'train' or 'test'. normalize: bool, optional (default=False) Whether to normalize the matrix. Only for the heatmap plot. title: str or None, optional (default=None) Plot's title. If None, the default option is used. figsize: tuple, optional (default=None) Figure's size, format as (x, y). If None, adapts size to plot type. filename: str or None, optional (default=None) Name of the file (to save). If None, the figure is not saved. display: bool, optional (default=True) Whether to render the plot. Example from atom import ATOMClassifier atom = ATOMClassifier(X, y) atom.run(['Tree', 'Bag']) atom.Tree.plot_confusion_matrix(normalize=True) atom.plot_confusion_matrix()","title":"plot_confusion_matrix"},{"location":"API/plots/plot_confusion_matrix/#plot_confusion_matrix","text":"method plot_confusion_matrix (models=None, dataset='test', normalize=False, title=None, figsize=None, filename=None, display=True) [source] Plot a model's confusion matrix. 
Only for classification tasks. For 1 model: plot the confusion matrix in a heatmap. For multiple models: compare TP, FP, FN and TN in a barplot (not implemented for multiclass classification tasks). Parameters: models: str, sequence or None, optional (default=None) Name of the models to plot. If None, all models in the pipeline are selected. dataset: str, optional (default='test') Data set on which to calculate the confusion matrix. Options are 'train' or 'test'. normalize: bool, optional (default=False) Whether to normalize the matrix. Only for the heatmap plot. title: str or None, optional (default=None) Plot's title. If None, the default option is used. figsize: tuple, optional (default=None) Figure's size, format as (x, y). If None, adapts size to plot type. filename: str or None, optional (default=None) Name of the file (to save). If None, the figure is not saved. display: bool, optional (default=True) Whether to render the plot.","title":"plot_confusion_matrix"},{"location":"API/plots/plot_confusion_matrix/#example","text":"from atom import ATOMClassifier atom = ATOMClassifier(X, y) atom.run(['Tree', 'Bag']) atom.Tree.plot_confusion_matrix(normalize=True) atom.plot_confusion_matrix()","title":"Example"},{"location":"API/plots/plot_correlation/","text":"plot_correlation method plot_correlation (method='pearson', title=None, figsize=(8, 8), filename=None, display=True) [source] Plot the data's correlation matrix. Ignores non-numeric columns. Parameters: method: str, optional (default='pearson') Method of correlation. Choose from 'pearson', 'kendall' or 'spearman'. title: str or None, optional (default=None) Plot's title. If None, the default option is used. figsize: tuple, optional (default=(8, 8)) Figure's size, format as (x, y). filename: str or None, optional (default=None) Name of the file (to save). If None, the figure is not saved. display: bool, optional (default=True) Whether to render the plot. 
Example from atom import ATOMClassifier atom = ATOMClassifier(X, y='RainTomorrow') atom.plot_correlation()","title":"plot_correlation"},{"location":"API/plots/plot_correlation/#plot_correlation","text":"method plot_correlation (method='pearson', title=None, figsize=(8, 8), filename=None, display=True) [source] Plot the data's correlation matrix. Ignores non-numeric columns. Parameters: method: str, optional (default='pearson') Method of correlation. Choose from 'pearson', 'kendall' or 'spearman'. title: str or None, optional (default=None) Plot's title. If None, the default option is used. figsize: tuple, optional (default=(8, 8)) Figure's size, format as (x, y). filename: str or None, optional (default=None) Name of the file (to save). If None, the figure is not saved. display: bool, optional (default=True) Whether to render the plot.","title":"plot_correlation"},{"location":"API/plots/plot_correlation/#example","text":"from atom import ATOMClassifier atom = ATOMClassifier(X, y='RainTomorrow') atom.plot_correlation()","title":"Example"},{"location":"API/plots/plot_errors/","text":"plot_errors method plot_errors (models=None, dataset='test', title=None, figsize=(10, 6), filename=None, display=True) [source] Plot a model's prediction errors, i.e. the actual targets from a set against the predicted values generated by the regressor. A linear fit is made on the data. The gray, intersected line shows the identity line. This plot can be useful to detect noise or heteroscedasticity along a range of the target domain. Only for regression tasks. Parameters: models: str, sequence or None, optional (default=None) Name of the models to plot. If None, all models in the pipeline are selected. dataset: str, optional (default='test') Data set on which to calculate the errors. Options are 'train', 'test' or 'both'. title: str or None, optional (default=None) Plot's title. If None, the default option is used.
figsize: tuple, optional (default=(10, 6)) Figure's size, format as (x, y). filename: str or None, optional (default=None) Name of the file (to save). If None, the figure is not saved. display: bool, optional (default=True) Whether to render the plot. Example from atom import ATOMRegressor atom = ATOMRegressor(X, y) atom.run(['OLS', 'LGB'], metric='MAE') atom.plot_errors()","title":"plot_errors"},{"location":"API/plots/plot_errors/#plot_errors","text":"method plot_errors (models=None, dataset='test', title=None, figsize=(10, 6), filename=None, display=True) [source] Plot a model's prediction errors, i.e. the actual targets from a set against the predicted values generated by the regressor. A linear fit is made on the data. The gray, intersected line shows the identity line. This plot can be useful to detect noise or heteroscedasticity along a range of the target domain. Only for regression tasks. Parameters: models: str, sequence or None, optional (default=None) Name of the models to plot. If None, all models in the pipeline are selected. dataset: str, optional (default='test') Data set on which to calculate the errors. Options are 'train', 'test' or 'both'. title: str or None, optional (default=None) Plot's title. If None, the default option is used. figsize: tuple, optional (default=(10, 6)) Figure's size, format as (x, y). filename: str or None, optional (default=None) Name of the file (to save). If None, the figure is not saved. display: bool, optional (default=True) Whether to render the plot.","title":"plot_errors"},{"location":"API/plots/plot_errors/#example","text":"from atom import ATOMRegressor atom = ATOMRegressor(X, y) atom.run(['OLS', 'LGB'], metric='MAE') atom.plot_errors()","title":"Example"},{"location":"API/plots/plot_evals/","text":"plot_evals method plot_evals (models=None, dataset='both', title=None, figsize=(10, 6), filename=None, display=True) [source] Plot evaluation curves for the train and test set.
Only for models that allow in-training evaluation (XGB, LGB, CatB). The metric is provided by the model's package and is different for every model and every task. For this reason, the method only allows plotting one model at a time. Parameters: models: str, sequence or None, optional (default=None) Name of the model to plot. If None, all models in the pipeline are selected. Note that leaving the default option could raise an exception if there are multiple models in the pipeline. To avoid this, call the plot from a model , e.g. atom.lgb.plot_evals() . dataset: str, optional (default='both') Data set on which to calculate the evaluation curves. Options are 'train', 'test' or 'both'. title: str or None, optional (default=None) Plot's title. If None, the default option is used. figsize: tuple, optional (default=(10, 6)) Figure's size, format as (x, y). filename: str or None, optional (default=None) Name of the file (to save). If None, the figure is not saved. display: bool, optional (default=True) Whether to render the plot. Example from atom import ATOMRegressor atom = ATOMRegressor(X, y) atom.run(['Bag', 'LGB']) atom.lgb.plot_evals()","title":"plot_evals"},{"location":"API/plots/plot_evals/#plot_evals","text":"method plot_evals (models=None, dataset='both', title=None, figsize=(10, 6), filename=None, display=True) [source] Plot evaluation curves for the train and test set. Only for models that allow in-training evaluation (XGB, LGB, CatB). The metric is provided by the model's package and is different for every model and every task. For this reason, the method only allows plotting one model at a time. Parameters: models: str, sequence or None, optional (default=None) Name of the model to plot. If None, all models in the pipeline are selected. Note that leaving the default option could raise an exception if there are multiple models in the pipeline. To avoid this, call the plot from a model , e.g. atom.lgb.plot_evals() . 
dataset: str, optional (default='both') Data set on which to calculate the evaluation curves. Options are 'train', 'test' or 'both'. title: str or None, optional (default=None) Plot's title. If None, the default option is used. figsize: tuple, optional (default=(10, 6)) Figure's size, format as (x, y). filename: str or None, optional (default=None) Name of the file (to save). If None, the figure is not saved. display: bool, optional (default=True) Whether to render the plot.","title":"plot_evals"},{"location":"API/plots/plot_evals/#example","text":"from atom import ATOMRegressor atom = ATOMRegressor(X, y) atom.run(['Bag', 'LGB']) atom.lgb.plot_evals()","title":"Example"},{"location":"API/plots/plot_feature_importance/","text":"plot_feature_importance method plot_feature_importance (models=None, show=None, title=None, figsize=None, filename=None, display=True) [source] Plot a tree-based model's feature importance. The importances are normalized in order to be able to compare them between models. The feature_importance attribute is updated with the extracted importance ranking. Parameters: models: str, sequence or None, optional (default=None) Name of the models to plot. If None, all the models in the pipeline are selected. show: int, optional (default=None) Number of best features to show in the plot. None to show all. title: str or None, optional (default=None) Plot's title. If None, the default option is used. figsize: tuple or None, optional (default=None) Figure's size, format as (x, y). If None, adapts size to show parameter. filename: str or None, optional (default=None) Name of the file (to save). If None, the figure is not saved. display: bool, optional (default=True) Whether to render the plot. 
Example from atom import ATOMClassifier atom = ATOMClassifier(X, y) atom.run(['LR', 'RF'], metric='recall_weighted') atom.RF.plot_feature_importance(show=11, filename='random_forest_importance.png')","title":"plot_feature_importance"},{"location":"API/plots/plot_feature_importance/#plot_feature_importance","text":"method plot_feature_importance (models=None, show=None, title=None, figsize=None, filename=None, display=True) [source] Plot a tree-based model's feature importance. The importances are normalized in order to be able to compare them between models. The feature_importance attribute is updated with the extracted importance ranking. Parameters: models: str, sequence or None, optional (default=None) Name of the models to plot. If None, all the models in the pipeline are selected. show: int, optional (default=None) Number of best features to show in the plot. None to show all. title: str or None, optional (default=None) Plot's title. If None, the default option is used. figsize: tuple or None, optional (default=None) Figure's size, format as (x, y). If None, adapts size to show parameter. filename: str or None, optional (default=None) Name of the file (to save). If None, the figure is not saved. display: bool, optional (default=True) Whether to render the plot.","title":"plot_feature_importance"},{"location":"API/plots/plot_feature_importance/#example","text":"from atom import ATOMClassifier atom = ATOMClassifier(X, y) atom.run(['LR', 'RF'], metric='recall_weighted') atom.RF.plot_feature_importance(show=11, filename='random_forest_importance.png')","title":"Example"},{"location":"API/plots/plot_gains/","text":"plot_gains method plot_gains (models=None, dataset='test', title=None, figsize=(10, 6), filename=None, display=True) [source] Plot the cumulative gains curve. Only for binary classification tasks. Parameters: models: str, sequence or None, optional (default=None) Name of the models to plot. If None, all models in the pipeline are selected. 
dataset: str, optional (default='test') Data set on which to calculate the gains curve. Options are 'train', 'test' or 'both'. title: str or None, optional (default=None) Plot's title. If None, the default option is used. figsize: tuple, optional (default=(10, 6)) Figure's size, format as (x, y). filename: str or None, optional (default=None) Name of the file (to save). If None, the figure is not saved. display: bool, optional (default=True) Whether to render the plot. Example from atom import ATOMClassifier atom = ATOMClassifier(X, y) atom.run(['GNB', 'RF', 'LGB'], metric='roc_auc') atom.plot_gains(filename='cumulative_gains_curve.png')","title":"plot_gains"},{"location":"API/plots/plot_gains/#plot_gains","text":"method plot_gains (models=None, dataset='test', title=None, figsize=(10, 6), filename=None, display=True) [source] Plot the cumulative gains curve. Only for binary classification tasks. Parameters: models: str, sequence or None, optional (default=None) Name of the models to plot. If None, all models in the pipeline are selected. dataset: str, optional (default='test') Data set on which to calculate the gains curve. Options are 'train', 'test' or 'both'. title: str or None, optional (default=None) Plot's title. If None, the default option is used. figsize: tuple, optional (default=(10, 6)) Figure's size, format as (x, y). filename: str or None, optional (default=None) Name of the file (to save). If None, the figure is not saved. 
display: bool, optional (default=True) Whether to render the plot.","title":"plot_gains"},{"location":"API/plots/plot_gains/#example","text":"from atom import ATOMClassifier atom = ATOMClassifier(X, y) atom.run(['GNB', 'RF', 'LGB'], metric='roc_auc') atom.plot_gains(filename='cumulative_gains_curve.png')","title":"Example"},{"location":"API/plots/plot_learning_curve/","text":"plot_learning_curve method plot_learning_curve (models=None, metric=0, title=None, figsize=(10, 6), filename=None, display=True) [source] Plot the model's learning curve: score vs number of training samples. Only available if the models were fitted using train sizing . Parameters: models: str, sequence or None, optional (default=None) Name of the models to plot. If None, all models in the pipeline are selected. metric: int or str, optional (default=0) Index or name of the metric to plot. Only for multi-metric runs. title: str or None, optional (default=None) Plot's title. If None, the default option is used. figsize: tuple, optional (default=(10, 6)) Figure's size, format as (x, y). filename: str or None, optional (default=None) Name of the file (to save). If None, the figure is not saved. display: bool, optional (default=True) Whether to render the plot. Example import numpy as np from atom import ATOMClassifier atom = ATOMClassifier(X, y) atom.train_sizing(['GNB', 'LDA'], metric='accuracy', train_sizes=np.linspace(0.1, 1.0, 9), bagging=5) atom.plot_learning_curve()","title":"plot_learning_curve"},{"location":"API/plots/plot_learning_curve/#plot_learning_curve","text":"method plot_learning_curve (models=None, metric=0, title=None, figsize=(10, 6), filename=None, display=True) [source] Plot the model's learning curve: score vs number of training samples. Only available if the models were fitted using train sizing . Parameters: models: str, sequence or None, optional (default=None) Name of the models to plot. If None, all models in the pipeline are selected. 
metric: int or str, optional (default=0) Index or name of the metric to plot. Only for multi-metric runs. title: str or None, optional (default=None) Plot's title. If None, the default option is used. figsize: tuple, optional (default=(10, 6)) Figure's size, format as (x, y). filename: str or None, optional (default=None) Name of the file (to save). If None, the figure is not saved. display: bool, optional (default=True) Whether to render the plot.","title":"plot_learning_curve"},{"location":"API/plots/plot_learning_curve/#example","text":"import numpy as np from atom import ATOMClassifier atom = ATOMClassifier(X, y) atom.train_sizing(['GNB', 'LDA'], metric='accuracy', train_sizes=np.linspace(0.1, 1.0, 9), bagging=5) atom.plot_learning_curve()","title":"Example"},{"location":"API/plots/plot_lift/","text":"plot_lift method plot_lift (models=None, dataset='test', title=None, figsize=(10, 6), filename=None, display=True) [source] Plot the lift curve. Only for binary classification. Parameters: models: str, sequence or None, optional (default=None) Name of the models to plot. If None, all models in the pipeline are selected. dataset: str, optional (default='test') Data set on which to calculate the lift curve. Options are 'train', 'test' or 'both'. title: str or None, optional (default=None) Plot's title. If None, the default option is used. figsize: tuple, optional (default=(10, 6)) Figure's size, format as (x, y). filename: str or None, optional (default=None) Name of the file (to save). If None, the figure is not saved. display: bool, optional (default=True) Whether to render the plot. Example from atom import ATOMClassifier atom = ATOMClassifier(X, y) atom.run(['GNB', 'RF', 'LGB'], metric='roc_auc') atom.plot_lift(filename='lift_curve.png')","title":"plot_lift"},{"location":"API/plots/plot_lift/#plot_lift","text":"method plot_lift (models=None, dataset='test', title=None, figsize=(10, 6), filename=None, display=True) [source] Plot the lift curve. 
Only for binary classification. Parameters: models: str, sequence or None, optional (default=None) Name of the models to plot. If None, all models in the pipeline are selected. dataset: str, optional (default='test') Data set on which to calculate the lift curve. Options are 'train', 'test' or 'both'. title: str or None, optional (default=None) Plot's title. If None, the default option is used. figsize: tuple, optional (default=(10, 6)) Figure's size, format as (x, y). filename: str or None, optional (default=None) Name of the file (to save). If None, the figure is not saved. display: bool, optional (default=True) Whether to render the plot.","title":"plot_lift"},{"location":"API/plots/plot_lift/#example","text":"from atom import ATOMClassifier atom = ATOMClassifier(X, y) atom.run(['GNB', 'RF', 'LGB'], metric='roc_auc') atom.plot_lift(filename='lift_curve.png')","title":"Example"},{"location":"API/plots/plot_partial_dependence/","text":"plot_partial_dependence method plot_partial_dependence (models=None, features=None, target=None, title=None, figsize=(10, 6), filename=None, display=True) [source] Plot the partial dependence of features. The partial dependence of a feature (or a set of features) corresponds to the average response of the model for each possible value of the feature. Two-way partial dependence plots are plotted as contour plots (only allowed for single model plots). The deciles of the feature values will be shown with tick marks on the x-axes for one-way plots, and on both axes for two-way plots. Parameters: models: str, sequence or None, optional (default=None) Name of the models to plot. If None, all the models in the pipeline are selected. features: int, str, sequence or None, optional (default=None) Features or feature pairs (name or index) to get the partial dependence from. Maximum of 3 allowed. 
If None, it uses the top 3 features if feature_importance is defined (see plot_feature_importance or plot_permutation_importance ), else it uses the first 3 features in the dataset. target: int or str, optional (default=1) Category to look at in the target class as index or name. Only for multi-class classification tasks. title: str or None, optional (default=None) Plot's title. If None, the default option is used. figsize: tuple, optional (default=(10, 6)) Figure's size, format as (x, y). filename: str or None, optional (default=None) Name of the file (to save). If None, the figure is not saved. display: bool, optional (default=True) Whether to render the plot. Example from atom import ATOMClassifier atom = ATOMClassifier(X, y) atom.feature_selection(strategy='PCA', n_features=6) atom.run(['Tree', 'Bag'], metric='precision') atom.Tree.plot_partial_dependence(features=[0, 1, (1, 3)]) atom.plot_partial_dependence()","title":"plot_partial_dependence"},{"location":"API/plots/plot_partial_dependence/#plot_partial_dependence","text":"method plot_partial_dependence (models=None, features=None, target=None, title=None, figsize=(10, 6), filename=None, display=True) [source] Plot the partial dependence of features. The partial dependence of a feature (or a set of features) corresponds to the average response of the model for each possible value of the feature. Two-way partial dependence plots are plotted as contour plots (only allowed for single model plots). The deciles of the feature values will be shown with tick marks on the x-axes for one-way plots, and on both axes for two-way plots. Parameters: models: str, sequence or None, optional (default=None) Name of the models to plot. If None, all the models in the pipeline are selected. features: int, str, sequence or None, optional (default=None) Features or feature pairs (name or index) to get the partial dependence from. Maximum of 3 allowed. 
If None, it uses the top 3 features if feature_importance is defined (see plot_feature_importance or plot_permutation_importance ), else it uses the first 3 features in the dataset. target: int or str, optional (default=1) Category to look at in the target class as index or name. Only for multi-class classification tasks. title: str or None, optional (default=None) Plot's title. If None, the default option is used. figsize: tuple, optional (default=(10, 6)) Figure's size, format as (x, y). filename: str or None, optional (default=None) Name of the file (to save). If None, the figure is not saved. display: bool, optional (default=True) Whether to render the plot.","title":"plot_partial_dependence"},{"location":"API/plots/plot_partial_dependence/#example","text":"from atom import ATOMClassifier atom = ATOMClassifier(X, y) atom.feature_selection(strategy='PCA', n_features=6) atom.run(['Tree', 'Bag'], metric='precision') atom.Tree.plot_partial_dependence(features=[0, 1, (1, 3)]) atom.plot_partial_dependence()","title":"Example"},{"location":"API/plots/plot_pca/","text":"plot_pca method plot_pca (title=None, figsize=(10, 6), filename=None, display=True) [source] Plot the explained variance ratio vs the number of components. Only available if PCA was applied on the data. Parameters: title: str or None, optional (default=None) Plot's title. If None, the default option is used. figsize: tuple, optional (default=(10, 6)) Figure's size, format as (x, y). filename: str or None, optional (default=None) Name of the file (to save). If None, the figure is not saved. display: bool, optional (default=True) Whether to render the plot. 
Example from atom import ATOMClassifier atom = ATOMClassifier(X, y) atom.feature_selection(strategy='PCA', n_features=11) atom.plot_pca()","title":"plot_pca"},{"location":"API/plots/plot_pca/#plot_pca","text":"method plot_pca (title=None, figsize=(10, 6), filename=None, display=True) [source] Plot the explained variance ratio vs the number of components. Only available if PCA was applied on the data. Parameters: title: str or None, optional (default=None) Plot's title. If None, the default option is used. figsize: tuple, optional (default=(10, 6)) Figure's size, format as (x, y). filename: str or None, optional (default=None) Name of the file (to save). If None, the figure is not saved. display: bool, optional (default=True) Whether to render the plot.","title":"plot_pca"},{"location":"API/plots/plot_pca/#example","text":"from atom import ATOMClassifier atom = ATOMClassifier(X, y) atom.feature_selection(strategy='PCA', n_features=11) atom.plot_pca()","title":"Example"},{"location":"API/plots/plot_permutation_importance/","text":"plot_permutation_importance method plot_permutation_importance (models=None, show=None, n_repeats=10, title=None, figsize=None, filename=None, display=True) [source] Plot the feature permutation importance of models. Calculating all permutations can be time consuming, especially if n_repeats is high. They are stored under the attribute permutations . This means that if a plot is repeated for the same model with the same n_repeats , it will be considerably faster. The feature_importance attribute is updated with the extracted importance ranking. Parameters: models: str, sequence or None, optional (default=None) Name of the models to plot. If None, all models in the pipeline are selected. show: int, optional (default=None) Number of best features to show in the plot. None to show all. n_repeats: int, optional (default=10) Number of times to permute each feature. title: str or None, optional (default=None) Plot's title. 
If None, the default option is used. figsize: tuple or None, optional (default=None) Figure's size, format as (x, y). If None, adapts size to show parameter. filename: str or None, optional (default=None) Name of the file (to save). If None, the figure is not saved. display: bool, optional (default=True) Whether to render the plot. Example from atom import ATOMClassifier atom = ATOMClassifier(X, y) atom.run(['LR', 'LDA'], metric='average_precision') atom.LDA.plot_permutation_importance(show=10, n_repeats=7)","title":"plot_permutation_importance"},{"location":"API/plots/plot_permutation_importance/#plot_permutation_importance","text":"method plot_permutation_importance (models=None, show=None, n_repeats=10, title=None, figsize=None, filename=None, display=True) [source] Plot the feature permutation importance of models. Calculating all permutations can be time consuming, especially if n_repeats is high. They are stored under the attribute permutations . This means that if a plot is repeated for the same model with the same n_repeats , it will be considerably faster. The feature_importance attribute is updated with the extracted importance ranking. Parameters: models: str, sequence or None, optional (default=None) Name of the models to plot. If None, all models in the pipeline are selected. show: int, optional (default=None) Number of best features to show in the plot. None to show all. n_repeats: int, optional (default=10) Number of times to permute each feature. title: str or None, optional (default=None) Plot's title. If None, the default option is used. figsize: tuple or None, optional (default=None) Figure's size, format as (x, y). If None, adapts size to show parameter. filename: str or None, optional (default=None) Name of the file (to save). If None, the figure is not saved. 
display: bool, optional (default=True) Whether to render the plot.","title":"plot_permutation_importance"},{"location":"API/plots/plot_permutation_importance/#example","text":"from atom import ATOMClassifier atom = ATOMClassifier(X, y) atom.run(['LR', 'LDA'], metric='average_precision') atom.LDA.plot_permutation_importance(show=10, n_repeats=7)","title":"Example"},{"location":"API/plots/plot_pipeline/","text":"plot_pipeline method plot_pipeline (show_params=True, title=None, figsize=None, filename=None, display=True) [source] Plot a diagram of every estimator in atom 's pipeline. Parameters: show_params: bool, optional (default=True) Whether to show the parameters used for every estimator. title: str or None, optional (default=None) Plot's title. If None, the default option is used. figsize: tuple or None, optional (default=None) Figure's size, format as (x, y). If None, adapts size to the length of the pipeline. filename: str or None, optional (default=None) Name of the file (to save). If None, the figure is not saved. display: bool, optional (default=True) Whether to render the plot. Example from atom import ATOMClassifier atom = ATOMClassifier(X, y) atom.impute(strat_num='median', strat_cat='drop', min_frac_rows=0.8) atom.encode(strategy='LeaveOneOut', max_onehot=8, frac_to_other=0.02) atom.outliers(strategy='drop', max_sigma=4, include_target=False) atom.feature_selection(strategy='PCA', n_features=10, max_frac_repeated=1., max_correlation=0.7) atom.run(['GBM', 'LGB'], metric='recall_weighted', n_calls=(10, 20), n_initial_points=(5, 12), bo_params={'base_estimator': 'RF', 'cv': 1, 'max_time': 1000}, bagging=4) atom.plot_pipeline()","title":"plot_pipeline"},{"location":"API/plots/plot_pipeline/#plot_pipeline","text":"method plot_pipeline (show_params=True, title=None, figsize=None, filename=None, display=True) [source] Plot a diagram of every estimator in atom 's pipeline. 
Parameters: show_params: bool, optional (default=True) Whether to show the parameters used for every estimator. title: str or None, optional (default=None) Plot's title. If None, the default option is used. figsize: tuple or None, optional (default=None) Figure's size, format as (x, y). If None, adapts size to the length of the pipeline. filename: str or None, optional (default=None) Name of the file (to save). If None, the figure is not saved. display: bool, optional (default=True) Whether to render the plot.","title":"plot_pipeline"},{"location":"API/plots/plot_pipeline/#example","text":"from atom import ATOMClassifier atom = ATOMClassifier(X, y) atom.impute(strat_num='median', strat_cat='drop', min_frac_rows=0.8) atom.encode(strategy='LeaveOneOut', max_onehot=8, frac_to_other=0.02) atom.outliers(strategy='drop', max_sigma=4, include_target=False) atom.feature_selection(strategy='PCA', n_features=10, max_frac_repeated=1., max_correlation=0.7) atom.run(['GBM', 'LGB'], metric='recall_weighted', n_calls=(10, 20), n_initial_points=(5, 12), bo_params={'base_estimator': 'RF', 'cv': 1, 'max_time': 1000}, bagging=4) atom.plot_pipeline()","title":"Example"},{"location":"API/plots/plot_prc/","text":"plot_prc method plot_prc (models=None, dataset='test', title=None, figsize=(10, 6), filename=None, display=True) [source] Plot the precision-recall curve. The legend shows the average precision (AP) score. Only for binary classification tasks. Parameters: models: str, sequence or None, optional (default=None) Name of the models to plot. If None, all models in the pipeline are selected. dataset: str, optional (default='test') Data set on which to calculate the metric. Options are 'train', 'test' or 'both'. title: str or None, optional (default=None) Plot's title. If None, the default option is used. figsize: tuple, optional (default=(10, 6)) Figure's size, format as (x, y). filename: str or None, optional (default=None) Name of the file (to save). 
If None, the figure is not saved. display: bool, optional (default=True) Whether to render the plot. Example from atom import ATOMClassifier atom = ATOMClassifier(X, y) atom.run(['LR', 'RF', 'LGB'], metric='average_precision') atom.plot_prc()","title":"plot_prc"},{"location":"API/plots/plot_prc/#plot_prc","text":"method plot_prc (models=None, dataset='test', title=None, figsize=(10, 6), filename=None, display=True) [source] Plot the precision-recall curve. The legend shows the average precision (AP) score. Only for binary classification tasks. Parameters: models: str, sequence or None, optional (default=None) Name of the models to plot. If None, all models in the pipeline are selected. dataset: str, optional (default='test') Data set on which to calculate the metric. Options are 'train', 'test' or 'both'. title: str or None, optional (default=None) Plot's title. If None, the default option is used. figsize: tuple, optional (default=(10, 6)) Figure's size, format as (x, y). filename: str or None, optional (default=None) Name of the file (to save). If None, the figure is not saved. display: bool, optional (default=True) Whether to render the plot.","title":"plot_prc"},{"location":"API/plots/plot_prc/#example","text":"from atom import ATOMClassifier atom = ATOMClassifier(X, y) atom.run(['LR', 'RF', 'LGB'], metric='average_precision') atom.plot_prc()","title":"Example"},{"location":"API/plots/plot_probabilities/","text":"plot_probabilities method plot_probabilities (models=None, dataset='test', target=1, title=None, figsize=(10, 6), filename=None, display=True) [source] Plot the probability distribution of the categories in the target column. Only for classification tasks. Parameters: models: str, sequence or None, optional (default=None) Name of the models to plot. If None, all models in the pipeline are selected. dataset: str, optional (default='test') Data set on which to calculate the metric. Options are 'train', 'test' or 'both'. 
target: int or str, optional (default=1) Probability of being that category in the target column as index or name. Only for multiclass classification tasks. title: str or None, optional (default=None) Plot's title. If None, the default option is used. figsize: tuple, optional (default=(10, 6)) Figure's size, format as (x, y). filename: str or None, optional (default=None) Name of the file (to save). If None, the figure is not saved. display: bool, optional (default=True) Whether to render the plot. Example from atom import ATOMClassifier atom = ATOMClassifier(X, y='RainTomorrow') atom.run('rf') atom.plot_probabilities(target='Yes', filename='probabilities_category_yes')","title":"plot_probabilities"},{"location":"API/plots/plot_probabilities/#plot_probabilities","text":"method plot_probabilities (models=None, dataset='test', target=1, title=None, figsize=(10, 6), filename=None, display=True) [source] Plot the probability distribution of the categories in the target column. Only for classification tasks. Parameters: models: str, sequence or None, optional (default=None) Name of the models to plot. If None, all models in the pipeline are selected. dataset: str, optional (default='test') Data set on which to calculate the metric. Options are 'train', 'test' or 'both'. target: int or str, optional (default=1) Probability of being that category in the target column as index or name. Only for multiclass classification tasks. title: str or None, optional (default=None) Plot's title. If None, the default option is used. figsize: tuple, optional (default=(10, 6)) Figure's size, format as (x, y). filename: str or None, optional (default=None) Name of the file (to save). If None, the figure is not saved.
display: bool, optional (default=True) Whether to render the plot.","title":"plot_probabilities"},{"location":"API/plots/plot_probabilities/#example","text":"from atom import ATOMClassifier atom = ATOMClassifier(X, y='RainTomorrow') atom.run('rf') atom.plot_probabilities(target='Yes', filename='probabilities_category_yes')","title":"Example"},{"location":"API/plots/plot_residuals/","text":"plot_residuals method plot_residuals (models=None, dataset='test', title=None, figsize=(10, 6), filename=None, display=True) [source] The plot shows the residuals (difference between the predicted and the true value) on the vertical axis and the independent variable on the horizontal axis. The gray, intersected line shows the identity line. This plot can be useful to analyze the variance of the error of the regressor. If the points are randomly dispersed around the horizontal axis, a linear regression model is appropriate for the data; otherwise, a non-linear model is more appropriate. Only for regression tasks. Parameters: models: str, sequence or None, optional (default=None) Name of the models to plot. If None, all models in the pipeline are selected. dataset: str, optional (default='test') Data set on which to calculate the metric. Options are 'train', 'test' or 'both'. title: str or None, optional (default=None) Plot's title. If None, the default option is used. figsize: tuple, optional (default=(10, 6)) Figure's size, format as (x, y). filename: str or None, optional (default=None) Name of the file (to save). If None, the figure is not saved. display: bool, optional (default=True) Whether to render the plot.
Example from atom import ATOMRegressor atom = ATOMRegressor(X, y) atom.run(['OLS', 'LGB'], metric='MAE') atom.plot_residuals()","title":"plot_residuals"},{"location":"API/plots/plot_residuals/#plot_residuals","text":"method plot_residuals (models=None, dataset='test', title=None, figsize=(10, 6), filename=None, display=True) [source] The plot shows the residuals (difference between the predicted and the true value) on the vertical axis and the independent variable on the horizontal axis. The gray, intersected line shows the identity line. This plot can be useful to analyze the variance of the error of the regressor. If the points are randomly dispersed around the horizontal axis, a linear regression model is appropriate for the data; otherwise, a non-linear model is more appropriate. Only for regression tasks. Parameters: models: str, sequence or None, optional (default=None) Name of the models to plot. If None, all models in the pipeline are selected. dataset: str, optional (default='test') Data set on which to calculate the metric. Options are 'train', 'test' or 'both'. title: str or None, optional (default=None) Plot's title. If None, the default option is used. figsize: tuple, optional (default=(10, 6)) Figure's size, format as (x, y). filename: str or None, optional (default=None) Name of the file (to save). If None, the figure is not saved. display: bool, optional (default=True) Whether to render the plot.","title":"plot_residuals"},{"location":"API/plots/plot_residuals/#example","text":"from atom import ATOMRegressor atom = ATOMRegressor(X, y) atom.run(['OLS', 'LGB'], metric='MAE') atom.plot_residuals()","title":"Example"},{"location":"API/plots/plot_rfecv/","text":"plot_rfecv method plot_rfecv (title=None, figsize=(10, 6), filename=None, display=True) [source] Plot the RFECV results, i.e. the scores obtained by the estimator fitted on every subset of the dataset. Only available if RFECV was applied on the data. 
Parameters: title: str or None, optional (default=None) Plot's title. If None, the default option is used. figsize: tuple, optional (default=(10, 6)) Figure's size, format as (x, y). filename: str or None, optional (default=None) Name of the file (to save). If None, the figure is not saved. display: bool, optional (default=True) Whether to render the plot. Example from atom import ATOMClassifier atom = ATOMClassifier(X, y) atom.feature_selection(strategy='RFECV', solver='LGB', scoring='precision') atom.plot_rfecv()","title":"plot_rfecv"},{"location":"API/plots/plot_rfecv/#plot_rfecv","text":"method plot_rfecv (title=None, figsize=(10, 6), filename=None, display=True) [source] Plot the RFECV results, i.e. the scores obtained by the estimator fitted on every subset of the dataset. Only available if RFECV was applied on the data. Parameters: title: str or None, optional (default=None) Plot's title. If None, the default option is used. figsize: tuple, optional (default=(10, 6)) Figure's size, format as (x, y). filename: str or None, optional (default=None) Name of the file (to save). If None, the figure is not saved. display: bool, optional (default=True) Whether to render the plot.","title":"plot_rfecv"},{"location":"API/plots/plot_rfecv/#example","text":"from atom import ATOMClassifier atom = ATOMClassifier(X, y) atom.feature_selection(strategy='RFECV', solver='LGB', scoring='precision') atom.plot_rfecv()","title":"Example"},{"location":"API/plots/plot_roc/","text":"plot_roc method plot_roc (models=None, dataset='test', title=None, figsize=(10, 6), filename=None, display=True) [source] Plot the Receiver Operating Characteristics curve. The legend shows the Area Under the ROC Curve (AUC) score. Only for binary classification tasks. Parameters: models: str, sequence or None, optional (default=None) Name of the models to plot. If None, all models in the pipeline are selected. dataset: str, optional (default='test') Data set on which to calculate the metric. 
Options are 'train', 'test' or 'both'. title: str or None, optional (default=None) Plot's title. If None, the default option is used. figsize: tuple, optional (default=(10, 6)) Figure's size, format as (x, y). filename: str or None, optional (default=None) Name of the file (to save). If None, the figure is not saved. display: bool, optional (default=True) Whether to render the plot. Example from atom import ATOMClassifier atom = ATOMClassifier(X, y) atom.run(['LR', 'RF', 'LGB'], metric='roc_auc') atom.plot_roc(filename='roc_curve.png')","title":"plot_roc"},{"location":"API/plots/plot_roc/#plot_roc","text":"method plot_roc (models=None, dataset='test', title=None, figsize=(10, 6), filename=None, display=True) [source] Plot the Receiver Operating Characteristics curve. The legend shows the Area Under the ROC Curve (AUC) score. Only for binary classification tasks. Parameters: models: str, sequence or None, optional (default=None) Name of the models to plot. If None, all models in the pipeline are selected. dataset: str, optional (default='test') Data set on which to calculate the metric. Options are 'train', 'test' or 'both'. title: str or None, optional (default=None) Plot's title. If None, the default option is used. figsize: tuple, optional (default=(10, 6)) Figure's size, format as (x, y). filename: str or None, optional (default=None) Name of the file (to save). If None, the figure is not saved. display: bool, optional (default=True) Whether to render the plot.","title":"plot_roc"},{"location":"API/plots/plot_roc/#example","text":"from atom import ATOMClassifier atom = ATOMClassifier(X, y) atom.run(['LR', 'RF', 'LGB'], metric='roc_auc') atom.plot_roc(filename='roc_curve.png')","title":"Example"},{"location":"API/plots/plot_successive_halving/","text":"plot_successive_halving method plot_successive_halving (models=None, metric=0, title=None, figsize=(10, 6), filename=None, display=True) [source] Plot of the models' scores per iteration of the successive halving. 
Only available if the models were fitted using successive halving . Parameters: models: str, sequence or None, optional (default=None) Name of the models to plot. If None, all the models in the pipeline are selected. metric: int or str, optional (default=0) Index or name of the metric to plot. Only for multi-metric runs. title: str or None, optional (default=None) Plot's title. If None, the default option is used. figsize: tuple, optional (default=(10, 6)) Figure's size, format as (x, y). filename: str or None, optional (default=None) Name of the file (to save). If None, the figure is not saved. display: bool, optional (default=True) Whether to render the plot. Example from atom import ATOMRegressor atom = ATOMRegressor(X, y) atom.successive_halving(['tree', 'bag', 'adab', 'et', 'rf', 'gbm', 'xgb', 'lgb'], metric='mse') atom.plot_successive_halving()","title":"plot_successive_halving"},{"location":"API/plots/plot_successive_halving/#plot_successive_halving","text":"method plot_successive_halving (models=None, metric=0, title=None, figsize=(10, 6), filename=None, display=True) [source] Plot of the models' scores per iteration of the successive halving. Only available if the models were fitted using successive halving . Parameters: models: str, sequence or None, optional (default=None) Name of the models to plot. If None, all the models in the pipeline are selected. metric: int or str, optional (default=0) Index or name of the metric to plot. Only for multi-metric runs. title: str or None, optional (default=None) Plot's title. If None, the default option is used. figsize: tuple, optional (default=(10, 6)) Figure's size, format as (x, y). filename: str or None, optional (default=None) Name of the file (to save). If None, the figure is not saved. 
display: bool, optional (default=True) Whether to render the plot.","title":"plot_successive_halving"},{"location":"API/plots/plot_successive_halving/#example","text":"from atom import ATOMRegressor atom = ATOMRegressor(X, y) atom.successive_halving(['tree', 'bag', 'adab', 'et', 'rf', 'gbm', 'xgb', 'lgb'], metric='mse') atom.plot_successive_halving()","title":"Example"},{"location":"API/plots/plot_threshold/","text":"plot_threshold method plot_threshold (models=None, metric=None, dataset='test', steps=100, title=None, figsize=(10, 6), filename=None, display=True) [source] Plot a metric's performance against threshold values. Only for binary classification tasks. Parameters: models: str, sequence or None, optional (default=None) Name of the models to plot. If None, all models in the pipeline are selected. metric: str, callable, sequence or None, optional (default=None) Metric(s) to plot. These can be one of sklearn's pre-defined scorers, a metric function or a sklearn scorer object (see the user guide ). If None, the metric used to run the pipeline is used. dataset: str, optional (default='test') Data set on which to calculate the metric. Options are 'train', 'test' or 'both'. steps: int, optional (default=100) Number of thresholds measured. title: str or None, optional (default=None) Plot's title. If None, the default option is used. figsize: tuple, optional (default=(10, 6)) Figure's size, format as (x, y). filename: str or None, optional (default=None) Name of the file (to save). If None, the figure is not saved. display: bool, optional (default=True) Whether to render the plot. 
Example from atom import ATOMClassifier from sklearn.metrics import recall_score atom = ATOMClassifier(X, y) atom.run('LGB') atom.plot_threshold(metric=['accuracy', 'f1', recall_score])","title":"plot_threshold"},{"location":"API/plots/plot_threshold/#plot_threshold","text":"method plot_threshold (models=None, metric=None, dataset='test', steps=100, title=None, figsize=(10, 6), filename=None, display=True) [source] Plot a metric's performance against threshold values. Only for binary classification tasks. Parameters: models: str, sequence or None, optional (default=None) Name of the models to plot. If None, all models in the pipeline are selected. metric: str, callable, sequence or None, optional (default=None) Metric(s) to plot. These can be one of sklearn's pre-defined scorers, a metric function or a sklearn scorer object (see the user guide ). If None, the metric used to run the pipeline is used. dataset: str, optional (default='test') Data set on which to calculate the metric. Options are 'train', 'test' or 'both'. steps: int, optional (default=100) Number of thresholds measured. title: str or None, optional (default=None) Plot's title. If None, the default option is used. figsize: tuple, optional (default=(10, 6)) Figure's size, format as (x, y). filename: str or None, optional (default=None) Name of the file (to save). If None, the figure is not saved. display: bool, optional (default=True) Whether to render the plot.","title":"plot_threshold"},{"location":"API/plots/plot_threshold/#example","text":"from atom import ATOMClassifier from sklearn.metrics import recall_score atom = ATOMClassifier(X, y) atom.run('LGB') atom.plot_threshold(metric=['accuracy', 'f1', recall_score])","title":"Example"},{"location":"API/plots/summary_plot/","text":"summary_plot method summary_plot (models=None, show=None, target=1, title=None, figsize=None, filename=None, display=True, **kwargs) [source] Plot SHAP's summary plot. 
Create a SHAP beeswarm plot, colored by feature values when they are provided. The explainer will be chosen automatically based on the model's type. Read more about SHAP plots in the user guide . Parameters: models: str, sequence or None, optional (default=None) Name of the models to plot. If None, all models in the pipeline are selected. Note that selecting multiple models will raise an exception. To avoid this, call the plot from a model . show: int or None, optional (default=None) Number of features to show in the plot. None to show all. target: int or str, optional (default=1) Category to look at in the target class as index or name. Only for multi-class classification tasks. title: str or None, optional (default=None) Plot's title. If None, the default option is used. figsize: tuple or None, optional (default=None) Figure's size, format as (x, y). If None, adapts size to show parameter. filename: str or None, optional (default=None) Name of the file (to save). If None, the figure is not saved. display: bool, optional (default=True) Whether to render the plot. **kwargs Additional keyword arguments for shap's summary_plot. Example from atom import ATOMRegressor atom = ATOMRegressor(X, y) atom.run('RF') atom.summary_plot(show=11)","title":"summary_plot"},{"location":"API/plots/summary_plot/#summary_plot","text":"method summary_plot (models=None, show=None, target=1, title=None, figsize=None, filename=None, display=True, **kwargs) [source] Plot SHAP's summary plot. Create a SHAP beeswarm plot, colored by feature values when they are provided. The explainer will be chosen automatically based on the model's type. Read more about SHAP plots in the user guide . Parameters: models: str, sequence or None, optional (default=None) Name of the models to plot. If None, all models in the pipeline are selected. Note that selecting multiple models will raise an exception. To avoid this, call the plot from a model . 
show: int or None, optional (default=None) Number of features to show in the plot. None to show all. target: int or str, optional (default=1) Category to look at in the target class as index or name. Only for multi-class classification tasks. title: str or None, optional (default=None) Plot's title. If None, the default option is used. figsize: tuple or None, optional (default=None) Figure's size, format as (x, y). If None, adapts size to show parameter. filename: str or None, optional (default=None) Name of the file (to save). If None, the figure is not saved. display: bool, optional (default=True) Whether to render the plot. **kwargs Additional keyword arguments for shap's summary_plot.","title":"summary_plot"},{"location":"API/plots/summary_plot/#example","text":"from atom import ATOMRegressor atom = ATOMRegressor(X, y) atom.run('RF') atom.summary_plot(show=11)","title":"Example"},{"location":"API/predicting/decision_function/","text":"decision_function method decision_function (X, verbose=None, **kwargs) [source] Transform the data and evaluate the decision function on new data. If called from a training instance, it will use the best model in the pipeline (under the winner attribute). If called from a model , it will use that model. The estimator must have a decision_function method. Parameters: X: dict, sequence, np.array or pd.DataFrame Data containing the features, with shape=(n_samples, n_features). verbose: int or None, optional (default=None) Verbosity level of the output. If None, it uses ATOM's verbosity. Possible values are: 0 to not print anything. 1 to print basic information. 2 to print detailed information. **kwargs Same keyword arguments as the transform method to include/exclude transformers from the transformations. 
Example from atom import ATOMClassifier atom = ATOMClassifier(X, y) atom.run('kSVM', metric='accuracy') # Evaluate the decision function on new data predictions = atom.ksvm.decision_function(X_new)","title":"decision_function"},{"location":"API/predicting/decision_function/#decision_function","text":"method decision_function (X, verbose=None, **kwargs) [source] Transform the data and evaluate the decision function on new data. If called from a training instance, it will use the best model in the pipeline (under the winner attribute). If called from a model , it will use that model. The estimator must have a decision_function method. Parameters: X: dict, sequence, np.array or pd.DataFrame Data containing the features, with shape=(n_samples, n_features). verbose: int or None, optional (default=None) Verbosity level of the output. If None, it uses ATOM's verbosity. Possible values are: 0 to not print anything. 1 to print basic information. 2 to print detailed information. **kwargs Same keyword arguments as the transform method to include/exclude transformers from the transformations.","title":"decision_function"},{"location":"API/predicting/decision_function/#example","text":"from atom import ATOMClassifier atom = ATOMClassifier(X, y) atom.run('kSVM', metric='accuracy') # Evaluate the decision function on new data predictions = atom.ksvm.decision_function(X_new)","title":"Example"},{"location":"API/predicting/predict/","text":"predict method predict (X, verbose=None, **kwargs) [source] Transform the data and make predictions on new data. If called from a training instance, it will use the best model in the pipeline (under the winner attribute). If called from a model , it will use that model. The estimator must have a predict method. Parameters: X: dict, sequence, np.array or pd.DataFrame Data containing the features, with shape=(n_samples, n_features). verbose: int or None, optional (default=None) Verbosity level of the output. If None, it uses ATOM's verbosity. 
Possible values are: 0 to not print anything. 1 to print basic information. 2 to print detailed information. **kwargs Same keyword arguments as the transform method to include/exclude transformers from the transformations. Example from atom import ATOMClassifier atom = ATOMClassifier(X, y) atom.run(['Tree', 'AdaB'], metric='AP', n_calls=10) # Make predictions on new data predictions = atom.adab.predict(X_new)","title":"predict"},{"location":"API/predicting/predict/#predict","text":"method predict (X, verbose=None, **kwargs) [source] Transform the data and make predictions on new data. If called from a training instance, it will use the best model in the pipeline (under the winner attribute). If called from a model , it will use that model. The estimator must have a predict method. Parameters: X: dict, sequence, np.array or pd.DataFrame Data containing the features, with shape=(n_samples, n_features). verbose: int or None, optional (default=None) Verbosity level of the output. If None, it uses ATOM's verbosity. Possible values are: 0 to not print anything. 1 to print basic information. 2 to print detailed information. **kwargs Same keyword arguments as the transform method to include/exclude transformers from the transformations.","title":"predict"},{"location":"API/predicting/predict/#example","text":"from atom import ATOMClassifier atom = ATOMClassifier(X, y) atom.run(['Tree', 'AdaB'], metric='AP', n_calls=10) # Make predictions on new data predictions = atom.adab.predict(X_new)","title":"Example"},{"location":"API/predicting/predict_log_proba/","text":"predict_log_proba method predict_log_proba (X, verbose=None, **kwargs) [source] Transform the data and make logarithmic probability predictions on new data. If called from a training instance, it will use the best model in the pipeline (under the winner attribute). If called from a model , it will use that model. The estimator must have a predict_proba method. 
Parameters: X: dict, sequence, np.array or pd.DataFrame Data containing the features, with shape=(n_samples, n_features). verbose: int or None, optional (default=None) Verbosity level of the output. If None, it uses ATOM's verbosity. Possible values are: 0 to not print anything. 1 to print basic information. 2 to print detailed information. **kwargs Same keyword arguments as the transform method to include/exclude transformers from the transformations. Example from atom import ATOMClassifier atom = ATOMClassifier(X, y) atom.run(['Tree', 'AdaB'], metric='AP', n_calls=10) # Make predictions on new data predictions = atom.adab.predict_log_proba(X_new)","title":"predict_log_proba"},{"location":"API/predicting/predict_log_proba/#predict_log_proba","text":"method predict_log_proba (X, verbose=None, **kwargs) [source] Transform the data and make logarithmic probability predictions on new data. If called from a training instance, it will use the best model in the pipeline (under the winner attribute). If called from a model , it will use that model. The estimator must have a predict_proba method. Parameters: X: dict, sequence, np.array or pd.DataFrame Data containing the features, with shape=(n_samples, n_features). verbose: int or None, optional (default=None) Verbosity level of the output. If None, it uses ATOM's verbosity. Possible values are: 0 to not print anything. 1 to print basic information. 2 to print detailed information. 
**kwargs Same keyword arguments as the transform method to include/exclude transformers from the transformations.","title":"predict_log_proba"},{"location":"API/predicting/predict_log_proba/#example","text":"from atom import ATOMClassifier atom = ATOMClassifier(X, y) atom.run(['Tree', 'AdaB'], metric='AP', n_calls=10) # Make predictions on new data predictions = atom.adab.predict_log_proba(X_new)","title":"Example"},{"location":"API/predicting/predict_proba/","text":"predict_proba method predict_proba (X, verbose=None, **kwargs) [source] Transform the data and make probabilistic predictions on new data. If called from a training instance, it will use the best model in the pipeline (under the winner attribute). If called from a model , it will use that model. The estimator must have a predict_proba method. Parameters: X: dict, sequence, np.array or pd.DataFrame Data containing the features, with shape=(n_samples, n_features). verbose: int or None, optional (default=None) Verbosity level of the output. If None, it uses ATOM's verbosity. Possible values are: 0 to not print anything. 1 to print basic information. 2 to print detailed information. **kwargs Same keyword arguments as the transform method to include/exclude transformers from the transformations. Example from atom import ATOMClassifier atom = ATOMClassifier(X, y) atom.run(['Tree', 'AdaB'], metric='AP', n_calls=10) # Make predictions on new data predictions = atom.adab.predict_proba(X_new)","title":"predict_proba"},{"location":"API/predicting/predict_proba/#predict_proba","text":"method predict_proba (X, verbose=None, **kwargs) [source] Transform the data and make probabilistic predictions on new data. If called from a training instance, it will use the best model in the pipeline (under the winner attribute). If called from a model , it will use that model. The estimator must have a predict_proba method. 
Parameters: X: dict, sequence, np.array or pd.DataFrame Data containing the features, with shape=(n_samples, n_features). verbose: int or None, optional (default=None) Verbosity level of the output. If None, it uses ATOM's verbosity. Possible values are: 0 to not print anything. 1 to print basic information. 2 to print detailed information. **kwargs Same keyword arguments as the transform method to include/exclude transformers from the transformations.","title":"predict_proba"},{"location":"API/predicting/predict_proba/#example","text":"from atom import ATOMClassifier atom = ATOMClassifier(X, y) atom.run(['Tree', 'AdaB'], metric='AP', n_calls=10) # Make predictions on new data predictions = atom.adab.predict_proba(X_new)","title":"Example"},{"location":"API/predicting/score/","text":"score method score (X, y, verbose=None, **kwargs) [source] Transform the data and return the model's score on new data. The score is a default evaluation criterion for the problem the estimator is designed to solve, defined by the estimator's package. If called from a training instance, it will use the best model in the pipeline (under the winner attribute). If called from a model , it will use that model. The estimator must have a score method. Parameters: X: dict, sequence, np.array or pd.DataFrame Data containing the features, with shape=(n_samples, n_features). y: int, str, sequence, np.array or pd.Series If int: Position of the target column in X. If string: Name of the target column in X. Else: Target column with shape=(n_samples,). verbose: int or None, optional (default=None) Verbosity level of the output. If None, it uses ATOM's verbosity. Possible values are: 0 to not print anything. 1 to print basic information. 2 to print detailed information. **kwargs Same keyword arguments as the transform method to include/exclude transformers from the transformations. Note The returned metric is determined by each estimator's score method pre-defined by its respective package. 
See its corresponding documentation for further details. Example from atom import ATOMClassifier atom = ATOMClassifier(X, y) atom.run(['MNB', 'KNN', 'kSVM'], metric='precision') # Get the mean accuracy on new data predictions = atom.kSVM.score(X_new, y_new)","title":"score"},{"location":"API/predicting/score/#score","text":"method score (X, y, verbose=None, **kwargs) [source] Transform the data and return the model's score on new data. The score is a default evaluation criterion for the problem the estimator is designed to solve, defined by the estimator's package. If called from a training instance, it will use the best model in the pipeline (under the winner attribute). If called from a model , it will use that model. The estimator must have a score method. Parameters: X: dict, sequence, np.array or pd.DataFrame Data containing the features, with shape=(n_samples, n_features). y: int, str, sequence, np.array or pd.Series If int: Position of the target column in X. If string: Name of the target column in X. Else: Target column with shape=(n_samples,). verbose: int or None, optional (default=None) Verbosity level of the output. If None, it uses ATOM's verbosity. Possible values are: 0 to not print anything. 1 to print basic information. 2 to print detailed information. **kwargs Same keyword arguments as the transform method to include/exclude transformers from the transformations. Note The returned metric is determined by each estimator's score method pre-defined by its respective package. 
See its corresponding documentation for further details.","title":"score"},{"location":"API/predicting/score/#example","text":"from atom import ATOMClassifier atom = ATOMClassifier(X, y) atom.run(['MNB', 'KNN', 'kSVM'], metric='precision') # Get the mean accuracy on new data predictions = atom.kSVM.score(X_new, y_new)","title":"Example"},{"location":"API/predicting/transform/","text":"transform method transform (X, y=None, verbose=None, **kwargs) [source] Transform new data through all the pre-processing steps in the pipeline. By default, all transformers are included except outliers and balance since they should only be applied on the training set. Can only be called from atom . Parameters: X: dict, sequence, np.array or pd.DataFrame Features to transform, with shape=(n_samples, n_features). y: int, str, sequence, np.array, pd.Series or None, optional (default=None) If None: y is ignored in the transformers. If int: Position of the target column in X. If string: Name of the target column in X. Else: Target column with shape=(n_samples,). verbose: int or None, optional (default=None) Verbosity level of the output. If None, it uses ATOM's verbosity. Possible values are: 0 to not print anything. 1 to print basic information. 2 to print detailed information. **kwargs Additional keyword arguments to customize which transformers to apply. You can either select them including their index in the pipeline parameter, e.g. pipeline=[0, 1, 4] or include/exclude them individually using their methods, e.g. impute=True or feature_selection=False . Note When using the pipeline parameter to include/exclude transformers, remember that the first transformer (index 0) in atom 's pipeline is always the StandardCleaner called during initialization. 
Example from atom import ATOMClassifier atom = ATOMClassifier(X, y) atom.impute(strat_num='knn', strat_cat='drop') atom.outliers(strategy='min_max', max_sigma=2) atom.feature_generation(strategy='gfg', n_features=3, generations=10, population=1000) # Apply only the StandardCleaner and Imputer on new data X_transformed = atom.transform(X_new, pipeline=[0, 1])","title":"transform"},{"location":"API/predicting/transform/#transform","text":"method transform (X, y=None, verbose=None, **kwargs) [source] Transform new data through all the pre-processing steps in the pipeline. By default, all transformers are included except outliers and balance since they should only be applied on the training set. Can only be called from atom . Parameters: X: dict, sequence, np.array or pd.DataFrame Features to transform, with shape=(n_samples, n_features). y: int, str, sequence, np.array, pd.Series or None, optional (default=None) If None: y is ignored in the transformers. If int: Position of the target column in X. If string: Name of the target column in X. Else: Target column with shape=(n_samples,). verbose: int or None, optional (default=None) Verbosity level of the output. If None, it uses ATOM's verbosity. Possible values are: 0 to not print anything. 1 to print basic information. 2 to print detailed information. **kwargs Additional keyword arguments to customize which transformers to apply. You can either select them including their index in the pipeline parameter, e.g. pipeline=[0, 1, 4] or include/exclude them individually using their methods, e.g. impute=True or feature_selection=False . 
Note When using the pipeline parameter to include/exclude transformers, remember that the first transformer (index 0) in atom 's pipeline is always the StandardCleaner called during initialization.","title":"transform"},{"location":"API/predicting/transform/#example","text":"from atom import ATOMClassifier atom = ATOMClassifier(X, y) atom.impute(strat_num='knn', strat_cat='drop') atom.outliers(strategy='min_max', max_sigma=2) atom.feature_generation(strategy='gfg', n_features=3, generations=10, population=1000) # Apply only the StandardCleaner and Imputer on new data X_transformed = atom.transform(X_new, pipeline=[0, 1])","title":"Example"},{"location":"API/training/successivehalvingclassifier/","text":"SuccessiveHalvingClassifier class atom.training. SuccessiveHalvingClassifier (models, metric=None, greater_is_better=True, needs_proba=False, needs_threshold=False, skip_iter=0, n_calls=10, n_random_points=5, bo_params={}, bagging=None, n_jobs=1, verbose=0, logger=None, random_state=None) [source] Fit and evaluate the models in a successive halving fashion. The pipeline applies the following steps per iteration: The optimal hyperparameters are selected using a bayesian optimization algorithm. The model is fitted on the complete training set using the best combinations of hyperparameters found. Using a bagging algorithm, various scores on the test set are calculated. Just like atom , you can predict , plot and call any model from the SuccessiveHalvingClassifier instance. Read more in the user guide . Parameters: models: str or sequence Models to fit on the data. Use the predefined acronyms to select the models. 
Possible values are (case insensitive): 'GP' for Gaussian Process (no hyperparameter tuning) 'GNB' for Gaussian Naive Bayes (no hyperparameter tuning) 'MNB' for Multinomial Naive Bayes 'BNB' for Bernoulli Naive Bayes 'Ridge' for Ridge Linear Classification 'LR' for Logistic Regression 'LDA' for Linear Discriminant Analysis 'QDA' for Quadratic Discriminant Analysis 'KNN' for K-Nearest Neighbors 'Tree' for a single Decision Tree 'Bag' for Bagging (uses a decision tree as base estimator) 'ET' for Extra-Trees 'RF' for Random Forest 'AdaB' for AdaBoost (uses a decision tree as base estimator) 'GBM' for Gradient Boosting Machine 'XGB' for XGBoost (only available if package is installed) 'LGB' for LightGBM (only available if package is installed) 'CatB' for CatBoost (only available if package is installed) 'lSVM' for Linear Support Vector Machine (uses a one-vs-rest strategy for multiclass classification) 'kSVM' for Kernel Support Vector Machine (uses a one-vs-one strategy for multiclass classification) 'PA' for Passive Aggressive 'SGD' for Stochastic Gradient Descent 'MLP' for Multilayer Perceptron (can have between one and three hidden layers) metric: str, callable or sequence, optional (default=None) Metric(s) on which the pipeline fits the models. Choose from any of sklearn's predefined scorers , use a score (or loss) function with signature metric(y, y_pred, **kwargs) or use a scorer object. If multiple metrics are selected, only the first will be used to optimize the BO. If None, a default metric is selected: 'f1' for binary classification 'f1_weighted' for multiclass classification 'r2' for regression greater_is_better: bool or sequence, optional (default=True) Whether the metric is a score function or a loss function, i.e. if True, a higher score is better and if False, lower is better. Will be ignored if the metric is a string or a scorer. If sequence, the n-th value will apply to the n-th metric in the pipeline. 
needs_proba: bool or sequence, optional (default=False) Whether the metric function requires probability estimates out of a classifier. If True, make sure that every model in the pipeline has a predict_proba method. Will be ignored if the metric is a string or a scorer. If sequence, the n-th value will apply to the n-th metric in the pipeline. needs_threshold: bool or sequence, optional (default=False) Whether the metric function takes a continuous decision certainty. This only works for binary classification using estimators that have either a decision_function or predict_proba method. Will be ignored if the metric is a string or a scorer. If sequence, the n-th value will apply to the n-th metric in the pipeline. skip_iter: int, optional (default=0) Skip last skip_iter iterations of the successive halving. n_calls: int or sequence, optional (default=0) Maximum number of iterations of the BO (including n_initial_points ). If 0, skip the BO and fit the model on its default parameters. If sequence, the n-th value will apply to the n-th model in the pipeline. n_initial_points: int or sequence, optional (default=5) Initial number of random tests of the BO before fitting the surrogate function. If equal to n_calls , the optimizer will technically be performing a random search. If sequence, the n-th value will apply to the n-th model in the pipeline. bo_params: dict, optional (default={}) Dictionary of extra keyword arguments for the BO. These can include: base_estimator: str, optional (default='GP') Base estimator to use in the BO. Choose from: 'GP' for Gaussian Process 'RF' for Random Forest 'ET' for Extra-Trees 'GBRT' for Gradient Boosted Regression Trees max_time: int, optional (default=np.inf) Stop the optimization after max_time seconds. delta_x: int or float, optional (default=0) Stop the optimization when |x1 - x2| < delta_x . delta_y: int or float, optional (default=0) Stop the optimization if the 5 minima are within delta_y . 
cv: int, optional (default=5) Number of folds for the cross-validation. If 1, the training set will be randomly split in a subtrain and validation set. early stopping: int, float or None, optional (default=None) Training will stop if the model didn't improve in last early_stopping rounds. If <1, fraction of rounds from the total. If None, no early stopping is performed. Only available for models that allow in-training evaluation. callback: callable or list of callables, optional (default=None) Callbacks for the BO. dimensions: dict, array or None, optional (default=None) Custom hyperparameter space for the bayesian optimization. Can be an array (only if there is 1 model in the pipeline) or a dictionary with the model's name as key. If None, ATOM's predefined dimensions are used. plot_bo: bool, optional (default=False) Whether to plot the BO's progress as it runs. Creates a canvas with two plots: the first plot shows the score of every trial and the second shows the distance between the last consecutive steps. Don't forget to call %matplotlib at the start of the cell if you are using an interactive notebook! Additional keyword argument for skopt's optimizer. bagging: int or None, optional (default=None) Number of data sets (bootstrapped from the training set) to use in the bagging algorithm. If None or 0, no bagging is performed. If sequence, the n-th value will apply to the n-th model in the pipeline. n_jobs: int, optional (default=1) Number of cores to use for parallel processing. If >0: Number of cores to use. If -1: Use all available cores. If <-1: Use available_cores - 1 + n_jobs. Beware that using multiple processes on the same machine may cause memory issues for large datasets. verbose: int, optional (default=0) Verbosity level of the class. Possible values are: 0 to not print anything. 1 to print basic information. 2 to print detailed information. logger: bool, str, class or None, optional (default=None) If None: Doesn't save a logging file. 
If bool: True for logging file with default name. False for no logger. If str: Name of the logging file. 'auto' to create an automatic name. If class: python Logger object. random_state: int or None, optional (default=None) Seed used by the random number generator. If None, the random number generator is the RandomState instance used by numpy.random . Attributes Data attributes The instance's dataset can be accessed through multiple attributes, e.g. calling trainer.train will return the training set. The data can also be changed through these attributes, e.g. trainer.test = trainer.test.drop(0) will drop the first row from the test set. Doing this will automatically update the other data attributes. Attributes: dataset: pd.DataFrame Complete dataset in the pipeline. train: pd.DataFrame Training set. test: pd.DataFrame Test set. X: pd.DataFrame Feature set. y: pd.Series Target column. X_train: pd.DataFrame Training features. y_train: pd.Series Training target. X_test: pd.DataFrame Test features. y_test: pd.Series Test target. shape: tuple Dataset's shape in the form (rows x columns). columns: list List of columns in the dataset. target: str Name of the target column. categories: list Sorted list of the unique categories in the target column. n_categories: int Number of unique categories in the target column. Utility attributes Attributes: models: list List of models in the pipeline. metric: str or list Metric(s) used to fit the models. errors: dict Dictionary of the encountered exceptions (if any). winner: model Model subclass that performed best on the test set. results: pd.DataFrame Dataframe of the training results with the model acronyms as index. Columns can include: name: Name of the model. metric_bo: Best score achieved during the BO. time_bo: Time spent on the BO. metric_train: Metric score on the training set. metric_test: Metric score on the test set. time_fit: Time spent fitting and evaluating. mean_bagging: Mean score of the bagging's results. 
std_bagging: Standard deviation score of the bagging's results. time_bagging: Time spent on the bagging algorithm. time: Total time spent on the whole run. Plot attributes Attributes: style: str Plotting style. See seaborn's documentation . palette: str Color palette. See seaborn's documentation . title_fontsize: int Fontsize for the plot's title. label_fontsize: int Fontsize for labels and legends. tick_fontsize: int Fontsize for the ticks along the plot's axes. Methods calibrate Calibrate the winning model. clear Remove a model from the pipeline. get_params Get parameters for this estimator. log Save information to the logger and print to stdout. run Fit and evaluate the models. save Save the instance to a pickle file. scoring Print the scoring of the models for a specific metric. set_params Set the parameters of this estimator. method calibrate (**kwargs) [source] Applies probability calibration on the winning model. The calibration is done with the CalibratedClassifierCV class from sklearn. The model will be trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the estimator attribute. After calibrating, all prediction attributes of the winning model will reset. Parameters: **kwargs Additional keyword arguments for the CalibratedClassifierCV instance. Using cv='prefit' will use the trained model and fit the calibrator on the test set. Note that doing this will result in data leakage in the test set. Use this only if you have another, independent set for testing. method clear (models='all') [source] Removes all traces of a model in the pipeline (except for the errors attribute). If all models in the pipeline are removed, the metric is reset. Use this method to remove unwanted models from the pipeline or to clear memory before saving the instance. Parameters: models: str or sequence, optional (default='all') Model(s) to clear from the pipeline. If 'all', clear all models. 
method get_params (deep=True) [source] Get parameters for this estimator. Parameters: deep: bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators. Returns: params: dict Dictionary of the parameter names mapped to their values. method log (msg, level=0) [source] Write a message to the logger and print it to stdout. Parameters: msg: str Message to write to the logger and print to stdout. level: int, optional (default=0) Minimum verbosity level in order to print the message. method run (*arrays) [source] Fit and evaluate the models. Parameters: *arrays: array-like Either a train and test set or X_train, X_test, y_train, y_test. method save (filename=None, save_data=True) [source] Save the instance to a pickle file. Remember that the class contains the complete dataset as property, so the file can become large for big datasets! To avoid this, use save_data=False . Parameters: filename: str or None, optional (default=None) Name to save the file with. If None or 'auto', use default name (SuccessiveHalvingClassifier). save_data: bool, optional (default=True) Whether to save the data as an attribute of the instance. If False, remember to update the data immediately after loading the pickle using the dataset's @setter . method scoring (metric=None, dataset='test') [source] Print the scoring of the models for a specific metric. If a model shows a XXX , it means the metric failed for that specific model. This can happen if either the metric is unavailable for the task or if the model does not have a predict_proba method while the metric requires it. Parameters: metric: str or None, optional (default=None) Name of the metric to calculate. Choose from any of sklearn's SCORERS or one of the following custom metrics: 'cm' or 'confusion_matrix' for an array of the confusion matrix. 'tn' for true negatives. 'fp' for false positives. 'fn' for false negatives. 'lift' for the lift metric. 
'fpr' for the false positive rate. 'tpr' for true positive rate. 'sup' for the support metric. If None, returns the final results for the model (ignores the dataset parameter). dataset: str, optional (default='test') Data set on which to calculate the metric. Options are 'train' or 'test'. method set_params (**params) [source] Set the parameters of this estimator. Parameters: **params: dict Estimator parameters. Returns: self: SuccessiveHalvingClassifier Estimator instance. Example from atom.training import SuccessiveHalvingClassifier # Run the pipeline trainer = SuccessiveHalvingClassifier(['Tree', 'Bag', 'RF', 'ET'], n_calls=5, n_initial_points=3) trainer.run(train, test) # Analyze the results trainer.plot_successive_halving()","title":"SuccessiveHalvingClassifier"},{"location":"API/training/successivehalvingclassifier/#successivehalvingclassifier","text":"class atom.training. SuccessiveHalvingClassifier (models, metric=None, greater_is_better=True, needs_proba=False, needs_threshold=False, skip_iter=0, n_calls=10, n_random_points=5, bo_params={}, bagging=None, n_jobs=1, verbose=0, logger=None, random_state=None) [source] Fit and evaluate the models in a successive halving fashion. The pipeline applies the following steps per iteration: The optimal hyperparameters are selected using a bayesian optimization algorithm. The model is fitted on the complete training set using the best combinations of hyperparameters found. Using a bagging algorithm, various scores on the test set are calculated. Just like atom , you can predict , plot and call any model from the SuccessiveHalvingClassifier instance. Read more in the user guide . Parameters: models: str or sequence Models to fit on the data. Use the predefined acronyms to select the models. 
Possible values are (case insensitive): 'GP' for Gaussian Process (no hyperparameter tuning) 'GNB' for Gaussian Naive Bayes (no hyperparameter tuning) 'MNB' for Multinomial Naive Bayes 'BNB' for Bernoulli Naive Bayes 'Ridge' for Ridge Linear Classification 'LR' for Logistic Regression 'LDA' for Linear Discriminant Analysis 'QDA' for Quadratic Discriminant Analysis 'KNN' for K-Nearest Neighbors 'Tree' for a single Decision Tree 'Bag' for Bagging (uses a decision tree as base estimator) 'ET' for Extra-Trees 'RF' for Random Forest 'AdaB' for AdaBoost (uses a decision tree as base estimator) 'GBM' for Gradient Boosting Machine 'XGB' for XGBoost (only available if package is installed) 'LGB' for LightGBM (only available if package is installed) 'CatB' for CatBoost (only available if package is installed) 'lSVM' for Linear Support Vector Machine (uses a one-vs-rest strategy for multiclass classification) 'kSVM' for Kernel Support Vector Machine (uses a one-vs-one strategy for multiclass classification) 'PA' for Passive Aggressive 'SGD' for Stochastic Gradient Descent 'MLP' for Multilayer Perceptron (can have between one and three hidden layers) metric: str, callable or sequence, optional (default=None) Metric(s) on which the pipeline fits the models. Choose from any of sklearn's predefined scorers , use a score (or loss) function with signature metric(y, y_pred, **kwargs) or use a scorer object. If multiple metrics are selected, only the first will be used to optimize the BO. If None, a default metric is selected: 'f1' for binary classification 'f1_weighted' for multiclass classification 'r2' for regression greater_is_better: bool or sequence, optional (default=True) Whether the metric is a score function or a loss function, i.e. if True, a higher score is better and if False, lower is better. Will be ignored if the metric is a string or a scorer. If sequence, the n-th value will apply to the n-th metric in the pipeline. 
needs_proba: bool or sequence, optional (default=False) Whether the metric function requires probability estimates out of a classifier. If True, make sure that every model in the pipeline has a predict_proba method. Will be ignored if the metric is a string or a scorer. If sequence, the n-th value will apply to the n-th metric in the pipeline. needs_threshold: bool or sequence, optional (default=False) Whether the metric function takes a continuous decision certainty. This only works for binary classification using estimators that have either a decision_function or predict_proba method. Will be ignored if the metric is a string or a scorer. If sequence, the n-th value will apply to the n-th metric in the pipeline. skip_iter: int, optional (default=0) Skip last skip_iter iterations of the successive halving. n_calls: int or sequence, optional (default=0) Maximum number of iterations of the BO (including n_initial_points ). If 0, skip the BO and fit the model on its default parameters. If sequence, the n-th value will apply to the n-th model in the pipeline. n_initial_points: int or sequence, optional (default=5) Initial number of random tests of the BO before fitting the surrogate function. If equal to n_calls , the optimizer will technically be performing a random search. If sequence, the n-th value will apply to the n-th model in the pipeline. bo_params: dict, optional (default={}) Dictionary of extra keyword arguments for the BO. These can include: base_estimator: str, optional (default='GP') Base estimator to use in the BO. Choose from: 'GP' for Gaussian Process 'RF' for Random Forest 'ET' for Extra-Trees 'GBRT' for Gradient Boosted Regression Trees max_time: int, optional (default=np.inf) Stop the optimization after max_time seconds. delta_x: int or float, optional (default=0) Stop the optimization when |x1 - x2| < delta_x . delta_y: int or float, optional (default=0) Stop the optimization if the 5 minima are within delta_y . 
cv: int, optional (default=5) Number of folds for the cross-validation. If 1, the training set will be randomly split in a subtrain and validation set. early_stopping: int, float or None, optional (default=None) Training will stop if the model didn't improve in the last early_stopping rounds. If <1, fraction of rounds from the total. If None, no early stopping is performed. Only available for models that allow in-training evaluation. callback: callable or list of callables, optional (default=None) Callbacks for the BO. dimensions: dict, array or None, optional (default=None) Custom hyperparameter space for the bayesian optimization. Can be an array (only if there is 1 model in the pipeline) or a dictionary with the model's name as key. If None, ATOM's predefined dimensions are used. plot_bo: bool, optional (default=False) Whether to plot the BO's progress as it runs. Creates a canvas with two plots: the first plot shows the score of every trial and the second shows the distance between the last consecutive steps. Don't forget to call %matplotlib at the start of the cell if you are using an interactive notebook! Additional keyword arguments for skopt's optimizer. bagging: int, sequence or None, optional (default=None) Number of data sets (bootstrapped from the training set) to use in the bagging algorithm. If None or 0, no bagging is performed. If sequence, the n-th value will apply to the n-th model in the pipeline. n_jobs: int, optional (default=1) Number of cores to use for parallel processing. If >0: Number of cores to use. If -1: Use all available cores. If <-1: Use available_cores - 1 + n_jobs. Beware that using multiple processes on the same machine may cause memory issues for large datasets. verbose: int, optional (default=0) Verbosity level of the class. Possible values are: 0 to not print anything. 1 to print basic information. 2 to print detailed information. logger: bool, str, class or None, optional (default=None) If None: Doesn't save a logging file. 
If bool: True for logging file with default name. False for no logger. If str: Name of the logging file. 'auto' to create an automatic name. If class: python Logger object. random_state: int or None, optional (default=None) Seed used by the random number generator. If None, the random number generator is the RandomState instance used by numpy.random .","title":"SuccessiveHalvingClassifier"},{"location":"API/training/successivehalvingclassifier/#attributes","text":"","title":"Attributes"},{"location":"API/training/successivehalvingclassifier/#data-attributes","text":"The instance's dataset can be accessed through multiple attributes, e.g. calling trainer.train will return the training set. The data can also be changed through these attributes, e.g. trainer.test = trainer.test.drop(0) will drop the first row from the test set. Doing this will automatically update the other data attributes. Attributes: dataset: pd.DataFrame Complete dataset in the pipeline. train: pd.DataFrame Training set. test: pd.DataFrame Test set. X: pd.DataFrame Feature set. y: pd.Series Target column. X_train: pd.DataFrame Training features. y_train: pd.Series Training target. X_test: pd.DataFrame Test features. y_test: pd.Series Test target. shape: tuple Dataset's shape in the form (rows x columns). columns: list List of columns in the dataset. target: str Name of the target column. categories: list Sorted list of the unique categories in the target column. n_categories: int Number of unique categories in the target column.","title":"Data attributes"},{"location":"API/training/successivehalvingclassifier/#utility-attributes","text":"Attributes: models: list List of models in the pipeline. metric: str or list Metric(s) used to fit the models. errors: dict Dictionary of the encountered exceptions (if any). winner: model Model subclass that performed best on the test set. results: pd.DataFrame Dataframe of the training results with the model acronyms as index. 
Columns can include: name: Name of the model. metric_bo: Best score achieved during the BO. time_bo: Time spent on the BO. metric_train: Metric score on the training set. metric_test: Metric score on the test set. time_fit: Time spent fitting and evaluating. mean_bagging: Mean score of the bagging's results. std_bagging: Standard deviation score of the bagging's results. time_bagging: Time spent on the bagging algorithm. time: Total time spent on the whole run.","title":"Utility attributes"},{"location":"API/training/successivehalvingclassifier/#plot-attributes","text":"Attributes: style: str Plotting style. See seaborn's documentation . palette: str Color palette. See seaborn's documentation . title_fontsize: int Fontsize for the plot's title. label_fontsize: int Fontsize for labels and legends. tick_fontsize: int Fontsize for the ticks along the plot's axes.","title":"Plot attributes"},{"location":"API/training/successivehalvingclassifier/#methods","text":"calibrate Calibrate the winning model. clear Remove a model from the pipeline. get_params Get parameters for this estimator. log Save information to the logger and print to stdout. run Fit and evaluate the models. save Save the instance to a pickle file. scoring Print the scoring of the models for a specific metric. set_params Set the parameters of this estimator. method calibrate (**kwargs) [source] Applies probability calibration on the winning model. The calibration is done with the CalibratedClassifierCV class from sklearn. The model will be trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the estimator attribute. After calibrating, all prediction attributes of the winning model will reset. Parameters: **kwargs Additional keyword arguments for the CalibratedClassifierCV instance. Using cv='prefit' will use the trained model and fit the calibrator on the test set. 
Note that doing this will result in data leakage in the test set. Use this only if you have another, independent set for testing. method clear (models='all') [source] Removes all traces of a model in the pipeline (except for the errors attribute). If all models in the pipeline are removed, the metric is reset. Use this method to remove unwanted models from the pipeline or to clear memory before saving the instance. Parameters: models: str or sequence, optional (default='all') Model(s) to clear from the pipeline. If 'all', clear all models. method get_params (deep=True) [source] Get parameters for this estimator. Parameters: deep: bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators. Returns: params: dict Dictionary of the parameter names mapped to their values. method log (msg, level=0) [source] Write a message to the logger and print it to stdout. Parameters: msg: str Message to write to the logger and print to stdout. level: int, optional (default=0) Minimum verbosity level in order to print the message. method run (*arrays) [source] Fit and evaluate the models. Parameters: *arrays: array-like Either a train and test set or X_train, X_test, y_train, y_test. method save (filename=None, save_data=True) [source] Save the instance to a pickle file. Remember that the class contains the complete dataset as property, so the file can become large for big datasets! To avoid this, use save_data=False . Parameters: filename: str or None, optional (default=None) Name to save the file with. If None or 'auto', use default name (SuccessiveHalvingClassifier). save_data: bool, optional (default=True) Whether to save the data as an attribute of the instance. If False, remember to update the data immediately after loading the pickle using the dataset's @setter . method scoring (metric=None, dataset='test') [source] Print the scoring of the models for a specific metric. 
If a model shows a XXX , it means the metric failed for that specific model. This can happen if either the metric is unavailable for the task or if the model does not have a predict_proba method while the metric requires it. Parameters: metric: str or None, optional (default=None) Name of the metric to calculate. Choose from any of sklearn's SCORERS or one of the following custom metrics: 'cm' or 'confusion_matrix' for an array of the confusion matrix. 'tn' for true negatives. 'fp' for false positives. 'fn' for false negatives. 'lift' for the lift metric. 'fpr' for the false positive rate. 'tpr' for true positive rate. 'sup' for the support metric. If None, returns the final results for the model (ignores the dataset parameter). dataset: str, optional (default='test') Data set on which to calculate the metric. Options are 'train' or 'test'. method set_params (**params) [source] Set the parameters of this estimator. Parameters: **params: dict Estimator parameters. Returns: self: SuccessiveHalvingClassifier Estimator instance.","title":"Methods"},{"location":"API/training/successivehalvingclassifier/#example","text":"from atom.training import SuccessiveHalvingClassifier # Run the pipeline trainer = SuccessiveHalvingClassifier(['Tree', 'Bag', 'RF', 'ET'], n_calls=5, n_initial_points=3) trainer.run(train, test) # Analyze the results trainer.plot_successive_halving()","title":"Example"},{"location":"API/training/successivehalvingregressor/","text":"SuccessiveHalvingRegressor class atom.training. SuccessiveHalvingRegressor (models, metric=None, greater_is_better=True, needs_proba=False, needs_threshold=False, skip_iter=0, n_calls=10, n_random_points=5, bo_params={}, bagging=None, n_jobs=1, verbose=0, logger=None, random_state=None) [source] Fit and evaluate the models in a successive halving fashion. The pipeline applies the following steps per iteration: The optimal hyperparameters are selected using a bayesian optimization algorithm. 
The model is fitted on the complete training set using the best combinations of hyperparameters found. Using a bagging algorithm, various scores on the test set are calculated. Just like atom , you can predict , plot and call any model from the SuccessiveHalvingRegressor instance. Read more in the user guide . Parameters: models: str or sequence List of models to fit on the data. Use the predefined acronyms to select the models. Possible values are (case insensitive): 'GP' for Gaussian Process (no hyperparameter tuning) 'OLS' for Ordinary Least Squares (no hyperparameter tuning) 'Ridge' for Ridge Linear Regression 'Lasso' for Lasso Linear Regression 'EN' for ElasticNet Linear Regression 'BR' for Bayesian Regression (uses ridge regularization) 'KNN' for K-Nearest Neighbors 'Tree' for a single Decision Tree 'Bag' for Bagging (uses a decision tree as base estimator) 'ET' for Extra-Trees 'RF' for Random Forest 'AdaB' for AdaBoost (uses a decision tree as base estimator) 'GBM' for Gradient Boosting Machine 'XGB' for XGBoost (only available if package is installed) 'LGB' for LightGBM (only available if package is installed) 'CatB' for CatBoost (only available if package is installed) 'lSVM' for Linear Support Vector Machine (uses a one-vs-rest strategy for multiclass classification) 'kSVM' for Kernel Support Vector Machine (uses a one-vs-one strategy for multiclass classification) 'PA' for Passive Aggressive 'SGD' for Stochastic Gradient Descent 'MLP' for Multilayer Perceptron (can have between one and three hidden layers) metric: str, callable or sequence, optional (default=None) Metric(s) on which the pipeline fits the models. Choose from any of sklearn's predefined scorers , use a score (or loss) function with signature metric(y, y_pred, **kwargs) or use a scorer object. If multiple metrics are selected, only the first will be used to optimize the BO. 
If None, a default metric is selected: 'f1' for binary classification 'f1_weighted' for multiclass classification 'r2' for regression greater_is_better: bool or sequence, optional (default=True) Whether the metric is a score function or a loss function, i.e. if True, a higher score is better and if False, lower is better. Will be ignored if the metric is a string or a scorer. If sequence, the n-th value will apply to the n-th metric in the pipeline. needs_proba: bool or sequence, optional (default=False) Whether the metric function requires probability estimates out of a classifier. If True, make sure that every model in the pipeline has a predict_proba method. Will be ignored if the metric is a string or a scorer. If sequence, the n-th value will apply to the n-th metric in the pipeline. needs_threshold: bool or sequence, optional (default=False) Whether the metric function takes a continuous decision certainty. This only works for binary classification using estimators that have either a decision_function or predict_proba method. Will be ignored if the metric is a string or a scorer. If sequence, the n-th value will apply to the n-th metric in the pipeline. skip_iter: int, optional (default=0) Skip last skip_iter iterations of the successive halving. n_calls: int or sequence, optional (default=0) Maximum number of iterations of the BO (including n_initial_points ). If 0, skip the BO and fit the model on its default parameters. If sequence, the n-th value will apply to the n-th model in the pipeline. n_initial_points: int or sequence, optional (default=5) Initial number of random tests of the BO before fitting the surrogate function. If equal to n_calls , the optimizer will technically be performing a random search. If sequence, the n-th value will apply to the n-th model in the pipeline. bo_params: dict, optional (default={}) Dictionary of extra keyword arguments for the BO. 
These can include: base_estimator: str, optional (default='GP') Base estimator to use in the BO. Choose from: 'GP' for Gaussian Process 'RF' for Random Forest 'ET' for Extra-Trees 'GBRT' for Gradient Boosted Regression Trees max_time: int, optional (default=np.inf) Stop the optimization after max_time seconds. delta_x: int or float, optional (default=0) Stop the optimization when |x1 - x2| < delta_x . delta_y: int or float, optional (default=0) Stop the optimization if the 5 minima are within delta_y . cv: int, optional (default=5) Number of folds for the cross-validation. If 1, the training set will be randomly split in a subtrain and validation set. early_stopping: int, float or None, optional (default=None) Training will stop if the model didn't improve in the last early_stopping rounds. If <1, fraction of rounds from the total. If None, no early stopping is performed. Only available for models that allow in-training evaluation. callback: callable or list of callables, optional (default=None) Callbacks for the BO. dimensions: dict, array or None, optional (default=None) Custom hyperparameter space for the bayesian optimization. Can be an array (only if there is 1 model in the pipeline) or a dictionary with the model's name as key. If None, ATOM's predefined dimensions are used. plot_bo: bool, optional (default=False) Whether to plot the BO's progress as it runs. Creates a canvas with two plots: the first plot shows the score of every trial and the second shows the distance between the last consecutive steps. Don't forget to call %matplotlib at the start of the cell if you are using an interactive notebook! Additional keyword arguments for skopt's optimizer. bagging: int, sequence or None, optional (default=None) Number of data sets (bootstrapped from the training set) to use in the bagging algorithm. If None or 0, no bagging is performed. If sequence, the n-th value will apply to the n-th model in the pipeline. 
n_jobs: int, optional (default=1) Number of cores to use for parallel processing. If >0: Number of cores to use. If -1: Use all available cores. If <-1: Use available_cores - 1 + n_jobs. Beware that using multiple processes on the same machine may cause memory issues for large datasets. verbose: int, optional (default=0) Verbosity level of the class. Possible values are: 0 to not print anything. 1 to print basic information. 2 to print detailed information. logger: bool, str, class or None, optional (default=None) If None: Doesn't save a logging file. If bool: True for logging file with default name. False for no logger. If str: Name of the logging file. 'auto' to create an automatic name. If class: python Logger object. random_state: int or None, optional (default=None) Seed used by the random number generator. If None, the random number generator is the RandomState instance used by numpy.random . Attributes Data attributes The instance's dataset can be accessed through multiple attributes, e.g. calling trainer.train will return the training set. The data can also be changed through these attributes, e.g. trainer.test = trainer.test.drop(0) will drop the first row from the test set. Doing this will automatically update the other data attributes. Attributes: dataset: pd.DataFrame Complete dataset in the pipeline. train: pd.DataFrame Training set. test: pd.DataFrame Test set. X: pd.DataFrame Feature set. y: pd.Series Target column. X_train: pd.DataFrame Training features. y_train: pd.Series Training target. X_test: pd.DataFrame Test features. y_test: pd.Series Test target. shape: tuple Dataset's shape in the form (rows x columns). columns: list List of columns in the dataset. target: str Name of the target column. Utility attributes Attributes: models: list List of models in the pipeline. metric: str or list Metric(s) used to fit the models. errors: dict Dictionary of the encountered exceptions (if any). 
winner: model Model subclass that performed best on the test set. results: pd.DataFrame Dataframe of the training results with the model acronyms as index. Columns can include: name: Name of the model. metric_bo: Best score achieved during the BO. time_bo: Time spent on the BO. metric_train: Metric score on the training set. metric_test: Metric score on the test set. time_fit: Time spent fitting and evaluating. mean_bagging: Mean score of the bagging's results. std_bagging: Standard deviation score of the bagging's results. time_bagging: Time spent on the bagging algorithm. time: Total time spent on the whole run. Plot attributes Attributes: style: str Plotting style. See seaborn's documentation . palette: str Color palette. See seaborn's documentation . title_fontsize: int Fontsize for the plot's title. label_fontsize: int Fontsize for labels and legends. tick_fontsize: int Fontsize for the ticks along the plot's axes. Methods clear Remove a model from the pipeline. get_params Get parameters for this estimator. log Save information to the logger and print to stdout. run Fit and evaluate the models. save Save the instance to a pickle file. scoring Print the scoring of the models for a specific metric. set_params Set the parameters of this estimator. method clear (models='all') [source] Removes all traces of a model in the pipeline (except for the errors attribute). If all models in the pipeline are removed, the metric is reset. Use this method to remove unwanted models from the pipeline or to clear memory before saving the instance. Parameters: models: str or sequence, optional (default='all') Model(s) to clear from the pipeline. If 'all', clear all models. method get_params (deep=True) [source] Get parameters for this estimator. Parameters: deep: bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators. Returns: params: dict Dictionary of the parameter names mapped to their values. 
method log (msg, level=0) [source] Write a message to the logger and print it to stdout. Parameters: msg: str Message to write to the logger and print to stdout. level: int, optional (default=0) Minimum verbosity level in order to print the message. method run (*arrays) [source] Fit and evaluate the models. Parameters: *arrays: array-like Either a train and test set or X_train, X_test, y_train, y_test. method save (filename=None, save_data=True) [source] Save the instance to a pickle file. Remember that the class contains the complete dataset as property, so the file can become large for big datasets! To avoid this, use save_data=False . Parameters: filename: str or None, optional (default=None) Name to save the file with. If None or 'auto', use the name of the class. save_data: bool, optional (default=True) Whether to save the data as an attribute of the instance. If False, remember to update the data immediately after loading the pickle using the dataset's @setter . method scoring (metric=None, dataset='test') [source] Print the scoring of the models for a specific metric. If a model shows a XXX , it means the metric failed for that specific model. This can happen if either the metric is unavailable for the task or if the model does not have a predict_proba method while the metric requires it. Parameters: metric: str or None, optional (default=None) Name of the metric to calculate. Choose from any of sklearn's SCORERS . If None, returns the final results for the model (ignores the dataset parameter). dataset: str, optional (default='test') Data set on which to calculate the metric. Options are 'train' or 'test'. method set_params (**params) [source] Set the parameters of this estimator. Parameters: **params: dict Estimator parameters. Returns: self: SuccessiveHalvingRegressor Estimator instance. 
Example from atom.training import SuccessiveHalvingRegressor # Run the pipeline trainer = SuccessiveHalvingRegressor(['Tree', 'Bag', 'RF', 'ET'], n_calls=5, n_initial_points=3) trainer.run(train, test) # Analyze the results trainer.plot_successive_halving()","title":"SuccessiveHalvingRegressor"},{"location":"API/training/successivehalvingregressor/#successivehalvingregressor","text":"class atom.training. SuccessiveHalvingRegressor (models, metric=None, greater_is_better=True, needs_proba=False, needs_threshold=False, skip_iter=0, n_calls=10, n_initial_points=5, bo_params={}, bagging=None, n_jobs=1, verbose=0, logger=None, random_state=None) [source] Fit and evaluate the models in a successive halving fashion. The pipeline applies the following steps per iteration: The optimal hyperparameters are selected using a bayesian optimization algorithm. The model is fitted on the complete training set using the best combinations of hyperparameters found. Using a bagging algorithm, various scores on the test set are calculated. Just like atom , you can predict , plot and call any model from the SuccessiveHalvingRegressor instance. Read more in the user guide . Parameters: models: str or sequence List of models to fit on the data. Use the predefined acronyms to select the models. 
Possible values are (case insensitive): 'GP' for Gaussian Process (no hyperparameter tuning) 'OLS' for Ordinary Least Squares (no hyperparameter tuning) 'Ridge' for Ridge Linear Regression 'Lasso' for Lasso Linear Regression 'EN' for ElasticNet Linear Regression 'BR' for Bayesian Regression (uses ridge regularization) 'KNN' for K-Nearest Neighbors 'Tree' for a single Decision Tree 'Bag' for Bagging (uses a decision tree as base estimator) 'ET' for Extra-Trees 'RF' for Random Forest 'AdaB' for AdaBoost (uses a decision tree as base estimator) 'GBM' for Gradient Boosting Machine 'XGB' for XGBoost (only available if package is installed) 'LGB' for LightGBM (only available if package is installed) 'CatB' for CatBoost (only available if package is installed) 'lSVM' for Linear Support Vector Machine (uses a one-vs-rest strategy for multiclass classification) 'kSVM' for Kernel Support Vector Machine (uses a one-vs-one strategy for multiclass classification) 'PA' for Passive Aggressive 'SGD' for Stochastic Gradient Descent 'MLP' for Multilayer Perceptron (can have between one and three hidden layers) metric: str, callable or sequence, optional (default=None) Metric(s) on which the pipeline fits the models. Choose from any of sklearn's predefined scorers , use a score (or loss) function with signature metric(y, y_pred, **kwargs) or use a scorer object. If multiple metrics are selected, only the first will be used to optimize the BO. If None, a default metric is selected: 'f1' for binary classification 'f1_weighted' for multiclass classification 'r2' for regression greater_is_better: bool or sequence, optional (default=True) Whether the metric is a score function or a loss function, i.e. if True, a higher score is better and if False, lower is better. Will be ignored if the metric is a string or a scorer. If sequence, the n-th value will apply to the n-th metric in the pipeline. 
needs_proba: bool or sequence, optional (default=False) Whether the metric function requires probability estimates out of a classifier. If True, make sure that every model in the pipeline has a predict_proba method. Will be ignored if the metric is a string or a scorer. If sequence, the n-th value will apply to the n-th metric in the pipeline. needs_threshold: bool or sequence, optional (default=False) Whether the metric function takes a continuous decision certainty. This only works for binary classification using estimators that have either a decision_function or predict_proba method. Will be ignored if the metric is a string or a scorer. If sequence, the n-th value will apply to the n-th metric in the pipeline. skip_iter: int, optional (default=0) Skip last skip_iter iterations of the successive halving. n_calls: int or sequence, optional (default=0) Maximum number of iterations of the BO (including n_initial_points ). If 0, skip the BO and fit the model on its default parameters. If sequence, the n-th value will apply to the n-th model in the pipeline. n_initial_points: int or sequence, optional (default=5) Initial number of random tests of the BO before fitting the surrogate function. If equal to n_calls , the optimizer will technically be performing a random search. If sequence, the n-th value will apply to the n-th model in the pipeline. bo_params: dict, optional (default={}) Dictionary of extra keyword arguments for the BO. These can include: base_estimator: str, optional (default='GP') Base estimator to use in the BO. Choose from: 'GP' for Gaussian Process 'RF' for Random Forest 'ET' for Extra-Trees 'GBRT' for Gradient Boosted Regression Trees max_time: int, optional (default=np.inf) Stop the optimization after max_time seconds. delta_x: int or float, optional (default=0) Stop the optimization when |x1 - x2| < delta_x . delta_y: int or float, optional (default=0) Stop the optimization if the 5 minima are within delta_y . 
cv: int, optional (default=5) Number of folds for the cross-validation. If 1, the training set will be randomly split in a subtrain and validation set. early_stopping: int, float or None, optional (default=None) Training will stop if the model didn't improve in the last early_stopping rounds. If <1, fraction of rounds from the total. If None, no early stopping is performed. Only available for models that allow in-training evaluation. callback: callable or list of callables, optional (default=None) Callbacks for the BO. dimensions: dict, array or None, optional (default=None) Custom hyperparameter space for the bayesian optimization. Can be an array (only if there is 1 model in the pipeline) or a dictionary with the model's name as key. If None, ATOM's predefined dimensions are used. plot_bo: bool, optional (default=False) Whether to plot the BO's progress as it runs. Creates a canvas with two plots: the first plot shows the score of every trial and the second shows the distance between the last consecutive steps. Don't forget to call %matplotlib at the start of the cell if you are using an interactive notebook! Additional keyword arguments for skopt's optimizer. bagging: int, sequence or None, optional (default=None) Number of data sets (bootstrapped from the training set) to use in the bagging algorithm. If None or 0, no bagging is performed. If sequence, the n-th value will apply to the n-th model in the pipeline. n_jobs: int, optional (default=1) Number of cores to use for parallel processing. If >0: Number of cores to use. If -1: Use all available cores. If <-1: Use available_cores - 1 + n_jobs. Beware that using multiple processes on the same machine may cause memory issues for large datasets. verbose: int, optional (default=0) Verbosity level of the class. Possible values are: 0 to not print anything. 1 to print basic information. 2 to print detailed information. logger: bool, str, class or None, optional (default=None) If None: Doesn't save a logging file. 
If bool: True for logging file with default name. False for no logger. If str: Name of the logging file. 'auto' to create an automatic name. If class: python Logger object. random_state: int or None, optional (default=None) Seed used by the random number generator. If None, the random number generator is the RandomState instance used by numpy.random .","title":"SuccessiveHalvingRegressor"},{"location":"API/training/successivehalvingregressor/#attributes","text":"","title":"Attributes"},{"location":"API/training/successivehalvingregressor/#data-attributes","text":"The instance's dataset can be accessed through multiple attributes, e.g. calling trainer.train will return the training set. The data can also be changed through these attributes, e.g. trainer.test = trainer.test.drop(0) will drop the first row from the test set. Doing this will automatically update the other data attributes. Attributes: dataset: pd.DataFrame Complete dataset in the pipeline. train: pd.DataFrame Training set. test: pd.DataFrame Test set. X: pd.DataFrame Feature set. y: pd.Series Target column. X_train: pd.DataFrame Training features. y_train: pd.Series Training target. X_test: pd.DataFrame Test features. y_test: pd.Series Test target. shape: tuple Dataset's shape in the form (rows x columns). columns: list List of columns in the dataset. target: str Name of the target column.","title":"Data attributes"},{"location":"API/training/successivehalvingregressor/#utility-attributes","text":"Attributes: models: list List of models in the pipeline. metric: str or list Metric(s) used to fit the models. errors: dict Dictionary of the encountered exceptions (if any). winner: model Model subclass that performed best on the test set. results: pd.DataFrame Dataframe of the training results with the model acronyms as index. Columns can include: name: Name of the model. metric_bo: Best score achieved during the BO. time_bo: Time spent on the BO. metric_train: Metric score on the training set. 
metric_test: Metric score on the test set. time_fit: Time spent fitting and evaluating. mean_bagging: Mean score of the bagging's results. std_bagging: Standard deviation score of the bagging's results. time_bagging: Time spent on the bagging algorithm. time: Total time spent on the whole run.","title":"Utility attributes"},{"location":"API/training/successivehalvingregressor/#plot-attributes","text":"Attributes: style: str Plotting style. See seaborn's documentation . palette: str Color palette. See seaborn's documentation . title_fontsize: int Fontsize for the plot's title. label_fontsize: int Fontsize for labels and legends. tick_fontsize: int Fontsize for the ticks along the plot's axes.","title":"Plot attributes"},{"location":"API/training/successivehalvingregressor/#methods","text":"clear Remove a model from the pipeline. get_params Get parameters for this estimator. log Save information to the logger and print to stdout. run Fit and evaluate the models. save Save the instance to a pickle file. scoring Print the scoring of the models for a specific metric. set_params Set the parameters of this estimator. method clear (models='all') [source] Removes all traces of a model in the pipeline (except for the errors attribute). If all models in the pipeline are removed, the metric is reset. Use this method to remove unwanted models from the pipeline or to clear memory before saving the instance. Parameters: models: str or sequence, optional (default='all') Model(s) to clear from the pipeline. If 'all', clear all models. method get_params (deep=True) [source] Get parameters for this estimator. Parameters: deep: bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators. Returns: params: dict Dictionary of the parameter names mapped to their values. method log (msg, level=0) [source] Write a message to the logger and print it to stdout. Parameters: msg: str Message to write to the logger and print to stdout. 
level: int, optional (default=0) Minimum verbosity level in order to print the message. method run (*arrays) [source] Fit and evaluate the models. Parameters: *arrays: array-like Either a train and test set or X_train, X_test, y_train, y_test. method save (filename=None, save_data=True) [source] Save the instance to a pickle file. Remember that the class contains the complete dataset as property, so the file can become large for big datasets! To avoid this, use save_data=False . Parameters: filename: str or None, optional (default=None) Name to save the file with. If None or 'auto', use the name of the class. save_data: bool, optional (default=True) Whether to save the data as an attribute of the instance. If False, remember to update the data immediately after loading the pickle using the dataset's @setter . method scoring (metric=None, dataset='test') [source] Print the scoring of the models for a specific metric. If a model shows a XXX , it means the metric failed for that specific model. This can happen if either the metric is unavailable for the task or if the model does not have a predict_proba method while the metric requires it. Parameters: metric: str or None, optional (default=None) Name of the metric to calculate. Choose from any of sklearn's SCORERS . If None, returns the final results for the model (ignores the dataset parameter). dataset: str, optional (default='test') Data set on which to calculate the metric. Options are 'train' or 'test'. method set_params (**params) [source] Set the parameters of this estimator. Parameters: **params: dict Estimator parameters. 
Returns: self: SuccessiveHalvingRegressor Estimator instance.","title":"Methods"},{"location":"API/training/successivehalvingregressor/#example","text":"from atom.training import SuccessiveHalvingRegressor # Run the pipeline trainer = SuccessiveHalvingRegressor(['Tree', 'Bag', 'RF', 'ET'], n_calls=5, n_initial_points=3) trainer.run(train, test) # Analyze the results trainer.plot_successive_halving()","title":"Example"},{"location":"API/training/trainerclassifier/","text":"TrainerClassifier class atom.training. TrainerClassifier (models, metric=None, greater_is_better=True, needs_proba=False, needs_threshold=False, n_calls=10, n_initial_points=5, bo_params={}, bagging=None, n_jobs=1, verbose=0, logger=None, random_state=None) [source] Fits and evaluates the models on the data in the pipeline. The following steps are applied: The optimal hyperparameters are selected using a bayesian optimization algorithm. The model is fitted on the training set using the best combinations of hyperparameters found. Using a bagging algorithm, various scores on the test set are calculated. Just like atom , you can predict , plot and call any model from the TrainerClassifier instance. Read more in the user guide . Parameters: models: str or sequence Models to fit on the data. Use the predefined acronyms to select the models.
Possible values are (case insensitive): 'GP' for Gaussian Process (no hyperparameter tuning) 'GNB' for Gaussian Naive Bayes (no hyperparameter tuning) 'MNB' for Multinomial Naive Bayes 'BNB' for Bernoulli Naive Bayes 'Ridge' for Ridge Linear Classification 'LR' for Logistic Regression 'LDA' for Linear Discriminant Analysis 'QDA' for Quadratic Discriminant Analysis 'KNN' for K-Nearest Neighbors 'Tree' for a single Decision Tree 'Bag' for Bagging (uses a decision tree as base estimator) 'ET' for Extra-Trees 'RF' for Random Forest 'AdaB' for AdaBoost (uses a decision tree as base estimator) 'GBM' for Gradient Boosting Machine 'XGB' for XGBoost (only available if package is installed) 'LGB' for LightGBM (only available if package is installed) 'CatB' for CatBoost (only available if package is installed) 'lSVM' for Linear Support Vector Machine (uses a one-vs-rest strategy for multiclass classification) 'kSVM' for Kernel Support Vector Machine (uses a one-vs-one strategy for multiclass classification) 'PA' for Passive Aggressive 'SGD' for Stochastic Gradient Descent 'MLP' for Multilayer Perceptron (can have between one and three hidden layers) metric: str, callable or sequence, optional (default=None) Metric(s) on which the pipeline fits the models. Choose from any of sklearn's predefined scorers , use a score (or loss) function with signature metric(y, y_pred, **kwargs) or use a scorer object. If multiple metrics are selected, only the first will be used to optimize the BO. If None, a default metric is selected: 'f1' for binary classification 'f1_weighted' for multiclass classification 'r2' for regression greater_is_better: bool or sequence, optional (default=True) Whether the metric is a score function or a loss function, i.e. if True, a higher score is better and if False, lower is better. Will be ignored if the metric is a string or a scorer. If sequence, the n-th value will apply to the n-th metric in the pipeline. 
needs_proba: bool or sequence, optional (default=False) Whether the metric function requires probability estimates out of a classifier. If True, make sure that every model in the pipeline has a predict_proba method. Will be ignored if the metric is a string or a scorer. If sequence, the n-th value will apply to the n-th metric in the pipeline. needs_threshold: bool or sequence, optional (default=False) Whether the metric function takes a continuous decision certainty. This only works for binary classification using estimators that have either a decision_function or predict_proba method. Will be ignored if the metric is a string or a scorer. If sequence, the n-th value will apply to the n-th metric in the pipeline. n_calls: int or sequence, optional (default=0) Maximum number of iterations of the BO (including n_initial_points ). If 0, skip the BO and fit the model on its default parameters. If sequence, the n-th value will apply to the n-th model in the pipeline. n_initial_points: int or sequence, optional (default=5) Initial number of random tests of the BO before fitting the surrogate function. If equal to n_calls , the optimizer will technically be performing a random search. If sequence, the n-th value will apply to the n-th model in the pipeline. bo_params: dict, optional (default={}) Dictionary of extra keyword arguments for the BO. These can include: base_estimator: str, optional (default='GP') Base estimator to use in the BO. Choose from: 'GP' for Gaussian Process 'RF' for Random Forest 'ET' for Extra-Trees 'GBRT' for Gradient Boosted Regression Trees max_time: int, optional (default=np.inf) Stop the optimization after max_time seconds. delta_x: int or float, optional (default=0) Stop the optimization when |x1 - x2| < delta_x . delta_y: int or float, optional (default=0) Stop the optimization if the 5 minima are within delta_y . cv: int, optional (default=5) Number of folds for the cross-validation. 
If 1, the training set will be randomly split in a subtrain and validation set. early_stopping: int, float or None, optional (default=None) Training will stop if the model didn't improve in the last early_stopping rounds. If <1, fraction of rounds from the total. If None, no early stopping is performed. Only available for models that allow in-training evaluation. callback: callable or list of callables, optional (default=None) Callbacks for the BO. dimensions: dict, array or None, optional (default=None) Custom hyperparameter space for the bayesian optimization. Can be an array (only if there is 1 model in the pipeline) or a dictionary with the model's name as key. If None, ATOM's predefined dimensions are used. plot_bo: bool, optional (default=False) Whether to plot the BO's progress as it runs. Creates a canvas with two plots: the first plot shows the score of every trial and the second shows the distance between the last consecutive steps. Don't forget to call %matplotlib at the start of the cell if you are using an interactive notebook! **kwargs Additional keyword arguments for skopt's optimizer. bagging: int, sequence or None, optional (default=None) Number of data sets (bootstrapped from the training set) to use in the bagging algorithm. If None or 0, no bagging is performed. If sequence, the n-th value will apply to the n-th model in the pipeline. n_jobs: int, optional (default=1) Number of cores to use for parallel processing. If >0: Number of cores to use. If -1: Use all available cores. If <-1: Use available_cores - 1 + n_jobs. Beware that using multiple processes on the same machine may cause memory issues for large datasets. verbose: int, optional (default=0) Verbosity level of the class. Possible values are: 0 to not print anything. 1 to print basic information. 2 to print detailed information. logger: bool, str, class or None, optional (default=None) If None: Doesn't save a logging file. If bool: True for logging file with default name. False for no logger.
If str: Name of the logging file. 'auto' to create an automatic name. If class: python Logger object. random_state: int or None, optional (default=None) Seed used by the random number generator. If None, the random number generator is the RandomState instance used by numpy.random . Attributes Data attributes The instance's dataset can be accessed through multiple attributes, e.g. calling trainer.train will return the training set. The data can also be changed through these attributes, e.g. trainer.test = trainer.test.drop(0) will drop the first row from the test set. Doing this will automatically update the other data attributes. Attributes: dataset: pd.DataFrame Complete dataset in the pipeline. train: pd.DataFrame Training set. test: pd.DataFrame Test set. X: pd.DataFrame Feature set. y: pd.Series Target column. X_train: pd.DataFrame Training features. y_train: pd.Series Training target. X_test: pd.DataFrame Test features. y_test: pd.Series Test target. shape: tuple Dataset's shape in the form (rows x columns). columns: list List of columns in the dataset. target: str Name of the target column. categories: list Sorted list of the unique categories in the target column. n_categories: int Number of unique categories in the target column. Utility attributes Attributes: models: list List of models in the pipeline. metric: str or list Metric(s) used to fit the models. errors: dict Dictionary of the encountered exceptions (if any). winner: model Model subclass that performed best on the test set. results: pd.DataFrame Dataframe of the training results with the model acronyms as index. Columns can include: name: Name of the model. metric_bo: Best score achieved during the BO. time_bo: Time spent on the BO. metric_train: Metric score on the training set. metric_test: Metric score on the test set. time_fit: Time spent fitting and evaluating. mean_bagging: Mean score of the bagging's results. std_bagging: Standard deviation score of the bagging's results. 
time_bagging: Time spent on the bagging algorithm. time: Total time spent on the whole run. Plot attributes Attributes: style: str Plotting style. See seaborn's documentation . palette: str Color palette. See seaborn's documentation . title_fontsize: int Fontsize for the plot's title. label_fontsize: int Fontsize for labels and legends. tick_fontsize: int Fontsize for the ticks along the plot's axes. Methods calibrate Calibrate the winning model. clear Remove a model from the pipeline. get_params Get parameters for this estimator. log Save information to the logger and print to stdout. run Fit and evaluate the models. save Save the instance to a pickle file. scoring Print the scoring of the models for a specific metric. set_params Set the parameters of this estimator. method calibrate (**kwargs) [source] Applies probability calibration on the winning model. The calibration is done with the CalibratedClassifierCV class from sklearn. The model will be trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the estimator attribute. After calibrating, all prediction attributes of the winning model will reset. Parameters: **kwargs Additional keyword arguments for the CalibratedClassifierCV instance. Using cv='prefit' will use the trained model and fit the calibrator on the test set. Note that doing this will result in data leakage in the test set. Use this only if you have another, independent set for testing. method clear (models='all') [source] Removes all traces of a model in the pipeline (except for the errors attribute). If all models in the pipeline are removed, the metric is reset. Use this method to remove unwanted models from the pipeline or to clear memory before saving the instance. Parameters: models: str or sequence, optional (default='all') Model(s) to clear from the pipeline. If 'all', clear all models. method get_params (deep=True) [source] Get parameters for this estimator. 
Parameters: deep: bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators. Returns: params: dict Dictionary of the parameter names mapped to their values. method log (msg, level=0) [source] Write a message to the logger and print it to stdout. Parameters: msg: str Message to write to the logger and print to stdout. level: int, optional (default=0) Minimum verbosity level in order to print the message. method run (*arrays) [source] Fit and evaluate the models. Parameters: *arrays: array-like Either a train and test set or X_train, X_test, y_train, y_test. method save (filename=None, save_data=True) [source] Save the instance to a pickle file. Remember that the class contains the complete dataset as property, so the file can become large for big datasets! To avoid this, use save_data=False . Parameters: filename: str or None, optional (default=None) Name to save the file with. If None or 'auto', use the name of the class. save_data: bool, optional (default=True) Whether to save the data as an attribute of the instance. If False, remember to update the data immediately after loading the pickle using the dataset's @setter . method scoring (metric=None, dataset='test') [source] Print the scoring of the models for a specific metric. If a model shows a XXX , it means the metric failed for that specific model. This can happen if either the metric is unavailable for the task or if the model does not have a predict_proba method while the metric requires it. Parameters: metric: str or None, optional (default=None) Name of the metric to calculate. Choose from any of sklearn's SCORERS or one of the following custom metrics: 'cm' or 'confusion_matrix' for an array of the confusion matrix. 'tn' for true negatives. 'fp' for false positives. 'fn' for false negatives. 'lift' for the lift metric. 'fpr' for the false positive rate. 'tpr' for true positive rate. 'sup' for the support metric. 
If None, returns the final results for the model (ignores the dataset parameter). dataset: str, optional (default='test') Data set on which to calculate the metric. Options are 'train' or 'test'. method set_params (**params) [source] Set the parameters of this estimator. Parameters: **params: dict Estimator parameters. Returns: self: TrainerClassifier Estimator instance. Example from atom.training import TrainerClassifier # Run the pipeline trainer = TrainerClassifier(['Tree', 'RF'], n_calls=5, n_initial_points=3) trainer.run(train, test) # Analyze the results trainer.scoring('auc') trainer.Tree.plot_bo()","title":"TrainerClassifier"},{"location":"API/training/trainerclassifier/#trainerclassifier","text":"class atom.training. TrainerClassifier (models, metric=None, greater_is_better=True, needs_proba=False, needs_threshold=False, n_calls=10, n_initial_points=5, bo_params={}, bagging=None, n_jobs=1, verbose=0, logger=None, random_state=None) [source] Fits and evaluates the models on the data in the pipeline. The following steps are applied: The optimal hyperparameters are selected using a bayesian optimization algorithm. The model is fitted on the training set using the best combinations of hyperparameters found. Using a bagging algorithm, various scores on the test set are calculated. Just like atom , you can predict , plot and call any model from the TrainerClassifier instance. Read more in the user guide . Parameters: models: str or sequence Models to fit on the data. Use the predefined acronyms to select the models.
Possible values are (case insensitive): 'GP' for Gaussian Process (no hyperparameter tuning) 'GNB' for Gaussian Naive Bayes (no hyperparameter tuning) 'MNB' for Multinomial Naive Bayes 'BNB' for Bernoulli Naive Bayes 'Ridge' for Ridge Linear Classification 'LR' for Logistic Regression 'LDA' for Linear Discriminant Analysis 'QDA' for Quadratic Discriminant Analysis 'KNN' for K-Nearest Neighbors 'Tree' for a single Decision Tree 'Bag' for Bagging (uses a decision tree as base estimator) 'ET' for Extra-Trees 'RF' for Random Forest 'AdaB' for AdaBoost (uses a decision tree as base estimator) 'GBM' for Gradient Boosting Machine 'XGB' for XGBoost (only available if package is installed) 'LGB' for LightGBM (only available if package is installed) 'CatB' for CatBoost (only available if package is installed) 'lSVM' for Linear Support Vector Machine (uses a one-vs-rest strategy for multiclass classification) 'kSVM' for Kernel Support Vector Machine (uses a one-vs-one strategy for multiclass classification) 'PA' for Passive Aggressive 'SGD' for Stochastic Gradient Descent 'MLP' for Multilayer Perceptron (can have between one and three hidden layers) metric: str, callable or sequence, optional (default=None) Metric(s) on which the pipeline fits the models. Choose from any of sklearn's predefined scorers , use a score (or loss) function with signature metric(y, y_pred, **kwargs) or use a scorer object. If multiple metrics are selected, only the first will be used to optimize the BO. If None, a default metric is selected: 'f1' for binary classification 'f1_weighted' for multiclass classification 'r2' for regression greater_is_better: bool or sequence, optional (default=True) Whether the metric is a score function or a loss function, i.e. if True, a higher score is better and if False, lower is better. Will be ignored if the metric is a string or a scorer. If sequence, the n-th value will apply to the n-th metric in the pipeline. 
needs_proba: bool or sequence, optional (default=False) Whether the metric function requires probability estimates out of a classifier. If True, make sure that every model in the pipeline has a predict_proba method. Will be ignored if the metric is a string or a scorer. If sequence, the n-th value will apply to the n-th metric in the pipeline. needs_threshold: bool or sequence, optional (default=False) Whether the metric function takes a continuous decision certainty. This only works for binary classification using estimators that have either a decision_function or predict_proba method. Will be ignored if the metric is a string or a scorer. If sequence, the n-th value will apply to the n-th metric in the pipeline. n_calls: int or sequence, optional (default=0) Maximum number of iterations of the BO (including n_initial_points ). If 0, skip the BO and fit the model on its default parameters. If sequence, the n-th value will apply to the n-th model in the pipeline. n_initial_points: int or sequence, optional (default=5) Initial number of random tests of the BO before fitting the surrogate function. If equal to n_calls , the optimizer will technically be performing a random search. If sequence, the n-th value will apply to the n-th model in the pipeline. bo_params: dict, optional (default={}) Dictionary of extra keyword arguments for the BO. These can include: base_estimator: str, optional (default='GP') Base estimator to use in the BO. Choose from: 'GP' for Gaussian Process 'RF' for Random Forest 'ET' for Extra-Trees 'GBRT' for Gradient Boosted Regression Trees max_time: int, optional (default=np.inf) Stop the optimization after max_time seconds. delta_x: int or float, optional (default=0) Stop the optimization when |x1 - x2| < delta_x . delta_y: int or float, optional (default=0) Stop the optimization if the 5 minima are within delta_y . cv: int, optional (default=5) Number of folds for the cross-validation. 
If 1, the training set will be randomly split in a subtrain and validation set. early_stopping: int, float or None, optional (default=None) Training will stop if the model didn't improve in the last early_stopping rounds. If <1, fraction of rounds from the total. If None, no early stopping is performed. Only available for models that allow in-training evaluation. callback: callable or list of callables, optional (default=None) Callbacks for the BO. dimensions: dict, array or None, optional (default=None) Custom hyperparameter space for the bayesian optimization. Can be an array (only if there is 1 model in the pipeline) or a dictionary with the model's name as key. If None, ATOM's predefined dimensions are used. plot_bo: bool, optional (default=False) Whether to plot the BO's progress as it runs. Creates a canvas with two plots: the first plot shows the score of every trial and the second shows the distance between the last consecutive steps. Don't forget to call %matplotlib at the start of the cell if you are using an interactive notebook! **kwargs Additional keyword arguments for skopt's optimizer. bagging: int, sequence or None, optional (default=None) Number of data sets (bootstrapped from the training set) to use in the bagging algorithm. If None or 0, no bagging is performed. If sequence, the n-th value will apply to the n-th model in the pipeline. n_jobs: int, optional (default=1) Number of cores to use for parallel processing. If >0: Number of cores to use. If -1: Use all available cores. If <-1: Use available_cores - 1 + n_jobs. Beware that using multiple processes on the same machine may cause memory issues for large datasets. verbose: int, optional (default=0) Verbosity level of the class. Possible values are: 0 to not print anything. 1 to print basic information. 2 to print detailed information. logger: bool, str, class or None, optional (default=None) If None: Doesn't save a logging file. If bool: True for logging file with default name. False for no logger.
If str: Name of the logging file. 'auto' to create an automatic name. If class: python Logger object. random_state: int or None, optional (default=None) Seed used by the random number generator. If None, the random number generator is the RandomState instance used by numpy.random .","title":"TrainerClassifier"},{"location":"API/training/trainerclassifier/#attributes","text":"","title":"Attributes"},{"location":"API/training/trainerclassifier/#data-attributes","text":"The instance's dataset can be accessed through multiple attributes, e.g. calling trainer.train will return the training set. The data can also be changed through these attributes, e.g. trainer.test = trainer.test.drop(0) will drop the first row from the test set. Doing this will automatically update the other data attributes. Attributes: dataset: pd.DataFrame Complete dataset in the pipeline. train: pd.DataFrame Training set. test: pd.DataFrame Test set. X: pd.DataFrame Feature set. y: pd.Series Target column. X_train: pd.DataFrame Training features. y_train: pd.Series Training target. X_test: pd.DataFrame Test features. y_test: pd.Series Test target. shape: tuple Dataset's shape in the form (rows x columns). columns: list List of columns in the dataset. target: str Name of the target column. categories: list Sorted list of the unique categories in the target column. n_categories: int Number of unique categories in the target column.","title":"Data attributes"},{"location":"API/training/trainerclassifier/#utility-attributes","text":"Attributes: models: list List of models in the pipeline. metric: str or list Metric(s) used to fit the models. errors: dict Dictionary of the encountered exceptions (if any). winner: model Model subclass that performed best on the test set. results: pd.DataFrame Dataframe of the training results with the model acronyms as index. Columns can include: name: Name of the model. metric_bo: Best score achieved during the BO. time_bo: Time spent on the BO. 
metric_train: Metric score on the training set. metric_test: Metric score on the test set. time_fit: Time spent fitting and evaluating. mean_bagging: Mean score of the bagging's results. std_bagging: Standard deviation score of the bagging's results. time_bagging: Time spent on the bagging algorithm. time: Total time spent on the whole run.","title":"Utility attributes"},{"location":"API/training/trainerclassifier/#plot-attributes","text":"Attributes: style: str Plotting style. See seaborn's documentation . palette: str Color palette. See seaborn's documentation . title_fontsize: int Fontsize for the plot's title. label_fontsize: int Fontsize for labels and legends. tick_fontsize: int Fontsize for the ticks along the plot's axes.","title":"Plot attributes"},{"location":"API/training/trainerclassifier/#methods","text":"calibrate Calibrate the winning model. clear Remove a model from the pipeline. get_params Get parameters for this estimator. log Save information to the logger and print to stdout. run Fit and evaluate the models. save Save the instance to a pickle file. scoring Print the scoring of the models for a specific metric. set_params Set the parameters of this estimator. method calibrate (**kwargs) [source] Applies probability calibration on the winning model. The calibration is done with the CalibratedClassifierCV class from sklearn. The model will be trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the estimator attribute. After calibrating, all prediction attributes of the winning model will reset. Parameters: **kwargs Additional keyword arguments for the CalibratedClassifierCV instance. Using cv='prefit' will use the trained model and fit the calibrator on the test set. Note that doing this will result in data leakage in the test set. Use this only if you have another, independent set for testing. 
method clear (models='all') [source] Removes all traces of a model in the pipeline (except for the errors attribute). If all models in the pipeline are removed, the metric is reset. Use this method to remove unwanted models from the pipeline or to clear memory before saving the instance. Parameters: models: str or sequence, optional (default='all') Model(s) to clear from the pipeline. If 'all', clear all models. method get_params (deep=True) [source] Get parameters for this estimator. Parameters: deep: bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators. Returns: params: dict Dictionary of the parameter names mapped to their values. method log (msg, level=0) [source] Write a message to the logger and print it to stdout. Parameters: msg: str Message to write to the logger and print to stdout. level: int, optional (default=0) Minimum verbosity level in order to print the message. method run (*arrays) [source] Fit and evaluate the models. Parameters: *arrays: array-like Either a train and test set or X_train, X_test, y_train, y_test. method save (filename=None, save_data=True) [source] Save the instance to a pickle file. Remember that the class contains the complete dataset as property, so the file can become large for big datasets! To avoid this, use save_data=False . Parameters: filename: str or None, optional (default=None) Name to save the file with. If None or 'auto', use the name of the class. save_data: bool, optional (default=True) Whether to save the data as an attribute of the instance. If False, remember to update the data immediately after loading the pickle using the dataset's @setter . method scoring (metric=None, dataset='test') [source] Print the scoring of the models for a specific metric. If a model shows a XXX , it means the metric failed for that specific model. 
This can happen if either the metric is unavailable for the task or if the model does not have a predict_proba method while the metric requires it. Parameters: metric: str or None, optional (default=None) Name of the metric to calculate. Choose from any of sklearn's SCORERS or one of the following custom metrics: 'cm' or 'confusion_matrix' for an array of the confusion matrix. 'tn' for true negatives. 'fp' for false positives. 'fn' for false negatives. 'lift' for the lift metric. 'fpr' for the false positive rate. 'tpr' for the true positive rate. 'sup' for the support metric. If None, returns the final results for the model (ignores the dataset parameter). dataset: str, optional (default='test') Data set on which to calculate the metric. Options are 'train' or 'test'. method set_params (**params) [source] Set the parameters of this estimator. Parameters: **params: dict Estimator parameters. Returns: self: TrainerClassifier Estimator instance.","title":"Methods"},{"location":"API/training/trainerclassifier/#example","text":"from atom.training import TrainerClassifier # Run the pipeline trainer = TrainerClassifier(['Tree', 'RF'], n_calls=5, n_initial_points=3) trainer.run(train, test) # Analyze the results trainer.scoring('auc') trainer.Tree.plot_bo()","title":"Example"},{"location":"API/training/trainerregressor/","text":"TrainerRegressor class atom.training. TrainerRegressor (models, metric=None, greater_is_better=True, needs_proba=False, needs_threshold=False, n_calls=10, n_initial_points=5, bo_params={}, bagging=None, n_jobs=1, verbose=0, logger=None, random_state=None) [source] Fits and evaluates the models on the data in the pipeline. The following steps are applied: The optimal hyperparameters are selected using a bayesian optimization algorithm. The model is fitted on the training set using the best combinations of hyperparameters found. Using a bagging algorithm, various scores on the test set are calculated.
Just like atom , you can predict , plot and call any model from the TrainerRegressor instance. Read more in the user guide . Parameters: models: str or sequence List of models to fit on the data. Use the predefined acronyms to select the models. Possible values are (case insensitive): 'GP' for Gaussian Process (no hyperparameter tuning) 'OLS' for Ordinary Least Squares (no hyperparameter tuning) 'Ridge' for Ridge Linear Regression 'Lasso' for Lasso Linear Regression 'EN' for ElasticNet Linear Regression 'BR' for Bayesian Regression (uses ridge regularization) 'KNN' for K-Nearest Neighbors 'Tree' for a single Decision Tree 'Bag' for Bagging (uses a decision tree as base estimator) 'ET' for Extra-Trees 'RF' for Random Forest 'AdaB' for AdaBoost (uses a decision tree as base estimator) 'GBM' for Gradient Boosting Machine 'XGB' for XGBoost (only available if package is installed) 'LGB' for LightGBM (only available if package is installed) 'CatB' for CatBoost (only available if package is installed) 'lSVM' for Linear Support Vector Machine (uses a one-vs-rest strategy for multiclass classification) 'kSVM' for Kernel Support Vector Machine (uses a one-vs-one strategy for multiclass classification) 'PA' for Passive Aggressive 'SGD' for Stochastic Gradient Descent 'MLP' for Multilayer Perceptron (can have between one and three hidden layers) metric: str, callable or sequence, optional (default=None) Metric(s) on which the pipeline fits the models. Choose from any of sklearn's predefined scorers , use a score (or loss) function with signature metric(y, y_pred, **kwargs) or use a scorer object. If multiple metrics are selected, only the first will be used to optimize the BO. If None, a default metric is selected: 'f1' for binary classification 'f1_weighted' for multiclass classification 'r2' for regression greater_is_better: bool or sequence, optional (default=True) Whether the metric is a score function or a loss function, i.e. 
if True, a higher score is better and if False, lower is better. Will be ignored if the metric is a string or a scorer. If sequence, the n-th value will apply to the n-th metric in the pipeline. needs_proba: bool or sequence, optional (default=False) Whether the metric function requires probability estimates out of a classifier. If True, make sure that every model in the pipeline has a predict_proba method. Will be ignored if the metric is a string or a scorer. If sequence, the n-th value will apply to the n-th metric in the pipeline. needs_threshold: bool or sequence, optional (default=False) Whether the metric function takes a continuous decision certainty. This only works for binary classification using estimators that have either a decision_function or predict_proba method. Will be ignored if the metric is a string or a scorer. If sequence, the n-th value will apply to the n-th metric in the pipeline. n_calls: int or sequence, optional (default=0) Maximum number of iterations of the BO (including n_initial_points ). If 0, skip the BO and fit the model on its default parameters. If sequence, the n-th value will apply to the n-th model in the pipeline. n_initial_points: int or sequence, optional (default=5) Initial number of random tests of the BO before fitting the surrogate function. If equal to n_calls , the optimizer will technically be performing a random search. If sequence, the n-th value will apply to the n-th model in the pipeline. bo_params: dict, optional (default={}) Dictionary of extra keyword arguments for the BO. These can include: base_estimator: str, optional (default='GP') Base estimator to use in the BO. Choose from: 'GP' for Gaussian Process 'RF' for Random Forest 'ET' for Extra-Trees 'GBRT' for Gradient Boosted Regression Trees max_time: int, optional (default=np.inf) Stop the optimization after max_time seconds. delta_x: int or float, optional (default=0) Stop the optimization when |x1 - x2| < delta_x . 
delta_y: int or float, optional (default=0) Stop the optimization if the 5 minima are within delta_y . cv: int, optional (default=5) Number of folds for the cross-validation. If 1, the training set will be randomly split in a subtrain and validation set. early_stopping: int, float or None, optional (default=None) Training will stop if the model didn't improve in the last early_stopping rounds. If <1, fraction of rounds from the total. If None, no early stopping is performed. Only available for models that allow in-training evaluation. callback: callable or list of callables, optional (default=None) Callbacks for the BO. dimensions: dict, array or None, optional (default=None) Custom hyperparameter space for the bayesian optimization. Can be an array (only if there is 1 model in the pipeline) or a dictionary with the model's name as key. If None, ATOM's predefined dimensions are used. plot_bo: bool, optional (default=False) Whether to plot the BO's progress as it runs. Creates a canvas with two plots: the first plot shows the score of every trial and the second shows the distance between the last consecutive steps. Don't forget to call %matplotlib at the start of the cell if you are using an interactive notebook! Additional keyword arguments for skopt's optimizer. bagging: int, sequence or None, optional (default=None) Number of data sets (bootstrapped from the training set) to use in the bagging algorithm. If None or 0, no bagging is performed. If sequence, the n-th value will apply to the n-th model in the pipeline. n_jobs: int, optional (default=1) Number of cores to use for parallel processing. If >0: Number of cores to use. If -1: Use all available cores. If <-1: Use available_cores - 1 + n_jobs. Beware that using multiple processes on the same machine may cause memory issues for large datasets. verbose: int, optional (default=0) Verbosity level of the class. Possible values are: 0 to not print anything. 1 to print basic information. 2 to print detailed information.
logger: bool, str, class or None, optional (default=None) If None: Doesn't save a logging file. If bool: True for logging file with default name. False for no logger. If str: Name of the logging file. 'auto' to create an automatic name. If class: python Logger object. random_state: int or None, optional (default=None) Seed used by the random number generator. If None, the random number generator is the RandomState instance used by numpy.random . Attributes Data attributes The instance's dataset can be accessed through multiple attributes, e.g. calling trainer.train will return the training set. The data can also be changed through these attributes, e.g. trainer.test = trainer.test.drop(0) will drop the first row from the test set. Doing this will automatically update the other data attributes. Attributes: dataset: pd.DataFrame Complete dataset in the pipeline. train: pd.DataFrame Training set. test: pd.DataFrame Test set. X: pd.DataFrame Feature set. y: pd.Series Target column. X_train: pd.DataFrame Training features. y_train: pd.Series Training target. X_test: pd.DataFrame Test features. y_test: pd.Series Test target. shape: tuple Dataset's shape in the form (rows x columns). columns: list List of columns in the dataset. target: str Name of the target column. Utility attributes Attributes: models: list List of models in the pipeline. metric: str or list Metric(s) used to fit the models. errors: dict Dictionary of the encountered exceptions (if any). winner: model Model subclass that performed best on the test set. results: pd.DataFrame Dataframe of the training results with the model acronyms as index. Columns can include: name: Name of the model. metric_bo: Best score achieved during the BO. time_bo: Time spent on the BO. metric_train: Metric score on the training set. metric_test: Metric score on the test set. time_fit: Time spent fitting and evaluating. mean_bagging: Mean score of the bagging's results. 
std_bagging: Standard deviation score of the bagging's results. time_bagging: Time spent on the bagging algorithm. time: Total time spent on the whole run. Plot attributes Attributes: style: str Plotting style. See seaborn's documentation . palette: str Color palette. See seaborn's documentation . title_fontsize: int Fontsize for the plot's title. label_fontsize: int Fontsize for labels and legends. tick_fontsize: int Fontsize for the ticks along the plot's axes. Methods clear Remove a model from the pipeline. get_params Get parameters for this estimator. log Save information to the logger and print to stdout. run Fit and evaluate the models. save Save the instance to a pickle file. scoring Print the scoring of the models for a specific metric. set_params Set the parameters of this estimator. method clear (models='all') [source] Removes all traces of a model in the pipeline (except for the errors attribute). If all models in the pipeline are removed, the metric is reset. Use this method to remove unwanted models from the pipeline or to clear memory before saving the instance. Parameters: models: str or sequence, optional (default='all') Model(s) to clear from the pipeline. If 'all', clear all models. method get_params (deep=True) [source] Get parameters for this estimator. Parameters: deep: bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators. Returns: params: dict Dictionary of the parameter names mapped to their values. method log (msg, level=0) [source] Write a message to the logger and print it to stdout. Parameters: msg: str Message to write to the logger and print to stdout. level: int, optional (default=0) Minimum verbosity level in order to print the message. method run (*arrays) [source] Fit and evaluate the models. Parameters: *arrays: array-like Either a train and test set or X_train, X_test, y_train, y_test. 
method save (filename=None, save_data=True) [source] Save the instance to a pickle file. Remember that the class contains the complete dataset as property, so the file can become large for big datasets! To avoid this, use save_data=False . Parameters: filename: str or None, optional (default=None) Name to save the file with. If None or 'auto', use the name of the class. save_data: bool, optional (default=True) Whether to save the data as an attribute of the instance. If False, remember to update the data immediately after loading the pickle using the dataset's @setter . method scoring (metric=None, dataset='test') [source] Print the scoring of the models for a specific metric. If a model shows a XXX , it means the metric failed for that specific model. This can happen if either the metric is unavailable for the task or if the model does not have a predict_proba method while the metric requires it. Parameters: metric: str or None, optional (default=None) Name of the metric to calculate. Choose from any of sklearn's SCORERS . If None, returns the final results for the model (ignores the dataset parameter). dataset: str, optional (default='test') Data set on which to calculate the metric. Options are 'train' or 'test'. method set_params (**params) [source] Set the parameters of this estimator. Parameters: **params: dict Estimator parameters. Returns: self: TrainerRegressor Estimator instance. Example from atom.training import TrainerRegressor # Run the pipeline trainer = TrainerRegressor(['OLS', 'BR'], n_calls=5, n_initial_points=3, bagging=5) trainer.run(train, test) # Analyze the results trainer.scoring('mse') trainer.plot_bagging()","title":"TrainerRegressor"},{"location":"API/training/trainerregressor/#trainerregressor","text":"class atom.training. 
TrainerRegressor (models, metric=None, greater_is_better=True, needs_proba=False, needs_threshold=False, n_calls=10, n_initial_points=5, bo_params={}, bagging=None, n_jobs=1, verbose=0, logger=None, random_state=None) [source] Fits and evaluates the models on the data in the pipeline. The following steps are applied: The optimal hyperparameters are selected using a bayesian optimization algorithm. The model is fitted on the training set using the best combinations of hyperparameters found. Using a bagging algorithm, various scores on the test set are calculated. Just like atom , you can predict , plot and call any model from the TrainerRegressor instance. Read more in the user guide . Parameters: models: str or sequence List of models to fit on the data. Use the predefined acronyms to select the models. Possible values are (case insensitive): 'GP' for Gaussian Process (no hyperparameter tuning) 'OLS' for Ordinary Least Squares (no hyperparameter tuning) 'Ridge' for Ridge Linear Regression 'Lasso' for Lasso Linear Regression 'EN' for ElasticNet Linear Regression 'BR' for Bayesian Regression (uses ridge regularization) 'KNN' for K-Nearest Neighbors 'Tree' for a single Decision Tree 'Bag' for Bagging (uses a decision tree as base estimator) 'ET' for Extra-Trees 'RF' for Random Forest 'AdaB' for AdaBoost (uses a decision tree as base estimator) 'GBM' for Gradient Boosting Machine 'XGB' for XGBoost (only available if package is installed) 'LGB' for LightGBM (only available if package is installed) 'CatB' for CatBoost (only available if package is installed) 'lSVM' for Linear Support Vector Machine (uses a one-vs-rest strategy for multiclass classification) 'kSVM' for Kernel Support Vector Machine (uses a one-vs-one strategy for multiclass classification) 'PA' for Passive Aggressive 'SGD' for Stochastic Gradient Descent 'MLP' for Multilayer Perceptron (can have between one and three hidden layers) metric: str, callable or sequence, optional (default=None) Metric(s) on
which the pipeline fits the models. Choose from any of sklearn's predefined scorers , use a score (or loss) function with signature metric(y, y_pred, **kwargs) or use a scorer object. If multiple metrics are selected, only the first will be used to optimize the BO. If None, a default metric is selected: 'f1' for binary classification 'f1_weighted' for multiclass classification 'r2' for regression greater_is_better: bool or sequence, optional (default=True) Whether the metric is a score function or a loss function, i.e. if True, a higher score is better and if False, lower is better. Will be ignored if the metric is a string or a scorer. If sequence, the n-th value will apply to the n-th metric in the pipeline. needs_proba: bool or sequence, optional (default=False) Whether the metric function requires probability estimates out of a classifier. If True, make sure that every model in the pipeline has a predict_proba method. Will be ignored if the metric is a string or a scorer. If sequence, the n-th value will apply to the n-th metric in the pipeline. needs_threshold: bool or sequence, optional (default=False) Whether the metric function takes a continuous decision certainty. This only works for binary classification using estimators that have either a decision_function or predict_proba method. Will be ignored if the metric is a string or a scorer. If sequence, the n-th value will apply to the n-th metric in the pipeline. n_calls: int or sequence, optional (default=0) Maximum number of iterations of the BO (including n_initial_points ). If 0, skip the BO and fit the model on its default parameters. If sequence, the n-th value will apply to the n-th model in the pipeline. n_initial_points: int or sequence, optional (default=5) Initial number of random tests of the BO before fitting the surrogate function. If equal to n_calls , the optimizer will technically be performing a random search. If sequence, the n-th value will apply to the n-th model in the pipeline. 
bo_params: dict, optional (default={}) Dictionary of extra keyword arguments for the BO. These can include: base_estimator: str, optional (default='GP') Base estimator to use in the BO. Choose from: 'GP' for Gaussian Process 'RF' for Random Forest 'ET' for Extra-Trees 'GBRT' for Gradient Boosted Regression Trees max_time: int, optional (default=np.inf) Stop the optimization after max_time seconds. delta_x: int or float, optional (default=0) Stop the optimization when |x1 - x2| < delta_x . delta_y: int or float, optional (default=0) Stop the optimization if the 5 minima are within delta_y . cv: int, optional (default=5) Number of folds for the cross-validation. If 1, the training set will be randomly split in a subtrain and validation set. early_stopping: int, float or None, optional (default=None) Training will stop if the model didn't improve in the last early_stopping rounds. If <1, fraction of rounds from the total. If None, no early stopping is performed. Only available for models that allow in-training evaluation. callback: callable or list of callables, optional (default=None) Callbacks for the BO. dimensions: dict, array or None, optional (default=None) Custom hyperparameter space for the bayesian optimization. Can be an array (only if there is 1 model in the pipeline) or a dictionary with the model's name as key. If None, ATOM's predefined dimensions are used. plot_bo: bool, optional (default=False) Whether to plot the BO's progress as it runs. Creates a canvas with two plots: the first plot shows the score of every trial and the second shows the distance between the last consecutive steps. Don't forget to call %matplotlib at the start of the cell if you are using an interactive notebook! Additional keyword arguments for skopt's optimizer. bagging: int, sequence or None, optional (default=None) Number of data sets (bootstrapped from the training set) to use in the bagging algorithm. If None or 0, no bagging is performed.
If sequence, the n-th value will apply to the n-th model in the pipeline. n_jobs: int, optional (default=1) Number of cores to use for parallel processing. If >0: Number of cores to use. If -1: Use all available cores. If <-1: Use available_cores - 1 + n_jobs. Beware that using multiple processes on the same machine may cause memory issues for large datasets. verbose: int, optional (default=0) Verbosity level of the class. Possible values are: 0 to not print anything. 1 to print basic information. 2 to print detailed information. logger: bool, str, class or None, optional (default=None) If None: Doesn't save a logging file. If bool: True for logging file with default name. False for no logger. If str: Name of the logging file. 'auto' to create an automatic name. If class: python Logger object. random_state: int or None, optional (default=None) Seed used by the random number generator. If None, the random number generator is the RandomState instance used by numpy.random .","title":"TrainerRegressor"},{"location":"API/training/trainerregressor/#attributes","text":"","title":"Attributes"},{"location":"API/training/trainerregressor/#data-attributes","text":"The instance's dataset can be accessed through multiple attributes, e.g. calling trainer.train will return the training set. The data can also be changed through these attributes, e.g. trainer.test = trainer.test.drop(0) will drop the first row from the test set. Doing this will automatically update the other data attributes. Attributes: dataset: pd.DataFrame Complete dataset in the pipeline. train: pd.DataFrame Training set. test: pd.DataFrame Test set. X: pd.DataFrame Feature set. y: pd.Series Target column. X_train: pd.DataFrame Training features. y_train: pd.Series Training target. X_test: pd.DataFrame Test features. y_test: pd.Series Test target. shape: tuple Dataset's shape in the form (rows x columns). columns: list List of columns in the dataset. 
target: str Name of the target column.","title":"Data attributes"},{"location":"API/training/trainerregressor/#utility-attributes","text":"Attributes: models: list List of models in the pipeline. metric: str or list Metric(s) used to fit the models. errors: dict Dictionary of the encountered exceptions (if any). winner: model Model subclass that performed best on the test set. results: pd.DataFrame Dataframe of the training results with the model acronyms as index. Columns can include: name: Name of the model. metric_bo: Best score achieved during the BO. time_bo: Time spent on the BO. metric_train: Metric score on the training set. metric_test: Metric score on the test set. time_fit: Time spent fitting and evaluating. mean_bagging: Mean score of the bagging's results. std_bagging: Standard deviation score of the bagging's results. time_bagging: Time spent on the bagging algorithm. time: Total time spent on the whole run.","title":"Utility attributes"},{"location":"API/training/trainerregressor/#plot-attributes","text":"Attributes: style: str Plotting style. See seaborn's documentation . palette: str Color palette. See seaborn's documentation . title_fontsize: int Fontsize for the plot's title. label_fontsize: int Fontsize for labels and legends. tick_fontsize: int Fontsize for the ticks along the plot's axes.","title":"Plot attributes"},{"location":"API/training/trainerregressor/#methods","text":"clear Remove a model from the pipeline. get_params Get parameters for this estimator. log Save information to the logger and print to stdout. run Fit and evaluate the models. save Save the instance to a pickle file. scoring Print the scoring of the models for a specific metric. set_params Set the parameters of this estimator. method clear (models='all') [source] Removes all traces of a model in the pipeline (except for the errors attribute). If all models in the pipeline are removed, the metric is reset. 
Use this method to remove unwanted models from the pipeline or to clear memory before saving the instance. Parameters: models: str or sequence, optional (default='all') Model(s) to clear from the pipeline. If 'all', clear all models. method get_params (deep=True) [source] Get parameters for this estimator. Parameters: deep: bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators. Returns: params: dict Dictionary of the parameter names mapped to their values. method log (msg, level=0) [source] Write a message to the logger and print it to stdout. Parameters: msg: str Message to write to the logger and print to stdout. level: int, optional (default=0) Minimum verbosity level in order to print the message. method run (*arrays) [source] Fit and evaluate the models. Parameters: *arrays: array-like Either a train and test set or X_train, X_test, y_train, y_test. method save (filename=None, save_data=True) [source] Save the instance to a pickle file. Remember that the class contains the complete dataset as property, so the file can become large for big datasets! To avoid this, use save_data=False . Parameters: filename: str or None, optional (default=None) Name to save the file with. If None or 'auto', use the name of the class. save_data: bool, optional (default=True) Whether to save the data as an attribute of the instance. If False, remember to update the data immediately after loading the pickle using the dataset's @setter . method scoring (metric=None, dataset='test') [source] Print the scoring of the models for a specific metric. If a model shows a XXX , it means the metric failed for that specific model. This can happen if either the metric is unavailable for the task or if the model does not have a predict_proba method while the metric requires it. Parameters: metric: str or None, optional (default=None) Name of the metric to calculate. Choose from any of sklearn's SCORERS . 
If None, returns the final results for the model (ignores the dataset parameter). dataset: str, optional (default='test') Data set on which to calculate the metric. Options are 'train' or 'test'. method set_params (**params) [source] Set the parameters of this estimator. Parameters: **params: dict Estimator parameters. Returns: self: TrainerRegressor Estimator instance.","title":"Methods"},{"location":"API/training/trainerregressor/#example","text":"from atom.training import TrainerRegressor # Run the pipeline trainer = TrainerRegressor(['OLS', 'BR'], n_calls=5, n_initial_points=3, bagging=5) trainer.run(train, test) # Analyze the results trainer.scoring('mse') trainer.plot_bagging()","title":"Example"},{"location":"API/training/trainsizingclassifier/","text":"TrainSizingClassifier class atom.training. TrainSizingClassifier (models, metric=None, greater_is_better=True, needs_proba=False, needs_threshold=False, train_sizes=np.linspace(0.2, 1.0, 5), n_calls=10, n_initial_points=5, bo_params={}, bagging=None, n_jobs=1, verbose=0, logger=None, random_state=None) [source] Fit and evaluate the models in a train sizing fashion. The pipeline applies the following steps per iteration: The optimal hyperparameters are selected using a bayesian optimization algorithm. The model is fitted on the training set using the best combinations of hyperparameters found. Using a bagging algorithm, various scores on the test set are calculated. Just like atom , you can predict , plot and call any model from the TrainSizingClassifier instance. Read more in the user guide . Parameters: models: str or sequence Models to fit on the data. Use the predefined acronyms to select the models.
Possible values are (case insensitive): 'GP' for Gaussian Process (no hyperparameter tuning) 'GNB' for Gaussian Naive Bayes (no hyperparameter tuning) 'MNB' for Multinomial Naive Bayes 'BNB' for Bernoulli Naive Bayes 'Ridge' for Ridge Linear Classification 'LR' for Logistic Regression 'LDA' for Linear Discriminant Analysis 'QDA' for Quadratic Discriminant Analysis 'KNN' for K-Nearest Neighbors 'Tree' for a single Decision Tree 'Bag' for Bagging (uses a decision tree as base estimator) 'ET' for Extra-Trees 'RF' for Random Forest 'AdaB' for AdaBoost (uses a decision tree as base estimator) 'GBM' for Gradient Boosting Machine 'XGB' for XGBoost (only available if package is installed) 'LGB' for LightGBM (only available if package is installed) 'CatB' for CatBoost (only available if package is installed) 'lSVM' for Linear Support Vector Machine (uses a one-vs-rest strategy for multiclass classification) 'kSVM' for Kernel Support Vector Machine (uses a one-vs-one strategy for multiclass classification) 'PA' for Passive Aggressive 'SGD' for Stochastic Gradient Descent 'MLP' for Multilayer Perceptron (can have between one and three hidden layers) metric: str, callable or sequence, optional (default=None) Metric(s) on which the pipeline fits the models. Choose from any of sklearn's predefined scorers , use a score (or loss) function with signature metric(y, y_pred, **kwargs) or use a scorer object. If multiple metrics are selected, only the first will be used to optimize the BO. If None, a default metric is selected: 'f1' for binary classification 'f1_weighted' for multiclass classification 'r2' for regression greater_is_better: bool or sequence, optional (default=True) Whether the metric is a score function or a loss function, i.e. if True, a higher score is better and if False, lower is better. Will be ignored if the metric is a string or a scorer. If sequence, the n-th value will apply to the n-th metric in the pipeline. 
needs_proba: bool or sequence, optional (default=False) Whether the metric function requires probability estimates out of a classifier. If True, make sure that every model in the pipeline has a predict_proba method. Will be ignored if the metric is a string or a scorer. If sequence, the n-th value will apply to the n-th metric in the pipeline. needs_threshold: bool or sequence, optional (default=False) Whether the metric function takes a continuous decision certainty. This only works for binary classification using estimators that have either a decision_function or predict_proba method. Will be ignored if the metric is a string or a scorer. If sequence, the n-th value will apply to the n-th metric in the pipeline. train_sizes: sequence, optional (default=np.linspace(0.2, 1.0, 5)) Relative or absolute numbers of training examples that will be used to generate the learning curve. If the value is <=1, it is interpreted as a fraction of the maximum size of the training set. If the value is > 1, it is interpreted as the total number of samples in the set. n_calls: int or sequence, optional (default=0) Maximum number of iterations of the BO (including n_initial_points ). If 0, skip the BO and fit the model on its default parameters. If sequence, the n-th value will apply to the n-th model in the pipeline. n_initial_points: int or sequence, optional (default=5) Initial number of random tests of the BO before fitting the surrogate function. If equal to n_calls , the optimizer will technically be performing a random search. If sequence, the n-th value will apply to the n-th model in the pipeline. bo_params: dict, optional (default={}) Dictionary of extra keyword arguments for the BO. These can include: base_estimator: str, optional (default='GP') Base estimator to use in the BO. 
Choose from: 'GP' for Gaussian Process 'RF' for Random Forest 'ET' for Extra-Trees 'GBRT' for Gradient Boosted Regression Trees max_time: int, optional (default=np.inf) Stop the optimization after max_time seconds. delta_x: int or float, optional (default=0) Stop the optimization when |x1 - x2| < delta_x . delta_y: int or float, optional (default=0) Stop the optimization if the 5 minima are within delta_y . cv: int, optional (default=5) Number of folds for the cross-validation. If 1, the training set will be randomly split in a subtrain and validation set. early_stopping: int, float or None, optional (default=None) Training will stop if the model didn't improve in the last early_stopping rounds. If <1, fraction of rounds from the total. If None, no early stopping is performed. Only available for models that allow in-training evaluation. callback: callable or list of callables, optional (default=None) Callbacks for the BO. dimensions: dict, array or None, optional (default=None) Custom hyperparameter space for the bayesian optimization. Can be an array (only if there is 1 model in the pipeline) or a dictionary with the model's name as key. If None, ATOM's predefined dimensions are used. plot_bo: bool, optional (default=False) Whether to plot the BO's progress as it runs. Creates a canvas with two plots: the first plot shows the score of every trial and the second shows the distance between the last consecutive steps. Don't forget to call %matplotlib at the start of the cell if you are using an interactive notebook! Additional keyword arguments for skopt's optimizer. bagging: int, sequence or None, optional (default=None) Number of data sets (bootstrapped from the training set) to use in the bagging algorithm. If None or 0, no bagging is performed. If sequence, the n-th value will apply to the n-th model in the pipeline. n_jobs: int, optional (default=1) Number of cores to use for parallel processing. If >0: Number of cores to use. If -1: Use all available cores.
If <-1: Use available_cores - 1 + n_jobs. Beware that using multiple processes on the same machine may cause memory issues for large datasets. verbose: int, optional (default=0) Verbosity level of the class. Possible values are: 0 to not print anything. 1 to print basic information. 2 to print detailed information. logger: bool, str, class or None, optional (default=None) If None: Doesn't save a logging file. If bool: True for logging file with default name. False for no logger. If str: Name of the logging file. 'auto' to create an automatic name. If class: python Logger object. random_state: int or None, optional (default=None) Seed used by the random number generator. If None, the random number generator is the RandomState instance used by numpy.random . Attributes Data attributes The instance's dataset can be accessed through multiple attributes, e.g. calling trainer.train will return the training set. The data can also be changed through these attributes, e.g. trainer.test = trainer.test.drop(0) will drop the first row from the test set. Doing this will automatically update the other data attributes. Attributes: dataset: pd.DataFrame Complete dataset in the pipeline. train: pd.DataFrame Training set. test: pd.DataFrame Test set. X: pd.DataFrame Feature set. y: pd.Series Target column. X_train: pd.DataFrame Training features. y_train: pd.Series Training target. X_test: pd.DataFrame Test features. y_test: pd.Series Test target. shape: tuple Dataset's shape in the form (rows x columns). columns: list List of columns in the dataset. target: str Name of the target column. categories: list Sorted list of the unique categories in the target column. n_categories: int Number of unique categories in the target column. Utility attributes Attributes: models: list List of models in the pipeline. metric: str or list Metric(s) used to fit the models. errors: dict Dictionary of the encountered exceptions (if any). 
winner: model Model subclass that performed best on the test set. results: pd.DataFrame Dataframe of the training results with the model acronyms as index. Columns can include: name: Name of the model. metric_bo: Best score achieved during the BO. time_bo: Time spent on the BO. metric_train: Metric score on the training set. metric_test: Metric score on the test set. time_fit: Time spent fitting and evaluating. mean_bagging: Mean score of the bagging's results. std_bagging: Standard deviation score of the bagging's results. time_bagging: Time spent on the bagging algorithm. time: Total time spent on the whole run. Plot attributes Attributes: style: str Plotting style. See seaborn's documentation . palette: str Color palette. See seaborn's documentation . title_fontsize: int Fontsize for the plot's title. label_fontsize: int Fontsize for labels and legends. tick_fontsize: int Fontsize for the ticks along the plot's axes. Methods calibrate Calibrate the winning model. clear Remove a model from the pipeline. get_params Get parameters for this estimator. log Save information to the logger and print to stdout. run Fit and evaluate the models. save Save the instance to a pickle file. scoring Print the scoring of the models for a specific metric. set_params Set the parameters of this estimator. method calibrate (**kwargs) [source] Applies probability calibration on the winning model. The calibration is done with the CalibratedClassifierCV class from sklearn. The model will be trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the estimator attribute. After calibrating, all prediction attributes of the winning model will reset. Parameters: **kwargs Additional keyword arguments for the CalibratedClassifierCV instance. Using cv='prefit' will use the trained model and fit the calibrator on the test set. Note that doing this will result in data leakage in the test set. 
Use this only if you have another, independent set for testing. method clear (models='all') [source] Removes all traces of a model in the pipeline (except for the errors attribute). If all models in the pipeline are removed, the metric is reset. Use this method to remove unwanted models from the pipeline or to clear memory before saving the instance. Parameters: models: str or sequence, optional (default='all') Model(s) to clear from the pipeline. If 'all', clear all models. method get_params (deep=True) [source] Get parameters for this estimator. Parameters: deep: bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators. Returns: params: dict Dictionary of the parameter names mapped to their values. method log (msg, level=0) [source] Write a message to the logger and print it to stdout. Parameters: msg: str Message to write to the logger and print to stdout. level: int, optional (default=0) Minimum verbosity level in order to print the message. method run (*arrays) [source] Fit and evaluate the models. Parameters: *arrays: array-like Either a train and test set or X_train, X_test, y_train, y_test. method save (filename=None, save_data=True) [source] Save the instance to a pickle file. Remember that the class contains the complete dataset as property, so the file can become large for big datasets! To avoid this, use save_data=False . Parameters: filename: str or None, optional (default=None) Name to save the file with. If None or 'auto', use default name (TrainSizingClassifier). save_data: bool, optional (default=True) Whether to save the data as an attribute of the instance. If False, remember to update the data immediately after loading the pickle using the dataset's @setter . method scoring (metric=None, dataset='test') [source] Print the scoring of the models for a specific metric. If a model shows a XXX , it means the metric failed for that specific model. 
This can happen if either the metric is unavailable for the task or if the model does not have a predict_proba method while the metric requires it. Parameters: metric: str or None, optional (default=None) Name of the metric to calculate. Choose from any of sklearn's SCORERS or one of the following custom metrics: 'cm' or 'confusion_matrix' for an array of the confusion matrix. 'tn' for true negatives. 'fp' for false positives. 'fn' for false negatives. 'lift' for the lift metric. 'fpr' for the false positive rate. 'tpr' for true positive rate. 'sup' for the support metric. If None, returns the final results for the model (ignores the dataset parameter). dataset: str, optional (default='test') Data set on which to calculate the metric. Options are 'train' or 'test'. method set_params (**params) [source] Set the parameters of this estimator. Parameters: **params: dict Estimator parameters. Returns: self: TrainSizingClassifier Estimator instance. Example from atom.training import TrainSizingClassifier # Run the pipeline trainer = TrainSizingClassifier('RF', n_calls=5, n_initial_points=3) trainer.run(train, test) # Analyze the results trainer.plot_learning_curve()","title":"TrainSizingClassifier"},{"location":"API/training/trainsizingclassifier/#trainsizingclassifier","text":"class atom.training. TrainSizingClassifier (models, metric=None, greater_is_better=True, needs_proba=False, needs_threshold=False, train_sizes=np.linspace(0.2, 1.0, 5), n_calls=10, n_random_points=5, bo_params={}, bagging=None, n_jobs=1, verbose=0, logger=None, random_state=None) [source] Fit and evaluate the models in a train sizing fashion. The pipeline applies the following steps per iteration: The optimal hyperparameters are selected using a bayesian optimization algorithm. The model is fitted on the training set using the best combinations of hyperparameters found. Using a bagging algorithm, various scores on the test set are calculated. 
Just like atom , you can predict , plot and call any model from the TrainSizingClassifier instance. Read more in the user guide . Parameters: models: str or sequence Models to fit on the data. Use the predefined acronyms to select the models. Possible values are (case insensitive): 'GP' for Gaussian Process (no hyperparameter tuning) 'GNB' for Gaussian Naive Bayes (no hyperparameter tuning) 'MNB' for Multinomial Naive Bayes 'BNB' for Bernoulli Naive Bayes 'Ridge' for Ridge Linear Classification 'LR' for Logistic Regression 'LDA' for Linear Discriminant Analysis 'QDA' for Quadratic Discriminant Analysis 'KNN' for K-Nearest Neighbors 'Tree' for a single Decision Tree 'Bag' for Bagging (uses a decision tree as base estimator) 'ET' for Extra-Trees 'RF' for Random Forest 'AdaB' for AdaBoost (uses a decision tree as base estimator) 'GBM' for Gradient Boosting Machine 'XGB' for XGBoost (only available if package is installed) 'LGB' for LightGBM (only available if package is installed) 'CatB' for CatBoost (only available if package is installed) 'lSVM' for Linear Support Vector Machine (uses a one-vs-rest strategy for multiclass classification) 'kSVM' for Kernel Support Vector Machine (uses a one-vs-one strategy for multiclass classification) 'PA' for Passive Aggressive 'SGD' for Stochastic Gradient Descent 'MLP' for Multilayer Perceptron (can have between one and three hidden layers) metric: str, callable or sequence, optional (default=None) Metric(s) on which the pipeline fits the models. Choose from any of sklearn's predefined scorers , use a score (or loss) function with signature metric(y, y_pred, **kwargs) or use a scorer object. If multiple metrics are selected, only the first will be used to optimize the BO. If None, a default metric is selected: 'f1' for binary classification 'f1_weighted' for multiclass classification 'r2' for regression greater_is_better: bool or sequence, optional (default=True) Whether the metric is a score function or a loss function, i.e. 
if True, a higher score is better and if False, lower is better. Will be ignored if the metric is a string or a scorer. If sequence, the n-th value will apply to the n-th metric in the pipeline. needs_proba: bool or sequence, optional (default=False) Whether the metric function requires probability estimates out of a classifier. If True, make sure that every model in the pipeline has a predict_proba method. Will be ignored if the metric is a string or a scorer. If sequence, the n-th value will apply to the n-th metric in the pipeline. needs_threshold: bool or sequence, optional (default=False) Whether the metric function takes a continuous decision certainty. This only works for binary classification using estimators that have either a decision_function or predict_proba method. Will be ignored if the metric is a string or a scorer. If sequence, the n-th value will apply to the n-th metric in the pipeline. train_sizes: sequence, optional (default=np.linspace(0.2, 1.0, 5)) Relative or absolute numbers of training examples that will be used to generate the learning curve. If the value is <=1, it is interpreted as a fraction of the maximum size of the training set. If the value is > 1, it is interpreted as the total number of samples in the set. n_calls: int or sequence, optional (default=0) Maximum number of iterations of the BO (including n_initial_points ). If 0, skip the BO and fit the model on its default parameters. If sequence, the n-th value will apply to the n-th model in the pipeline. n_initial_points: int or sequence, optional (default=5) Initial number of random tests of the BO before fitting the surrogate function. If equal to n_calls , the optimizer will technically be performing a random search. If sequence, the n-th value will apply to the n-th model in the pipeline. bo_params: dict, optional (default={}) Dictionary of extra keyword arguments for the BO. These can include: base_estimator: str, optional (default='GP') Base estimator to use in the BO. 
Choose from: 'GP' for Gaussian Process 'RF' for Random Forest 'ET' for Extra-Trees 'GBRT' for Gradient Boosted Regression Trees max_time: int, optional (default=np.inf) Stop the optimization after max_time seconds. delta_x: int or float, optional (default=0) Stop the optimization when |x1 - x2| < delta_x . delta_y: int or float, optional (default=0) Stop the optimization if the 5 minima are within delta_y . cv: int, optional (default=5) Number of folds for the cross-validation. If 1, the training set will be randomly split in a subtrain and validation set. early stopping: int, float or None, optional (default=None) Training will stop if the model didn't improve in last early_stopping rounds. If <1, fraction of rounds from the total. If None, no early stopping is performed. Only available for models that allow in-training evaluation. callback: callable or list of callables, optional (default=None) Callbacks for the BO. dimensions: dict, array or None, optional (default=None) Custom hyperparameter space for the bayesian optimization. Can be an array (only if there is 1 model in the pipeline) or a dictionary with the model's name as key. If None, ATOM's predefined dimensions are used. plot_bo: bool, optional (default=False) Whether to plot the BO's progress as it runs. Creates a canvas with two plots: the first plot shows the score of every trial and the second shows the distance between the last consecutive steps. Don't forget to call %matplotlib at the start of the cell if you are using an interactive notebook! Additional keyword argument for skopt's optimizer. bagging: int or None, optional (default=None) Number of data sets (bootstrapped from the training set) to use in the bagging algorithm. If None or 0, no bagging is performed. If sequence, the n-th value will apply to the n-th model in the pipeline. n_jobs: int, optional (default=1) Number of cores to use for parallel processing. If >0: Number of cores to use. If -1: Use all available cores. 
If <-1: Use available_cores - 1 + n_jobs. Beware that using multiple processes on the same machine may cause memory issues for large datasets. verbose: int, optional (default=0) Verbosity level of the class. Possible values are: 0 to not print anything. 1 to print basic information. 2 to print detailed information. logger: bool, str, class or None, optional (default=None) If None: Doesn't save a logging file. If bool: True for logging file with default name. False for no logger. If str: Name of the logging file. 'auto' to create an automatic name. If class: python Logger object. random_state: int or None, optional (default=None) Seed used by the random number generator. If None, the random number generator is the RandomState instance used by numpy.random .","title":"TrainSizingClassifier"},{"location":"API/training/trainsizingclassifier/#attributes","text":"","title":"Attributes"},{"location":"API/training/trainsizingclassifier/#data-attributes","text":"The instance's dataset can be accessed through multiple attributes, e.g. calling trainer.train will return the training set. The data can also be changed through these attributes, e.g. trainer.test = trainer.test.drop(0) will drop the first row from the test set. Doing this will automatically update the other data attributes. Attributes: dataset: pd.DataFrame Complete dataset in the pipeline. train: pd.DataFrame Training set. test: pd.DataFrame Test set. X: pd.DataFrame Feature set. y: pd.Series Target column. X_train: pd.DataFrame Training features. y_train: pd.Series Training target. X_test: pd.DataFrame Test features. y_test: pd.Series Test target. shape: tuple Dataset's shape in the form (rows x columns). columns: list List of columns in the dataset. target: str Name of the target column. categories: list Sorted list of the unique categories in the target column. 
n_categories: int Number of unique categories in the target column.","title":"Data attributes"},{"location":"API/training/trainsizingclassifier/#utility-attributes","text":"Attributes: models: list List of models in the pipeline. metric: str or list Metric(s) used to fit the models. errors: dict Dictionary of the encountered exceptions (if any). winner: model Model subclass that performed best on the test set. results: pd.DataFrame Dataframe of the training results with the model acronyms as index. Columns can include: name: Name of the model. metric_bo: Best score achieved during the BO. time_bo: Time spent on the BO. metric_train: Metric score on the training set. metric_test: Metric score on the test set. time_fit: Time spent fitting and evaluating. mean_bagging: Mean score of the bagging's results. std_bagging: Standard deviation score of the bagging's results. time_bagging: Time spent on the bagging algorithm. time: Total time spent on the whole run.","title":"Utility attributes"},{"location":"API/training/trainsizingclassifier/#plot-attributes","text":"Attributes: style: str Plotting style. See seaborn's documentation . palette: str Color palette. See seaborn's documentation . title_fontsize: int Fontsize for the plot's title. label_fontsize: int Fontsize for labels and legends. tick_fontsize: int Fontsize for the ticks along the plot's axes.","title":"Plot attributes"},{"location":"API/training/trainsizingclassifier/#methods","text":"calibrate Calibrate the winning model. clear Remove a model from the pipeline. get_params Get parameters for this estimator. log Save information to the logger and print to stdout. run Fit and evaluate the models. save Save the instance to a pickle file. scoring Print the scoring of the models for a specific metric. set_params Set the parameters of this estimator. method calibrate (**kwargs) [source] Applies probability calibration on the winning model. The calibration is done with the CalibratedClassifierCV class from sklearn. 
The model will be trained via cross-validation on a subset of the training data, using the rest to fit the calibrator. The new classifier will replace the estimator attribute. After calibrating, all prediction attributes of the winning model will reset. Parameters: **kwargs Additional keyword arguments for the CalibratedClassifierCV instance. Using cv='prefit' will use the trained model and fit the calibrator on the test set. Note that doing this will result in data leakage in the test set. Use this only if you have another, independent set for testing. method clear (models='all') [source] Removes all traces of a model in the pipeline (except for the errors attribute). If all models in the pipeline are removed, the metric is reset. Use this method to remove unwanted models from the pipeline or to clear memory before saving the instance. Parameters: models: str or sequence, optional (default='all') Model(s) to clear from the pipeline. If 'all', clear all models. method get_params (deep=True) [source] Get parameters for this estimator. Parameters: deep: bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators. Returns: params: dict Dictionary of the parameter names mapped to their values. method log (msg, level=0) [source] Write a message to the logger and print it to stdout. Parameters: msg: str Message to write to the logger and print to stdout. level: int, optional (default=0) Minimum verbosity level in order to print the message. method run (*arrays) [source] Fit and evaluate the models. Parameters: *arrays: array-like Either a train and test set or X_train, X_test, y_train, y_test. method save (filename=None, save_data=True) [source] Save the instance to a pickle file. Remember that the class contains the complete dataset as property, so the file can become large for big datasets! To avoid this, use save_data=False . 
Parameters: filename: str or None, optional (default=None) Name to save the file with. If None or 'auto', use default name (TrainSizingClassifier). save_data: bool, optional (default=True) Whether to save the data as an attribute of the instance. If False, remember to update the data immediately after loading the pickle using the dataset's @setter . method scoring (metric=None, dataset='test') [source] Print the scoring of the models for a specific metric. If a model shows a XXX , it means the metric failed for that specific model. This can happen if either the metric is unavailable for the task or if the model does not have a predict_proba method while the metric requires it. Parameters: metric: str or None, optional (default=None) Name of the metric to calculate. Choose from any of sklearn's SCORERS or one of the following custom metrics: 'cm' or 'confusion_matrix' for an array of the confusion matrix. 'tn' for true negatives. 'fp' for false positives. 'fn' for false negatives. 'lift' for the lift metric. 'fpr' for the false positive rate. 'tpr' for true positive rate. 'sup' for the support metric. If None, returns the final results for the model (ignores the dataset parameter). dataset: str, optional (default='test') Data set on which to calculate the metric. Options are 'train' or 'test'. method set_params (**params) [source] Set the parameters of this estimator. Parameters: **params: dict Estimator parameters. Returns: self: TrainSizingClassifier Estimator instance.","title":"Methods"},{"location":"API/training/trainsizingclassifier/#example","text":"from atom.training import TrainSizingClassifier # Run the pipeline trainer = TrainSizingClassifier('RF', n_calls=5, n_initial_points=3) trainer.run(train, test) # Analyze the results trainer.plot_learning_curve()","title":"Example"},{"location":"API/training/trainsizingregressor/","text":"TrainSizingRegressor class atom.training. 
TrainSizingRegressor (models, metric=None, greater_is_better=True, needs_proba=False, needs_threshold=False, train_sizes=np.linspace(0.2, 1.0, 5), n_calls=10, n_random_points=5, bo_params={}, bagging=None, n_jobs=1, verbose=0, logger=None, random_state=None) [source] Fit and evaluate the models in a train sizing fashion. The pipeline applies the following steps per iteration: The optimal hyperparameters are selected using a bayesian optimization algorithm. The model is fitted on the training set using the best combinations of hyperparameters found. Using a bagging algorithm, various scores on the test set are calculated. Just like atom , you can predict , plot and call any model from the TrainSizingRegressor instance. Read more in the user guide . Parameters: models: str or sequence List of models to fit on the data. Use the predefined acronyms to select the models. Possible values are (case insensitive): 'GP' for Gaussian Process (no hyperparameter tuning) 'OLS' for Ordinary Least Squares (no hyperparameter tuning) 'Ridge' for Ridge Linear Regression 'Lasso' for Lasso Linear Regression 'EN' for ElasticNet Linear Regression 'BR' for Bayesian Regression (uses ridge regularization) 'KNN' for K-Nearest Neighbors 'Tree' for a single Decision Tree 'Bag' for Bagging (uses a decision tree as base estimator) 'ET' for Extra-Trees 'RF' for Random Forest 'AdaB' for AdaBoost (uses a decision tree as base estimator) 'GBM' for Gradient Boosting Machine 'XGB' for XGBoost (only available if package is installed) 'LGB' for LightGBM (only available if package is installed) 'CatB' for CatBoost (only available if package is installed) 'lSVM' for Linear Support Vector Machine (uses a one-vs-rest strategy for multiclass classification) 'kSVM' for Kernel Support Vector Machine (uses a one-vs-one strategy for multiclass classification) 'PA' for Passive Aggressive 'SGD' for Stochastic Gradient Descent 'MLP' for Multilayer Perceptron (can have between one and three hidden layers) metric: 
str, callable or sequence, optional (default=None) Metric(s) on which the pipeline fits the models. Choose from any of sklearn's predefined scorers , use a score (or loss) function with signature metric(y, y_pred, **kwargs) or use a scorer object. If multiple metrics are selected, only the first will be used to optimize the BO. If None, a default metric is selected: 'f1' for binary classification 'f1_weighted' for multiclass classification 'r2' for regression greater_is_better: bool or sequence, optional (default=True) Whether the metric is a score function or a loss function, i.e. if True, a higher score is better and if False, lower is better. Will be ignored if the metric is a string or a scorer. If sequence, the n-th value will apply to the n-th metric in the pipeline. needs_proba: bool or sequence, optional (default=False) Whether the metric function requires probability estimates out of a classifier. If True, make sure that every model in the pipeline has a predict_proba method. Will be ignored if the metric is a string or a scorer. If sequence, the n-th value will apply to the n-th metric in the pipeline. needs_threshold: bool or sequence, optional (default=False) Whether the metric function takes a continuous decision certainty. This only works for binary classification using estimators that have either a decision_function or predict_proba method. Will be ignored if the metric is a string or a scorer. If sequence, the n-th value will apply to the n-th metric in the pipeline. train_sizes: sequence, optional (default=np.linspace(0.2, 1.0, 5)) Relative or absolute numbers of training examples that will be used to generate the learning curve. If the value is <=1, it is interpreted as a fraction of the maximum size of the training set. If the value is > 1, it is interpreted as the total number of samples in the set. n_calls: int or sequence, optional (default=0) Maximum number of iterations of the BO (including n_initial_points ). 
If 0, skip the BO and fit the model on its default parameters. If sequence, the n-th value will apply to the n-th model in the pipeline. n_initial_points: int or sequence, optional (default=5) Initial number of random tests of the BO before fitting the surrogate function. If equal to n_calls , the optimizer will technically be performing a random search. If sequence, the n-th value will apply to the n-th model in the pipeline. bo_params: dict, optional (default={}) Dictionary of extra keyword arguments for the BO. These can include: base_estimator: str, optional (default='GP') Base estimator to use in the BO. Choose from: 'GP' for Gaussian Process 'RF' for Random Forest 'ET' for Extra-Trees 'GBRT' for Gradient Boosted Regression Trees max_time: int, optional (default=np.inf) Stop the optimization after max_time seconds. delta_x: int or float, optional (default=0) Stop the optimization when |x1 - x2| < delta_x . delta_y: int or float, optional (default=0) Stop the optimization if the 5 minima are within delta_y . cv: int, optional (default=5) Number of folds for the cross-validation. If 1, the training set will be randomly split in a subtrain and validation set. early stopping: int, float or None, optional (default=None) Training will stop if the model didn't improve in last early_stopping rounds. If <1, fraction of rounds from the total. If None, no early stopping is performed. Only available for models that allow in-training evaluation. callback: callable or list of callables, optional (default=None) Callbacks for the BO. dimensions: dict, array or None, optional (default=None) Custom hyperparameter space for the bayesian optimization. Can be an array (only if there is 1 model in the pipeline) or a dictionary with the model's name as key. If None, ATOM's predefined dimensions are used. plot_bo: bool, optional (default=False) Whether to plot the BO's progress as it runs. 
Creates a canvas with two plots: the first plot shows the score of every trial and the second shows the distance between the last consecutive steps. Don't forget to call %matplotlib at the start of the cell if you are using an interactive notebook! Additional keyword argument for skopt's optimizer. bagging: int or None, optional (default=None) Number of data sets (bootstrapped from the training set) to use in the bagging algorithm. If None or 0, no bagging is performed. If sequence, the n-th value will apply to the n-th model in the pipeline. n_jobs: int, optional (default=1) Number of cores to use for parallel processing. If >0: Number of cores to use. If -1: Use all available cores. If <-1: Use available_cores - 1 + n_jobs. Beware that using multiple processes on the same machine may cause memory issues for large datasets. verbose: int, optional (default=0) Verbosity level of the class. Possible values are: 0 to not print anything. 1 to print basic information. 2 to print detailed information. logger: bool, str, class or None, optional (default=None) If None: Doesn't save a logging file. If bool: True for logging file with default name. False for no logger. If str: Name of the logging file. 'auto' to create an automatic name. If class: python Logger object. random_state: int or None, optional (default=None) Seed used by the random number generator. If None, the random number generator is the RandomState instance used by numpy.random . Attributes Data attributes The instance's dataset can be accessed through multiple attributes, e.g. calling trainer.train will return the training set. The data can also be changed through these attributes, e.g. trainer.test = trainer.test.drop(0) will drop the first row from the test set. Doing this will automatically update the other data attributes. Attributes: dataset: pd.DataFrame Complete dataset in the pipeline. train: pd.DataFrame Training set. test: pd.DataFrame Test set. X: pd.DataFrame Feature set. 
y: pd.Series Target column. X_train: pd.DataFrame Training features. y_train: pd.Series Training target. X_test: pd.DataFrame Test features. y_test: pd.Series Test target. shape: tuple Dataset's shape in the form (rows x columns). columns: list List of columns in the dataset. target: str Name of the target column. Utility attributes Attributes: models: list List of models in the pipeline. metric: str or list Metric(s) used to fit the models. errors: dict Dictionary of the encountered exceptions (if any). winner: model Model subclass that performed best on the test set. results: pd.DataFrame Dataframe of the training results with the model acronyms as index. Columns can include: name: Name of the model. metric_bo: Best score achieved during the BO. time_bo: Time spent on the BO. metric_train: Metric score on the training set. metric_test: Metric score on the test set. time_fit: Time spent fitting and evaluating. mean_bagging: Mean score of the bagging's results. std_bagging: Standard deviation score of the bagging's results. time_bagging: Time spent on the bagging algorithm. time: Total time spent on the whole run. Plot attributes Attributes: style: str Plotting style. See seaborn's documentation . palette: str Color palette. See seaborn's documentation . title_fontsize: int Fontsize for the plot's title. label_fontsize: int Fontsize for labels and legends. tick_fontsize: int Fontsize for the ticks along the plot's axes. Methods clear Remove a model from the pipeline. get_params Get parameters for this estimator. log Save information to the logger and print to stdout. run Fit and evaluate the models. save Save the instance to a pickle file. scoring Print the scoring of the models for a specific metric. set_params Set the parameters of this estimator. method clear (models='all') [source] Removes all traces of a model in the pipeline (except for the errors attribute). If all models in the pipeline are removed, the metric is reset. 
Use this method to remove unwanted models from the pipeline or to clear memory before saving the instance. Parameters: models: str or sequence, optional (default='all') Model(s) to clear from the pipeline. If 'all', clear all models. method get_params (deep=True) [source] Get parameters for this estimator. Parameters: deep: bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators. Returns: params: dict Dictionary of the parameter names mapped to their values. method log (msg, level=0) [source] Write a message to the logger and print it to stdout. Parameters: msg: str Message to write to the logger and print to stdout. level: int, optional (default=0) Minimum verbosity level in order to print the message. method run (*arrays) [source] Fit and evaluate the models. Parameters: *arrays: array-like Either a train and test set or X_train, X_test, y_train, y_test. method save (filename=None, save_data=True) [source] Save the instance to a pickle file. Remember that the class contains the complete dataset as property, so the file can become large for big datasets! To avoid this, use save_data=False . Parameters: filename: str or None, optional (default=None) Name to save the file with. If None or 'auto', use the name of the class. save_data: bool, optional (default=True) Whether to save the data as an attribute of the instance. If False, remember to update the data immediately after loading the pickle using the dataset's @setter . method scoring (metric=None, dataset='test') [source] Print the scoring of the models for a specific metric. If a model shows a XXX , it means the metric failed for that specific model. This can happen if either the metric is unavailable for the task or if the model does not have a predict_proba method while the metric requires it. Parameters: metric: str or None, optional (default=None) Name of the metric to calculate. Choose from any of sklearn's SCORERS . 
If None, returns the final results for the model (ignores the dataset parameter). dataset: str, optional (default='test') Data set on which to calculate the metric. Options are 'train' or 'test'. method set_params (**params) [source] Set the parameters of this estimator. Parameters: **params: dict Estimator parameters. Returns: self: TrainerRegressor Estimator instance. Example from atom.training import TrainSizingRegressor # Run the pipeline trainer = TrainSizingRegressor('RF', n_calls=5, n_initial_points=3) trainer.run(train, test) # Analyze the results trainer.plot_learning_curve()","title":"TrainSizingRegressor"},{"location":"API/training/trainsizingregressor/#trainsizingregressor","text":"class atom.training. TrainSizingRegressor (models, metric=None, greater_is_better=True, needs_proba=False, needs_threshold=False, train_sizes=np.linspace(0.2, 1.0, 5), n_calls=10, n_random_points=5, bo_params={}, bagging=None, n_jobs=1, verbose=0, logger=None, random_state=None) [source] Fit and evaluate the models in a train sizing fashion. The pipeline applies the following steps per iteration: The optimal hyperparameters are selected using a bayesian optimization algorithm. The model is fitted on the training set using the best combinations of hyperparameters found. Using a bagging algorithm, various scores on the test set are calculated. Just like atom , you can predict , plot and call any model from the TrainSizingRegressor instance. Read more in the user guide . Parameters: models: str or sequence List of models to fit on the data. Use the predefined acronyms to select the models. 
Possible values are (case insensitive): 'GP' for Gaussian Process (no hyperparameter tuning) 'OLS' for Ordinary Least Squares (no hyperparameter tuning) 'Ridge' for Ridge Linear Regression 'Lasso' for Lasso Linear Regression 'EN' for ElasticNet Linear Regression 'BR' for Bayesian Regression (uses ridge regularization) 'KNN' for K-Nearest Neighbors 'Tree' for a single Decision Tree 'Bag' for Bagging (uses a decision tree as base estimator) 'ET' for Extra-Trees 'RF' for Random Forest 'AdaB' for AdaBoost (uses a decision tree as base estimator) 'GBM' for Gradient Boosting Machine 'XGB' for XGBoost (only available if package is installed) 'LGB' for LightGBM (only available if package is installed) 'CatB' for CatBoost (only available if package is installed) 'lSVM' for Linear Support Vector Machine (uses a one-vs-rest strategy for multiclass classification) 'kSVM' for Kernel Support Vector Machine (uses a one-vs-one strategy for multiclass classification) 'PA' for Passive Aggressive 'SGD' for Stochastic Gradient Descent 'MLP' for Multilayer Perceptron (can have between one and three hidden layers) metric: str, callable or sequence, optional (default=None) Metric(s) on which the pipeline fits the models. Choose from any of sklearn's predefined scorers , use a score (or loss) function with signature metric(y, y_pred, **kwargs) or use a scorer object. If multiple metrics are selected, only the first will be used to optimize the BO. If None, a default metric is selected: 'f1' for binary classification 'f1_weighted' for multiclass classification 'r2' for regression greater_is_better: bool or sequence, optional (default=True) Whether the metric is a score function or a loss function, i.e. if True, a higher score is better and if False, lower is better. Will be ignored if the metric is a string or a scorer. If sequence, the n-th value will apply to the n-th metric in the pipeline. 
needs_proba: bool or sequence, optional (default=False) Whether the metric function requires probability estimates out of a classifier. If True, make sure that every model in the pipeline has a predict_proba method. Will be ignored if the metric is a string or a scorer. If sequence, the n-th value will apply to the n-th metric in the pipeline. needs_threshold: bool or sequence, optional (default=False) Whether the metric function takes a continuous decision certainty. This only works for binary classification using estimators that have either a decision_function or predict_proba method. Will be ignored if the metric is a string or a scorer. If sequence, the n-th value will apply to the n-th metric in the pipeline. train_sizes: sequence, optional (default=np.linspace(0.2, 1.0, 5)) Relative or absolute numbers of training examples that will be used to generate the learning curve. If the value is <=1, it is interpreted as a fraction of the maximum size of the training set. If the value is > 1, it is interpreted as the total number of samples in the set. n_calls: int or sequence, optional (default=0) Maximum number of iterations of the BO (including n_initial_points ). If 0, skip the BO and fit the model on its default parameters. If sequence, the n-th value will apply to the n-th model in the pipeline. n_initial_points: int or sequence, optional (default=5) Initial number of random tests of the BO before fitting the surrogate function. If equal to n_calls , the optimizer will technically be performing a random search. If sequence, the n-th value will apply to the n-th model in the pipeline. bo_params: dict, optional (default={}) Dictionary of extra keyword arguments for the BO. These can include: base_estimator: str, optional (default='GP') Base estimator to use in the BO. 
Choose from: 'GP' for Gaussian Process 'RF' for Random Forest 'ET' for Extra-Trees 'GBRT' for Gradient Boosted Regression Trees max_time: int, optional (default=np.inf) Stop the optimization after max_time seconds. delta_x: int or float, optional (default=0) Stop the optimization when |x1 - x2| < delta_x . delta_y: int or float, optional (default=0) Stop the optimization if the 5 minima are within delta_y . cv: int, optional (default=5) Number of folds for the cross-validation. If 1, the training set will be randomly split in a subtrain and validation set. early stopping: int, float or None, optional (default=None) Training will stop if the model didn't improve in last early_stopping rounds. If <1, fraction of rounds from the total. If None, no early stopping is performed. Only available for models that allow in-training evaluation. callback: callable or list of callables, optional (default=None) Callbacks for the BO. dimensions: dict, array or None, optional (default=None) Custom hyperparameter space for the bayesian optimization. Can be an array (only if there is 1 model in the pipeline) or a dictionary with the model's name as key. If None, ATOM's predefined dimensions are used. plot_bo: bool, optional (default=False) Whether to plot the BO's progress as it runs. Creates a canvas with two plots: the first plot shows the score of every trial and the second shows the distance between the last consecutive steps. Don't forget to call %matplotlib at the start of the cell if you are using an interactive notebook! Additional keyword argument for skopt's optimizer. bagging: int or None, optional (default=None) Number of data sets (bootstrapped from the training set) to use in the bagging algorithm. If None or 0, no bagging is performed. If sequence, the n-th value will apply to the n-th model in the pipeline. n_jobs: int, optional (default=1) Number of cores to use for parallel processing. If >0: Number of cores to use. If -1: Use all available cores. 
If <-1: Use available_cores - 1 + n_jobs. Beware that using multiple processes on the same machine may cause memory issues for large datasets. verbose: int, optional (default=0) Verbosity level of the class. Possible values are: 0 to not print anything. 1 to print basic information. 2 to print detailed information. logger: bool, str, class or None, optional (default=None) If None: Doesn't save a logging file. If bool: True for logging file with default name. False for no logger. If str: Name of the logging file. 'auto' to create an automatic name. If class: python Logger object. random_state: int or None, optional (default=None) Seed used by the random number generator. If None, the random number generator is the RandomState instance used by numpy.random .","title":"TrainSizingRegressor"},{"location":"API/training/trainsizingregressor/#attributes","text":"","title":"Attributes"},{"location":"API/training/trainsizingregressor/#data-attributes","text":"The instance's dataset can be accessed through multiple attributes, e.g. calling trainer.train will return the training set. The data can also be changed through these attributes, e.g. trainer.test = trainer.test.drop(0) will drop the first row from the test set. Doing this will automatically update the other data attributes. Attributes: dataset: pd.DataFrame Complete dataset in the pipeline. train: pd.DataFrame Training set. test: pd.DataFrame Test set. X: pd.DataFrame Feature set. y: pd.Series Target column. X_train: pd.DataFrame Training features. y_train: pd.Series Training target. X_test: pd.DataFrame Test features. y_test: pd.Series Test target. shape: tuple Dataset's shape in the form (rows x columns). columns: list List of columns in the dataset. target: str Name of the target column.","title":"Data attributes"},{"location":"API/training/trainsizingregressor/#utility-attributes","text":"Attributes: models: list List of models in the pipeline. metric: str or list Metric(s) used to fit the models. 
errors: dict Dictionary of the encountered exceptions (if any). winner: model Model subclass that performed best on the test set. results: pd.DataFrame Dataframe of the training results with the model acronyms as index. Columns can include: name: Name of the model. metric_bo: Best score achieved during the BO. time_bo: Time spent on the BO. metric_train: Metric score on the training set. metric_test: Metric score on the test set. time_fit: Time spent fitting and evaluating. mean_bagging: Mean score of the bagging's results. std_bagging: Standard deviation score of the bagging's results. time_bagging: Time spent on the bagging algorithm. time: Total time spent on the whole run.","title":"Utility attributes"},{"location":"API/training/trainsizingregressor/#plot-attributes","text":"Attributes: style: str Plotting style. See seaborn's documentation . palette: str Color palette. See seaborn's documentation . title_fontsize: int Fontsize for the plot's title. label_fontsize: int Fontsize for labels and legends. tick_fontsize: int Fontsize for the ticks along the plot's axes.","title":"Plot attributes"},{"location":"API/training/trainsizingregressor/#methods","text":"clear Remove a model from the pipeline. get_params Get parameters for this estimator. log Save information to the logger and print to stdout. run Fit and evaluate the models. save Save the instance to a pickle file. scoring Print the scoring of the models for a specific metric. set_params Set the parameters of this estimator. method clear (models='all') [source] Removes all traces of a model in the pipeline (except for the errors attribute). If all models in the pipeline are removed, the metric is reset. Use this method to remove unwanted models from the pipeline or to clear memory before saving the instance. Parameters: models: str or sequence, optional (default='all') Model(s) to clear from the pipeline. If 'all', clear all models. method get_params (deep=True) [source] Get parameters for this estimator. 
Parameters: deep: bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators. Returns: params: dict Dictionary of the parameter names mapped to their values. method log (msg, level=0) [source] Write a message to the logger and print it to stdout. Parameters: msg: str Message to write to the logger and print to stdout. level: int, optional (default=0) Minimum verbosity level in order to print the message. method run (*arrays) [source] Fit and evaluate the models. Parameters: *arrays: array-like Either a train and test set or X_train, X_test, y_train, y_test. method save (filename=None, save_data=True) [source] Save the instance to a pickle file. Remember that the class contains the complete dataset as property, so the file can become large for big datasets! To avoid this, use save_data=False . Parameters: filename: str or None, optional (default=None) Name to save the file with. If None or 'auto', use the name of the class. save_data: bool, optional (default=True) Whether to save the data as an attribute of the instance. If False, remember to update the data immediately after loading the pickle using the dataset's @setter . method scoring (metric=None, dataset='test') [source] Print the scoring of the models for a specific metric. If a model shows a XXX , it means the metric failed for that specific model. This can happen if either the metric is unavailable for the task or if the model does not have a predict_proba method while the metric requires it. Parameters: metric: str or None, optional (default=None) Name of the metric to calculate. Choose from any of sklearn's SCORERS . If None, returns the final results for the model (ignores the dataset parameter). dataset: str, optional (default='test') Data set on which to calculate the metric. Options are 'train' or 'test'. method set_params (**params) [source] Set the parameters of this estimator. Parameters: **params: dict Estimator parameters. 
Returns: self: TrainerRegressor Estimator instance.","title":"Methods"},{"location":"API/training/trainsizingregressor/#example","text":"from atom.training import TrainSizingRegressor # Run the pipeline trainer = TrainSizingRegressor('RF', n_calls=5, n_initial_points=3) trainer.run(train, test) # Analyze the results trainer.plot_learning_curve()","title":"Example"},{"location":"examples/binary_classification/binary_classification/","text":"Binary classification This example shows how we can use ATOM to perform a variety of data cleaning steps in order to prepare the data for modelling. Then, we compare the prediction performance of an Extra-Trees and a Random Forest. The data used is a variation on the Australian weather dataset from https://www.kaggle.com/jsphyg/weather-dataset-rattle-package . The goal of this dataset is to predict whether or not it will rain tomorrow training a binay classifier on target RainTomorrow . Load the data # Import packages import pandas as pd from sklearn.metrics import fbeta_score from atom import ATOMClassifier # Load data X = pd.read_csv('./datasets/weatherAUS.csv') # Let's have a look at a subset of the data X.sample(frac=1).iloc[:5, :8] .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } Location MinTemp MaxTemp Rainfall Evaporation Sunshine WindGustDir WindGustSpeed 135379 AliceSprings 22.4 35.4 0.0 4.8 11.2 ESE 33.0 55572 Ballarat 11.7 19.8 0.0 NaN NaN NNE 48.0 111664 Witchcliffe 3.9 15.4 5.6 NaN NaN NW 43.0 6661 Cobar 21.6 34.9 0.0 11.2 NaN NNE 41.0 78634 Watsonia 13.6 33.3 0.0 8.0 12.3 N 37.0 Run the pipeline # Call ATOM using only 5% of the complete dataset (for explanatory purposes) atom = ATOMClassifier(X, y='RainTomorrow', n_rows=0.05, n_jobs=8, warnings=False, verbose=2, random_state=1) << ================== ATOM ================== >> Algorithm task: binary classification. Parallel processing with 8 cores. 
Applying data cleaning... Dataset stats ================= >> Shape: (7110, 22) Missing values: 15896 Categorical columns: 5 Scaled: False ---------------------------------- Train set size: 5688 Test set size: 1422 ---------------------------------- Train set balance: No:Yes <==> 3.7:1.0 Test set balance: No:Yes <==> 4.1:1.0 ---------------------------------- Instances in RainTomorrow per class: | | total | train_set | test_set | |:-------|---------:|-------------:|------------:| | 0: No | 5615 | 4473 | 1142 | | 1: Yes | 1495 | 1215 | 280 | # We can change the data attributes in between the pipeline # Note that we can only replace it with a new dataframe! atom.X = atom.X.assign(AvgTemp=(atom.X['MaxTemp'] + atom.X['MinTemp'])/2) # This will automatically update all other data attributes assert 'AvgTemp' in atom.dataset # Impute missing values atom.impute(strat_num='knn', strat_cat='drop', min_frac_rows=0.8) Fitting Imputer... Imputing missing values... --> Dropping 778 rows for containing less than 80% non-missing values. --> Imputing 5 missing values using the KNN imputer in feature MinTemp. --> Imputing 3 missing values using the KNN imputer in feature MaxTemp. --> Imputing 31 missing values using the KNN imputer in feature Rainfall. --> Imputing 2314 missing values using the KNN imputer in feature Evaporation. --> Imputing 2645 missing values using the KNN imputer in feature Sunshine. --> Dropping 201 rows due to missing values in feature WindGustDir. --> Dropping 358 rows due to missing values in feature WindDir9am. --> Dropping 15 rows due to missing values in feature WindDir3pm. --> Imputing 17 missing values using the KNN imputer in feature Humidity9am. --> Imputing 52 missing values using the KNN imputer in feature Humidity3pm. --> Imputing 37 missing values using the KNN imputer in feature Pressure9am. --> Imputing 34 missing values using the KNN imputer in feature Pressure3pm. --> Imputing 1891 missing values using the KNN imputer in feature Cloud9am. 
--> Imputing 1977 missing values using the KNN imputer in feature Cloud3pm. --> Imputing 4 missing values using the KNN imputer in feature Temp9am. --> Imputing 31 missing values using the KNN imputer in feature Temp3pm. --> Dropping 30 rows due to missing values in feature RainToday. --> Imputing 4 missing values using the KNN imputer in feature AvgTemp. # Encode the categorical features atom.encode(strategy='CatBoost', max_onehot=10, frac_to_other=0.04) Fitting Encoder... Encoding categorical columns... --> CatBoost-encoding feature Location. Contains 1 unique categories. --> CatBoost-encoding feature WindGustDir. Contains 16 unique categories. --> CatBoost-encoding feature WindDir9am. Contains 16 unique categories. --> CatBoost-encoding feature WindDir3pm. Contains 16 unique categories. --> Label-encoding feature RainToday. Contains 2 unique categories. # Perform undersampling of the majority class atom.balance(strategy='smote', sampling_strategy=0.9) atom.stats() # Note the balanced training set Oversampling with SMOTE... --> Adding 2302 rows to category: Yes. 
Dataset stats ================= >> Shape: (8030, 23) Scaled: False ---------------------------------- Train set size: 6885 Test set size: 1145 ---------------------------------- Train set balance: No:Yes <==> 1.1:1.0 Test set balance: No:Yes <==> 4.1:1.0 ---------------------------------- Instances in RainTomorrow per class: | | total | train_set | test_set | |:-------|---------:|-------------:|------------:| | 0: No | 4543 | 3624 | 919 | | 1: Yes | 3487 | 3261 | 226 | # Define a custom metric def f2_score(y_true, y_pred): return fbeta_score(y_true, y_pred, beta=2) # Fit the EXtra-Trees and Random Forest to the data atom.run(models=['et', 'rf'], metric=f2_score, n_calls=0, bagging=5, verbose=1) Running pipeline ============================= >> Models in pipeline: ET, RF Metric: f2_score Results for Extra-Trees: Fitting ----------------------------------------- Score on the train set --> f2_score: 1.0000 Score on the test set --> f2_score: 0.5474 Time elapsed: 0.191s Bagging ----------------------------------------- Score --> f2_score: 0.6027 \u00b1 0.0190 Time elapsed: 0.843s ------------------------------------------------- Total time: 1.038s Results for Random Forest: Fitting ----------------------------------------- Score on the train set --> f2_score: 1.0000 Score on the test set --> f2_score: 0.5959 Time elapsed: 0.295s Bagging ----------------------------------------- Score --> f2_score: 0.6087 \u00b1 0.0113 Time elapsed: 1.291s ------------------------------------------------- Total time: 1.589s Final results ========================= >> Duration: 2.627s ------------------------------------------ Extra-Trees --> f2_score: 0.603 \u00b1 0.019 ~ Random Forest --> f2_score: 0.609 \u00b1 0.011 ~ ! Analyze the results # Let's have a look at the final scoring atom.scoring() # The winning model is indicated with a ! and can be accessed through the winner attribute # The ~ indicates that the model is probably overfitting. 
If we look at the train and test # score we see a difference of more than 20% print(f'\\n\\nAnd the winner is the {atom.winner.longname} model!!') print('Score on the training set: ', atom.winner.metric_train) print('Score on the test set: ', atom.winner.metric_test) Results ===================== >> Extra-Trees --> f2_score: 0.603 \u00b1 0.019 ~ Random Forest --> f2_score: 0.609 \u00b1 0.011 ~ And the winner is the Random Forest model!! Score on the training set: 1.0 Score on the test set: 0.5958781362007168 We can make many plots to check the performance of the models # The probabilties plot shows the distribution of predicted # probabilities for the positive class atom.winner.plot_probabilities() # The threshold plot let us compare how different metrics # perform for different thresholds atom.winner.plot_threshold(metric=['f1', 'accuracy', 'average_precision'], steps=50, filename='thresholds.png') # The ROC and PRC curve are also typical ways of measuring performance atom.plot_roc(title=\"ROC for the LightGBM vs CatBoost model\") atom.plot_prc(title=\"PRC comparison of the models\")","title":"Binary classification"},{"location":"examples/binary_classification/binary_classification/#binary-classification","text":"This example shows how we can use ATOM to perform a variety of data cleaning steps in order to prepare the data for modelling. Then, we compare the prediction performance of an Extra-Trees and a Random Forest. The data used is a variation on the Australian weather dataset from https://www.kaggle.com/jsphyg/weather-dataset-rattle-package . 
The goal of this dataset is to predict whether or not it will rain tomorrow training a binay classifier on target RainTomorrow .","title":"Binary classification"},{"location":"examples/binary_classification/binary_classification/#load-the-data","text":"# Import packages import pandas as pd from sklearn.metrics import fbeta_score from atom import ATOMClassifier # Load data X = pd.read_csv('./datasets/weatherAUS.csv') # Let's have a look at a subset of the data X.sample(frac=1).iloc[:5, :8] .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } Location MinTemp MaxTemp Rainfall Evaporation Sunshine WindGustDir WindGustSpeed 135379 AliceSprings 22.4 35.4 0.0 4.8 11.2 ESE 33.0 55572 Ballarat 11.7 19.8 0.0 NaN NaN NNE 48.0 111664 Witchcliffe 3.9 15.4 5.6 NaN NaN NW 43.0 6661 Cobar 21.6 34.9 0.0 11.2 NaN NNE 41.0 78634 Watsonia 13.6 33.3 0.0 8.0 12.3 N 37.0","title":"Load the data"},{"location":"examples/binary_classification/binary_classification/#run-the-pipeline","text":"# Call ATOM using only 5% of the complete dataset (for explanatory purposes) atom = ATOMClassifier(X, y='RainTomorrow', n_rows=0.05, n_jobs=8, warnings=False, verbose=2, random_state=1) << ================== ATOM ================== >> Algorithm task: binary classification. Parallel processing with 8 cores. Applying data cleaning... 
Dataset stats ================= >> Shape: (7110, 22) Missing values: 15896 Categorical columns: 5 Scaled: False ---------------------------------- Train set size: 5688 Test set size: 1422 ---------------------------------- Train set balance: No:Yes <==> 3.7:1.0 Test set balance: No:Yes <==> 4.1:1.0 ---------------------------------- Instances in RainTomorrow per class: | | total | train_set | test_set | |:-------|---------:|-------------:|------------:| | 0: No | 5615 | 4473 | 1142 | | 1: Yes | 1495 | 1215 | 280 | # We can change the data attributes in between the pipeline # Note that we can only replace it with a new dataframe! atom.X = atom.X.assign(AvgTemp=(atom.X['MaxTemp'] + atom.X['MinTemp'])/2) # This will automatically update all other data attributes assert 'AvgTemp' in atom.dataset # Impute missing values atom.impute(strat_num='knn', strat_cat='drop', min_frac_rows=0.8) Fitting Imputer... Imputing missing values... --> Dropping 778 rows for containing less than 80% non-missing values. --> Imputing 5 missing values using the KNN imputer in feature MinTemp. --> Imputing 3 missing values using the KNN imputer in feature MaxTemp. --> Imputing 31 missing values using the KNN imputer in feature Rainfall. --> Imputing 2314 missing values using the KNN imputer in feature Evaporation. --> Imputing 2645 missing values using the KNN imputer in feature Sunshine. --> Dropping 201 rows due to missing values in feature WindGustDir. --> Dropping 358 rows due to missing values in feature WindDir9am. --> Dropping 15 rows due to missing values in feature WindDir3pm. --> Imputing 17 missing values using the KNN imputer in feature Humidity9am. --> Imputing 52 missing values using the KNN imputer in feature Humidity3pm. --> Imputing 37 missing values using the KNN imputer in feature Pressure9am. --> Imputing 34 missing values using the KNN imputer in feature Pressure3pm. --> Imputing 1891 missing values using the KNN imputer in feature Cloud9am. 
--> Imputing 1977 missing values using the KNN imputer in feature Cloud3pm. --> Imputing 4 missing values using the KNN imputer in feature Temp9am. --> Imputing 31 missing values using the KNN imputer in feature Temp3pm. --> Dropping 30 rows due to missing values in feature RainToday. --> Imputing 4 missing values using the KNN imputer in feature AvgTemp. # Encode the categorical features atom.encode(strategy='CatBoost', max_onehot=10, frac_to_other=0.04) Fitting Encoder... Encoding categorical columns... --> CatBoost-encoding feature Location. Contains 1 unique categories. --> CatBoost-encoding feature WindGustDir. Contains 16 unique categories. --> CatBoost-encoding feature WindDir9am. Contains 16 unique categories. --> CatBoost-encoding feature WindDir3pm. Contains 16 unique categories. --> Label-encoding feature RainToday. Contains 2 unique categories. # Perform undersampling of the majority class atom.balance(strategy='smote', sampling_strategy=0.9) atom.stats() # Note the balanced training set Oversampling with SMOTE... --> Adding 2302 rows to category: Yes. 
Dataset stats ================= >> Shape: (8030, 23) Scaled: False ---------------------------------- Train set size: 6885 Test set size: 1145 ---------------------------------- Train set balance: No:Yes <==> 1.1:1.0 Test set balance: No:Yes <==> 4.1:1.0 ---------------------------------- Instances in RainTomorrow per class: | | total | train_set | test_set | |:-------|---------:|-------------:|------------:| | 0: No | 4543 | 3624 | 919 | | 1: Yes | 3487 | 3261 | 226 | # Define a custom metric def f2_score(y_true, y_pred): return fbeta_score(y_true, y_pred, beta=2) # Fit the EXtra-Trees and Random Forest to the data atom.run(models=['et', 'rf'], metric=f2_score, n_calls=0, bagging=5, verbose=1) Running pipeline ============================= >> Models in pipeline: ET, RF Metric: f2_score Results for Extra-Trees: Fitting ----------------------------------------- Score on the train set --> f2_score: 1.0000 Score on the test set --> f2_score: 0.5474 Time elapsed: 0.191s Bagging ----------------------------------------- Score --> f2_score: 0.6027 \u00b1 0.0190 Time elapsed: 0.843s ------------------------------------------------- Total time: 1.038s Results for Random Forest: Fitting ----------------------------------------- Score on the train set --> f2_score: 1.0000 Score on the test set --> f2_score: 0.5959 Time elapsed: 0.295s Bagging ----------------------------------------- Score --> f2_score: 0.6087 \u00b1 0.0113 Time elapsed: 1.291s ------------------------------------------------- Total time: 1.589s Final results ========================= >> Duration: 2.627s ------------------------------------------ Extra-Trees --> f2_score: 0.603 \u00b1 0.019 ~ Random Forest --> f2_score: 0.609 \u00b1 0.011 ~ !","title":"Run the pipeline"},{"location":"examples/binary_classification/binary_classification/#analyze-the-results","text":"# Let's have a look at the final scoring atom.scoring() # The winning model is indicated with a ! 
and can be accessed through the winner attribute # The ~ indicates that the model is probably overfitting. If we look at the train and test # score we see a difference of more than 20% print(f'\\n\\nAnd the winner is the {atom.winner.longname} model!!') print('Score on the training set: ', atom.winner.metric_train) print('Score on the test set: ', atom.winner.metric_test) Results ===================== >> Extra-Trees --> f2_score: 0.603 \u00b1 0.019 ~ Random Forest --> f2_score: 0.609 \u00b1 0.011 ~ And the winner is the Random Forest model!! Score on the training set: 1.0 Score on the test set: 0.5958781362007168 We can make many plots to check the performance of the models # The probabilties plot shows the distribution of predicted # probabilities for the positive class atom.winner.plot_probabilities() # The threshold plot let us compare how different metrics # perform for different thresholds atom.winner.plot_threshold(metric=['f1', 'accuracy', 'average_precision'], steps=50, filename='thresholds.png') # The ROC and PRC curve are also typical ways of measuring performance atom.plot_roc(title=\"ROC for the LightGBM vs CatBoost model\") atom.plot_prc(title=\"PRC comparison of the models\")","title":"Analyze the results"},{"location":"examples/calibration/calibration/","text":"Calibration This example shows us how to use the calibration method to calibrate a classifier. The data used is a variation on the Australian weather dataset from https://www.kaggle.com/jsphyg/weather-dataset-rattle-package . The goal of this dataset is to predict whether or not it will rain tomorrow training a binay classifier on target RainTomorrow . 
Load the data # Import packages import pandas as pd from atom import ATOMClassifier # Get the dataset's features and targets X = pd.read_csv('./datasets/weatherAUS.csv') # Let's have a look at a subset of the data X.sample(frac=1).iloc[:5, :8] .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } Location MinTemp MaxTemp Rainfall Evaporation Sunshine WindGustDir WindGustSpeed 40667 Williamtown 10.0 20.4 0.0 5.4 NaN NW 48.0 43490 Wollongong 15.0 22.0 0.4 NaN NaN SSW 59.0 102419 Nuriootpa 2.6 23.9 0.0 8.0 12.8 ESE 35.0 123437 SalmonGums 3.4 18.0 0.0 NaN NaN WSW 33.0 18121 NorahHead 16.5 22.3 0.0 NaN NaN S 46.0 Run the pipeline # Initialize the ATOM class atom = ATOMClassifier(X, y='RainTomorrow', n_rows=1e4, verbose=1, warnings='ignore', random_state=1) # Handle missing values and categorical columns in the dataset atom.impute(strat_num='median', strat_cat='most_frequent') atom.encode(strategy='target', max_onehot=5, frac_to_other=0.05) # Fit a linear SVM to the data atom.run('lsvm') << ================== ATOM ================== >> Algorithm task: binary classification. Applying data cleaning... Dataset stats ================= >> Shape: (10000, 22) Missing values: 22613 Categorical columns: 5 Scaled: False ---------------------------------- Train set size: 8000 Test set size: 2000 Fitting Imputer... Imputing missing values... Fitting Encoder... Encoding categorical columns... 
Running pipeline ============================= >> Models in pipeline: lSVM Metric: f1 Results for Linear SVM: Fitting ----------------------------------------- Score on the train set --> f1: 0.5639 Score on the test set --> f1: 0.5929 Time elapsed: 0.444s ------------------------------------------------- Total time: 0.444s Final results ========================= >> Duration: 0.444s ------------------------------------------ Linear SVM --> f1: 0.593 Analyze the results # Check our model's calibration atom.plot_calibration() # Let's try to improve it using the calibrate method atom.calibrate(method='isotonic', cv=5) atom.plot_calibration()","title":"Calibration"},{"location":"examples/calibration/calibration/#calibration","text":"This example shows us how to use the calibration method to calibrate a classifier. The data used is a variation on the Australian weather dataset from https://www.kaggle.com/jsphyg/weather-dataset-rattle-package . The goal of this dataset is to predict whether or not it will rain tomorrow training a binay classifier on target RainTomorrow .","title":"Calibration"},{"location":"examples/calibration/calibration/#load-the-data","text":"# Import packages import pandas as pd from atom import ATOMClassifier # Get the dataset's features and targets X = pd.read_csv('./datasets/weatherAUS.csv') # Let's have a look at a subset of the data X.sample(frac=1).iloc[:5, :8] .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } Location MinTemp MaxTemp Rainfall Evaporation Sunshine WindGustDir WindGustSpeed 40667 Williamtown 10.0 20.4 0.0 5.4 NaN NW 48.0 43490 Wollongong 15.0 22.0 0.4 NaN NaN SSW 59.0 102419 Nuriootpa 2.6 23.9 0.0 8.0 12.8 ESE 35.0 123437 SalmonGums 3.4 18.0 0.0 NaN NaN WSW 33.0 18121 NorahHead 16.5 22.3 0.0 NaN NaN S 46.0","title":"Load the data"},{"location":"examples/calibration/calibration/#run-the-pipeline","text":"# Initialize the ATOM 
class atom = ATOMClassifier(X, y='RainTomorrow', n_rows=1e4, verbose=1, warnings='ignore', random_state=1) # Handle missing values and categorical columns in the dataset atom.impute(strat_num='median', strat_cat='most_frequent') atom.encode(strategy='target', max_onehot=5, frac_to_other=0.05) # Fit a linear SVM to the data atom.run('lsvm') << ================== ATOM ================== >> Algorithm task: binary classification. Applying data cleaning... Dataset stats ================= >> Shape: (10000, 22) Missing values: 22613 Categorical columns: 5 Scaled: False ---------------------------------- Train set size: 8000 Test set size: 2000 Fitting Imputer... Imputing missing values... Fitting Encoder... Encoding categorical columns... Running pipeline ============================= >> Models in pipeline: lSVM Metric: f1 Results for Linear SVM: Fitting ----------------------------------------- Score on the train set --> f1: 0.5639 Score on the test set --> f1: 0.5929 Time elapsed: 0.444s ------------------------------------------------- Total time: 0.444s Final results ========================= >> Duration: 0.444s ------------------------------------------ Linear SVM --> f1: 0.593","title":"Run the pipeline"},{"location":"examples/calibration/calibration/#analyze-the-results","text":"# Check our model's calibration atom.plot_calibration() # Let's try to improve it using the calibrate method atom.calibrate(method='isotonic', cv=5) atom.plot_calibration()","title":"Analyze the results"},{"location":"examples/early_stopping/early_stopping/","text":"Early stopping This example shows how we can use early stopping to reduce the time it takes to run the pipeline. This option is only available for models that allow in-training evaluation (XGBoost, LightGBM and CatBoost). Import the breast cancer dataset from sklearn.datasets . This is a small and easy to train dataset whose goal is to predict whether a patient has breast cancer or not. 
Load the data # Import packages from sklearn.datasets import load_breast_cancer from atom import ATOMClassifier # Get the dataset's features and targets X, y = load_breast_cancer(return_X_y=True) Run the pipeline # Start ATOM and fit the models using early stopping # An early stopping of 0.1 means that the model will stop if it # didn't improve in the last 10% of it's iterations. atom = ATOMClassifier(X, y, n_jobs=2, verbose=2, warnings=False, random_state=1) atom.run('LGB', metric='ap', n_calls=7, n_initial_points=3, bo_params={'early_stopping': 0.1, 'cv': 1}) << ================== ATOM ================== >> Algorithm task: binary classification. Parallel processing with 2 cores. Applying data cleaning... Dataset stats ================= >> Shape: (569, 31) Scaled: False ---------------------------------- Train set size: 456 Test set size: 113 ---------------------------------- Train set balance: 0:1 <==> 0.6:1.0 Test set balance: 0:1 <==> 0.7:1.0 ---------------------------------- Instances in target per class: | | total | train_set | test_set | |---:|---------:|-------------:|------------:| | 0 | 212 | 167 | 45 | | 1 | 357 | 289 | 68 | Running pipeline ============================= >> Models in pipeline: LGB Metric: average_precision Running BO for LightGBM... Random start 1 ---------------------------------- Parameters --> {'n_estimators': 499, 'learning_rate': 0.73, 'max_depth': 2, 'num_leaves': 40, 'min_child_weight': 5, 'min_child_samples': 18, 'subsample': 0.7, 'colsample_bytree': 0.8, 'reg_alpha': 100.0, 'reg_lambda': 10.0} Early stop at iteration 50 of 499. 
Evaluation --> average_precision: 0.6304 Best average_precision: 0.6304 Time iteration: 0.031s Total time: 0.047s Random start 2 ---------------------------------- Parameters --> {'n_estimators': 170, 'learning_rate': 0.11, 'max_depth': 5, 'num_leaves': 25, 'min_child_weight': 11, 'min_child_samples': 28, 'subsample': 0.7, 'colsample_bytree': 0.6, 'reg_alpha': 100.0, 'reg_lambda': 10.0} Early stop at iteration 18 of 170. Evaluation --> average_precision: 0.6304 Best average_precision: 0.6304 Time iteration: 0.028s Total time: 0.075s Random start 3 ---------------------------------- Parameters --> {'n_estimators': 364, 'learning_rate': 0.4, 'max_depth': 2, 'num_leaves': 30, 'min_child_weight': 17, 'min_child_samples': 27, 'subsample': 0.9, 'colsample_bytree': 0.5, 'reg_alpha': 0.0, 'reg_lambda': 1.0} Early stop at iteration 42 of 364. Evaluation --> average_precision: 0.9819 Best average_precision: 0.9819 Time iteration: 0.020s Total time: 0.099s Iteration 4 ------------------------------------- Parameters --> {'n_estimators': 238, 'learning_rate': 0.49, 'max_depth': 3, 'num_leaves': 29, 'min_child_weight': 18, 'min_child_samples': 25, 'subsample': 0.9, 'colsample_bytree': 0.4, 'reg_alpha': 0.0, 'reg_lambda': 10.0} Early stop at iteration 30 of 238. 
Evaluation --> average_precision: 0.9911 Best average_precision: 0.9911 Time iteration: 0.016s Total time: 1.343s Iteration 5 ------------------------------------- Parameters --> {'n_estimators': 31, 'learning_rate': 0.07, 'max_depth': 6, 'num_leaves': 21, 'min_child_weight': 18, 'min_child_samples': 28, 'subsample': 0.9, 'colsample_bytree': 0.5, 'reg_alpha': 0.0, 'reg_lambda': 100.0} Evaluation --> average_precision: 0.9920 Best average_precision: 0.9920 Time iteration: 0.016s Total time: 1.762s Iteration 6 ------------------------------------- Parameters --> {'n_estimators': 20, 'learning_rate': 1.0, 'max_depth': 3, 'num_leaves': 40, 'min_child_weight': 20, 'min_child_samples': 10, 'subsample': 0.8, 'colsample_bytree': 0.3, 'reg_alpha': 0.0, 'reg_lambda': 100.0} Early stop at iteration 12 of 20. Evaluation --> average_precision: 0.9953 Best average_precision: 0.9953 Time iteration: 0.016s Total time: 2.178s Iteration 7 ------------------------------------- Parameters --> {'n_estimators': 69, 'learning_rate': 0.17, 'max_depth': 7, 'num_leaves': 26, 'min_child_weight': 17, 'min_child_samples': 14, 'subsample': 0.5, 'colsample_bytree': 0.8, 'reg_alpha': 0.01, 'reg_lambda': 1.0} Early stop at iteration 22 of 69. Evaluation --> average_precision: 0.9978 Best average_precision: 0.9978 Time iteration: 0.016s Total time: 2.499s Results for LightGBM: Bayesian Optimization --------------------------- Best parameters --> {'n_estimators': 69, 'learning_rate': 0.17, 'max_depth': 7, 'num_leaves': 26, 'min_child_weight': 17, 'min_child_samples': 14, 'subsample': 0.5, 'colsample_bytree': 0.8, 'reg_alpha': 0.01, 'reg_lambda': 1.0} Best evaluation --> average_precision: 0.9978 Time elapsed: 2.912s Fitting ----------------------------------------- Early stop at iteration 27 of 69. 
Score on the train set --> average_precision: 0.9962 Score on the test set --> average_precision: 0.9712 Time elapsed: 0.016s ------------------------------------------------- Total time: 2.928s Final results ========================= >> Duration: 2.928s ------------------------------------------ LightGBM --> average_precision: 0.971 Analyze the results # For these models, we can plot the evaluation on the train and test set during training # Note that the metric is provided by the model's library, not ATOM! atom.lgb.plot_evals(title=\"LightGBM's evaluation curve\", figsize=(11, 9))","title":"Early stopping"},{"location":"examples/early_stopping/early_stopping/#early-stopping","text":"This example shows how we can use early stopping to reduce the time it takes to run the pipeline. This option is only available for models that allow in-training evaluation (XGBoost, LightGBM and CatBoost). Import the breast cancer dataset from sklearn.datasets . This is a small and easy to train dataset whose goal is to predict whether a patient has breast cancer or not.","title":"Early stopping"},{"location":"examples/early_stopping/early_stopping/#load-the-data","text":"# Import packages from sklearn.datasets import load_breast_cancer from atom import ATOMClassifier # Get the dataset's features and targets X, y = load_breast_cancer(return_X_y=True)","title":"Load the data"},{"location":"examples/early_stopping/early_stopping/#run-the-pipeline","text":"# Start ATOM and fit the models using early stopping # An early stopping of 0.1 means that the model will stop if it # didn't improve in the last 10% of it's iterations. atom = ATOMClassifier(X, y, n_jobs=2, verbose=2, warnings=False, random_state=1) atom.run('LGB', metric='ap', n_calls=7, n_initial_points=3, bo_params={'early_stopping': 0.1, 'cv': 1}) << ================== ATOM ================== >> Algorithm task: binary classification. Parallel processing with 2 cores. Applying data cleaning... 
Dataset stats ================= >> Shape: (569, 31) Scaled: False ---------------------------------- Train set size: 456 Test set size: 113 ---------------------------------- Train set balance: 0:1 <==> 0.6:1.0 Test set balance: 0:1 <==> 0.7:1.0 ---------------------------------- Instances in target per class: | | total | train_set | test_set | |---:|---------:|-------------:|------------:| | 0 | 212 | 167 | 45 | | 1 | 357 | 289 | 68 | Running pipeline ============================= >> Models in pipeline: LGB Metric: average_precision Running BO for LightGBM... Random start 1 ---------------------------------- Parameters --> {'n_estimators': 499, 'learning_rate': 0.73, 'max_depth': 2, 'num_leaves': 40, 'min_child_weight': 5, 'min_child_samples': 18, 'subsample': 0.7, 'colsample_bytree': 0.8, 'reg_alpha': 100.0, 'reg_lambda': 10.0} Early stop at iteration 50 of 499. Evaluation --> average_precision: 0.6304 Best average_precision: 0.6304 Time iteration: 0.031s Total time: 0.047s Random start 2 ---------------------------------- Parameters --> {'n_estimators': 170, 'learning_rate': 0.11, 'max_depth': 5, 'num_leaves': 25, 'min_child_weight': 11, 'min_child_samples': 28, 'subsample': 0.7, 'colsample_bytree': 0.6, 'reg_alpha': 100.0, 'reg_lambda': 10.0} Early stop at iteration 18 of 170. Evaluation --> average_precision: 0.6304 Best average_precision: 0.6304 Time iteration: 0.028s Total time: 0.075s Random start 3 ---------------------------------- Parameters --> {'n_estimators': 364, 'learning_rate': 0.4, 'max_depth': 2, 'num_leaves': 30, 'min_child_weight': 17, 'min_child_samples': 27, 'subsample': 0.9, 'colsample_bytree': 0.5, 'reg_alpha': 0.0, 'reg_lambda': 1.0} Early stop at iteration 42 of 364. 
Evaluation --> average_precision: 0.9819 Best average_precision: 0.9819 Time iteration: 0.020s Total time: 0.099s Iteration 4 ------------------------------------- Parameters --> {'n_estimators': 238, 'learning_rate': 0.49, 'max_depth': 3, 'num_leaves': 29, 'min_child_weight': 18, 'min_child_samples': 25, 'subsample': 0.9, 'colsample_bytree': 0.4, 'reg_alpha': 0.0, 'reg_lambda': 10.0} Early stop at iteration 30 of 238. Evaluation --> average_precision: 0.9911 Best average_precision: 0.9911 Time iteration: 0.016s Total time: 1.343s Iteration 5 ------------------------------------- Parameters --> {'n_estimators': 31, 'learning_rate': 0.07, 'max_depth': 6, 'num_leaves': 21, 'min_child_weight': 18, 'min_child_samples': 28, 'subsample': 0.9, 'colsample_bytree': 0.5, 'reg_alpha': 0.0, 'reg_lambda': 100.0} Evaluation --> average_precision: 0.9920 Best average_precision: 0.9920 Time iteration: 0.016s Total time: 1.762s Iteration 6 ------------------------------------- Parameters --> {'n_estimators': 20, 'learning_rate': 1.0, 'max_depth': 3, 'num_leaves': 40, 'min_child_weight': 20, 'min_child_samples': 10, 'subsample': 0.8, 'colsample_bytree': 0.3, 'reg_alpha': 0.0, 'reg_lambda': 100.0} Early stop at iteration 12 of 20. Evaluation --> average_precision: 0.9953 Best average_precision: 0.9953 Time iteration: 0.016s Total time: 2.178s Iteration 7 ------------------------------------- Parameters --> {'n_estimators': 69, 'learning_rate': 0.17, 'max_depth': 7, 'num_leaves': 26, 'min_child_weight': 17, 'min_child_samples': 14, 'subsample': 0.5, 'colsample_bytree': 0.8, 'reg_alpha': 0.01, 'reg_lambda': 1.0} Early stop at iteration 22 of 69. 
Evaluation --> average_precision: 0.9978 Best average_precision: 0.9978 Time iteration: 0.016s Total time: 2.499s Results for LightGBM: Bayesian Optimization --------------------------- Best parameters --> {'n_estimators': 69, 'learning_rate': 0.17, 'max_depth': 7, 'num_leaves': 26, 'min_child_weight': 17, 'min_child_samples': 14, 'subsample': 0.5, 'colsample_bytree': 0.8, 'reg_alpha': 0.01, 'reg_lambda': 1.0} Best evaluation --> average_precision: 0.9978 Time elapsed: 2.912s Fitting ----------------------------------------- Early stop at iteration 27 of 69. Score on the train set --> average_precision: 0.9962 Score on the test set --> average_precision: 0.9712 Time elapsed: 0.016s ------------------------------------------------- Total time: 2.928s Final results ========================= >> Duration: 2.928s ------------------------------------------ LightGBM --> average_precision: 0.971","title":"Run the pipeline"},{"location":"examples/early_stopping/early_stopping/#analyze-the-results","text":"# For these models, we can plot the evaluation on the train and test set during training # Note that the metric is provided by the model's library, not ATOM! atom.lgb.plot_evals(title=\"LightGBM's evaluation curve\", figsize=(11, 9))","title":"Analyze the results"},{"location":"examples/feature_engineering/feature_engineering/","text":"Feature engineering This example shows how to use automated feature generation to improve your model's performance. The data used is a variation on the Australian weather dataset from https://www.kaggle.com/jsphyg/weather-dataset-rattle-package . The goal of this dataset is to predict whether or not it will rain tomorrow training a binay classifier on target RainTomorrow . 
Load the data # Import packages import pandas as pd from atom import ATOMClassifier # Load data X = pd.read_csv('./datasets/weatherAUS.csv') # Let's have a look at a subset of the data X.sample(frac=1).iloc[:5, :8] .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } Location MinTemp MaxTemp Rainfall Evaporation Sunshine WindGustDir WindGustSpeed 36171 WaggaWagga 14.3 21.4 0.8 10.6 5.8 W 52.0 44425 Canberra 16.0 22.8 0.0 12.4 6.0 E 50.0 126238 Walpole 13.8 20.7 4.8 NaN NaN NW 33.0 54550 Ballarat 3.3 14.7 0.0 NaN NaN N 46.0 85638 Cairns 23.5 31.5 43.8 0.8 8.5 SSE 52.0 Run the pipeline # Initiate ATOM and apply data cleaning atom = ATOMClassifier(X, n_rows=1e4, test_size=0.2, verbose=0, random_state=1) atom.impute(strat_num='knn', strat_cat='remove', min_frac_rows=0.8) atom.encode(max_onehot=10, frac_to_other=0.04) # Let's see how a LightGBM model performs without adding additional features atom.run('LGB', metric='auc') atom.scoring() is_categorical is deprecated and will be removed in a future version. Use is_categorical_dtype instead Results ===================== >> LightGBM --> roc_auc: 0.878 # What are the most important fetaures? atom.plot_feature_importance(show=10) Now let's create some new fetaures using Deep Feature Synthesis atom.verbose = 2 # Increase verbosity to see the output # Create 100 new features using DFS atom.feature_generation(strategy='dfs', n_features=100, operators=['add', 'sub', 'log', 'sqrt']) Fitting FeatureGenerator... Creating new features... --> 100 new features were added to the dataset. divide by zero encountered in log invalid value encountered in log # The warnings warn us that some operators created missing values! 
# We can see the columns with missing values using the missing attribute atom.missing # We can easily turn off warnings in the future atom.warnings = False # We can use the impute method again atom.impute(strat_num='knn', strat_cat='remove', min_frac_rows=0.8) Fitting Imputer... Imputing missing values... --> Imputing 577 missing values using the KNN imputer in feature LOG(Cloud9am). --> Dropping feature LOG(RainToday_other) for containing 8873 (99%) missing values. --> Imputing 148 missing values using the KNN imputer in feature LOG(Sunshine). --> Imputing 6 missing values using the KNN imputer in feature LOG(Temp9am). --> Imputing 33 missing values using the KNN imputer in feature LOG(WindSpeed3pm). # 100 new features may be to much... # Let's check for multicollinearity and use RFECV to reduce the number even further atom.feature_selection(strategy='RFECV', solver='lgb', n_features=30, scoring='auc', max_correlation=0.98) Fitting FeatureSelector... Performing feature selection ... --> Feature Location was removed due to low variance. Value 0.2077375946173255 repeated in 100% of the rows. --> Feature Cloud3pm + Humidity3pm was removed due to collinearity with another feature. --> Feature Cloud3pm + RainToday_No was removed due to collinearity with another feature. --> Feature Cloud3pm + WindDir9am was removed due to collinearity with another feature. --> Feature Cloud3pm - Location was removed due to collinearity with another feature. --> Feature Cloud3pm - RainToday_No was removed due to collinearity with another feature. --> Feature Cloud9am + WindGustDir was removed due to collinearity with another feature. --> Feature Evaporation + Location was removed due to collinearity with another feature. --> Feature Evaporation + WindGustDir was removed due to collinearity with another feature. --> Feature Evaporation - WindDir3pm was removed due to collinearity with another feature. 
--> Feature Humidity3pm - RainToday_No was removed due to collinearity with another feature. --> Feature Humidity3pm - Sunshine was removed due to collinearity with another feature. --> Feature Humidity9am + RainToday_Yes was removed due to collinearity with another feature. --> Feature Humidity9am - RainToday_No was removed due to collinearity with another feature. --> Feature Humidity9am - Sunshine was removed due to collinearity with another feature. --> Feature LOG(MaxTemp) was removed due to collinearity with another feature. --> Feature Location + MinTemp was removed due to collinearity with another feature. --> Feature Location + RainToday_No was removed due to collinearity with another feature. --> Feature Location + WindDir3pm was removed due to collinearity with another feature. --> Feature Location + WindGustDir was removed due to collinearity with another feature. --> Feature Location + WindSpeed3pm was removed due to collinearity with another feature. --> Feature Location - RainToday_Yes was removed due to collinearity with another feature. --> Feature MaxTemp + RainToday_No was removed due to collinearity with another feature. --> Feature MaxTemp + RainToday_Yes was removed due to collinearity with another feature. --> Feature MinTemp + WindGustDir was removed due to collinearity with another feature. --> Feature Pressure3pm + RainToday_other was removed due to collinearity with another feature. --> Feature Pressure3pm + Temp3pm was removed due to collinearity with another feature. --> Feature Pressure3pm - WindGustDir was removed due to collinearity with another feature. --> Feature Pressure9am - WindGustDir was removed due to collinearity with another feature. --> Feature RainToday_No + Temp9am was removed due to collinearity with another feature. --> Feature RainToday_No + WindGustDir was removed due to collinearity with another feature. --> Feature RainToday_No - WindDir9am was removed due to collinearity with another feature. 
--> Feature RainToday_Yes + Temp9am was removed due to collinearity with another feature. --> Feature RainToday_Yes + WindDir3pm was removed due to collinearity with another feature. --> Feature RainToday_Yes + WindDir9am was removed due to collinearity with another feature. --> Feature RainToday_Yes - WindDir9am was removed due to collinearity with another feature. --> Feature RainToday_other - Temp9am was removed due to collinearity with another feature. --> Feature RainToday_other - WindGustSpeed was removed due to collinearity with another feature. --> Feature RainToday_other - WindSpeed9am was removed due to collinearity with another feature. --> Feature Rainfall + RainToday_No was removed due to collinearity with another feature. --> Feature Rainfall + WindDir9am was removed due to collinearity with another feature. --> Feature Rainfall - WindDir3pm was removed due to collinearity with another feature. --> Feature SQRT(Humidity3pm) was removed due to collinearity with another feature. --> Feature SQRT(Pressure9am) was removed due to collinearity with another feature. --> Feature Sunshine + WindDir9am was removed due to collinearity with another feature. --> Feature Temp3pm + WindDir9am was removed due to collinearity with another feature. --> Feature Temp3pm + WindGustDir was removed due to collinearity with another feature. --> Feature Temp3pm - WindDir3pm was removed due to collinearity with another feature. --> Feature Temp9am - WindDir9am was removed due to collinearity with another feature. --> Feature WindDir3pm - WindSpeed3pm was removed due to collinearity with another feature. --> Feature WindGustDir + WindGustSpeed was removed due to collinearity with another feature. --> Feature WindGustDir - WindSpeed9am was removed due to collinearity with another feature. --> The RFECV selected 64 features from the dataset. >>> Dropping feature RainToday_Yes (rank 3). >>> Dropping feature RainToday_No (rank 5). 
>>> Dropping feature Location - WindSpeed9am (rank 2). >>> Dropping feature SQRT(Cloud9am) (rank 7). >>> Dropping feature SQRT(Rainfall) (rank 6). >>> Dropping feature SQRT(WindSpeed9am) (rank 4). # The collinear attribute shows what features were removed due to multicollinearity atom.collinear .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } drop_feature correlated_feature correlation_value 0 Cloud3pm + Humidity3pm Humidity3pm 0.99578 1 Cloud3pm + RainToday_No Cloud3pm 0.98122 2 Cloud3pm + WindDir9am Cloud3pm, Cloud3pm + RainToday_No 0.99968, 0.98054 3 Cloud3pm - Location Cloud3pm, Cloud3pm + RainToday_No, Cloud3pm + ... 1.0, 0.98122, 0.99968 4 Cloud3pm - RainToday_No Cloud3pm, Cloud3pm + WindDir9am, Cloud3pm - Lo... 0.98405, 0.98408, 0.98405 5 Cloud9am + WindGustDir Cloud9am 0.99979 6 Evaporation + Location Evaporation 1.0 7 Evaporation + WindGustDir Evaporation, Evaporation + Location 0.9999, 0.9999 8 Evaporation - WindDir3pm Evaporation, Evaporation + Location, Evaporati... 0.9999, 0.9999, 0.99969 9 Humidity3pm - RainToday_No Humidity3pm, Cloud3pm + Humidity3pm 0.99983, 0.99572 10 Humidity3pm - Sunshine Humidity3pm, Cloud3pm + Humidity3pm, Humidity3... 0.99347, 0.99405, 0.9935 11 Humidity9am + RainToday_Yes Humidity9am 0.9998 12 Humidity9am - RainToday_No Humidity9am, Humidity9am + RainToday_Yes 0.9998, 0.99999 13 Humidity9am - Sunshine Humidity9am, Humidity9am + RainToday_Yes, Humi... 0.99165, 0.99183, 0.99184 14 LOG(MaxTemp) MaxTemp 0.98395 15 Location + MinTemp MinTemp 1.0 16 Location + RainToday_No RainToday_Yes, RainToday_No -0.98403, 1.0 17 Location + WindDir3pm WindDir3pm 1.0 18 Location + WindGustDir WindGustDir 1.0 19 Location + WindSpeed3pm WindSpeed3pm 1.0 20 Location - RainToday_Yes RainToday_Yes, RainToday_No, Location + RainTo... 
-1.0, 0.98403, 0.98403 21 MaxTemp + RainToday_No MaxTemp, LOG(MaxTemp) 0.99841, 0.9831 22 MaxTemp + RainToday_Yes MaxTemp, LOG(MaxTemp), MaxTemp + RainToday_No 0.99834, 0.98156, 0.99356 23 MinTemp + WindGustDir MinTemp, Location + MinTemp 0.99997, 0.99997 24 Pressure3pm + RainToday_other Pressure3pm 0.99995 25 Pressure3pm + Temp3pm MaxTemp + Pressure3pm 0.98005 26 Pressure3pm - WindGustDir Pressure3pm, Pressure3pm + RainToday_other 0.99998, 0.99992 27 Pressure9am - WindGustDir Pressure9am 0.99998 28 RainToday_No + Temp9am Temp9am 0.99797 29 RainToday_No + WindGustDir RainToday_No, Location + RainToday_No 0.9933, 0.9933 30 RainToday_No - WindDir9am RainToday_No, Location + RainToday_No 0.99169, 0.99169 31 RainToday_Yes + Temp9am Temp9am, RainToday_No + Temp9am, RainToday_No ... 0.99795, 0.99191, -0.99993 32 RainToday_Yes + WindDir3pm RainToday_Yes, Location - RainToday_Yes 0.99334, -0.99334 33 RainToday_Yes + WindDir9am RainToday_Yes, Location - RainToday_Yes, RainT... 0.99154, -0.99154, -0.9847, 0.98993 34 RainToday_Yes - WindDir9am RainToday_Yes, Location - RainToday_Yes 0.9911, -0.9911 35 RainToday_other - Temp9am Temp9am, RainToday_No + Temp9am, RainToday_No ... -0.99993, -0.998, 0.99775, -0.99792 36 RainToday_other - WindGustSpeed WindGustSpeed, Cloud9am - WindGustSpeed -0.99998, 0.98438 37 RainToday_other - WindSpeed9am WindSpeed9am, Location - WindSpeed9am -0.99997, 0.99997 38 Rainfall + RainToday_No Rainfall 0.99907 39 Rainfall + WindDir9am Rainfall, Rainfall + RainToday_No 0.99998, 0.99902 40 Rainfall - WindDir3pm Rainfall, Rainfall + RainToday_No, Rainfall + ... 0.99998, 0.99907, 0.99995 41 SQRT(Humidity3pm) Humidity3pm, Cloud3pm + Humidity3pm, Humidity3... 
0.98722, 0.98193, 0.98674 42 SQRT(Pressure9am) Pressure9am, Pressure9am - WindGustDir 1.0, 0.99998 43 Sunshine + WindDir9am Sunshine, RainToday_other - Sunshine 0.99982, -0.99948 44 Temp3pm + WindDir9am Temp3pm 0.99997 45 Temp3pm + WindGustDir Temp3pm, Temp3pm + WindDir9am 0.99998, 0.99997 46 Temp3pm - WindDir3pm Temp3pm, Temp3pm + WindDir9am, Temp3pm + WindG... 0.99998, 0.99993, 0.99993 47 Temp9am - WindDir9am Temp9am, RainToday_No + Temp9am, RainToday_No ... 0.99996, 0.99798, -0.99783, 0.99787, -0.9999 48 WindDir3pm - WindSpeed3pm WindSpeed3pm, Location + WindSpeed3pm, Locatio... -0.99998, -0.99998, 0.99998 49 WindGustDir + WindGustSpeed WindGustSpeed, Cloud9am - WindGustSpeed, RainT... 0.99999, -0.9843, -0.99998 50 WindGustDir - WindSpeed9am WindSpeed9am, Location - WindSpeed9am, RainTod... -0.99999, 0.99999, 0.99995 # After applying RFECV, we can plot the score per number of features atom.plot_rfecv() # Let's see how the model performs now atom.run('LGB') Running pipeline ============================= >> Models in pipeline: LGB Metric: roc_auc Results for LightGBM: Fitting ----------------------------------------- Score on the train set --> roc_auc: 0.9962 Score on the test set --> roc_auc: 0.8787 Time elapsed: 0.708s ------------------------------------------------- Total time: 0.722s Final results ========================= >> Duration: 0.723s ------------------------------------------ LightGBM --> roc_auc: 0.879 # Did the feature importance change? 
atom.plot_feature_importance(show=10) Lets try the same using Genetic Feature Generation atom = ATOMClassifier(X, n_rows=1e4, test_size=0.2, verbose=0, warnings=False, random_state=1) atom.impute(strat_num='knn', strat_cat='remove', min_frac_rows=0.8) atom.encode(max_onehot=10, frac_to_other=0.04) # Change verbosity to print extended info atom.verbose = 2 # Create new features using Genetic Programming atom.feature_generation(strategy='genetic', n_features=20, generations=10, population=2000) Fitting FeatureGenerator... | Population Average | Best Individual | ---- ------------------------- ------------------------------------------ ---------- Gen Length Fitness Length Fitness OOB Fitness Time Left 0 3.17 0.127531 3 0.50405 N/A 9.52s 1 3.10 0.338627 5 0.536586 N/A 9.04s 2 3.50 0.443734 9 0.541692 N/A 7.65s 3 4.44 0.47684 7 0.54494 N/A 6.89s 4 6.25 0.512037 13 0.546193 N/A 5.76s 5 7.47 0.507736 9 0.550266 N/A 4.62s 6 7.73 0.500405 11 0.55324 N/A 3.56s 7 7.99 0.497944 11 0.553398 N/A 2.38s 8 9.29 0.494223 13 0.554965 N/A 1.29s 9 10.68 0.493684 11 0.553398 N/A 0.00s Creating new features... --> 5 new features were added to the dataset. # We can see the feature's fitness and description through the genetic_features attribute atom.genetic_features .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } name description fitness 0 Feature 24 mul(sub(sub(sub(Humidity3pm, Sunshine), Sunshi... 0.542398 1 Feature 25 mul(sub(sub(Humidity3pm, Sunshine), Sunshine),... 0.542240 2 Feature 26 mul(sub(Humidity3pm, Sunshine), mul(sub(sub(Hu... 0.542240 3 Feature 27 mul(mul(sub(Humidity3pm, Sunshine), WindGustSp... 0.542240 4 Feature 28 mul(mul(sub(sub(Humidity3pm, Sunshine), Sunshi... 
0.542240 # And fit the model again atom.run('LGB', metric='auc') Running pipeline ============================= >> Models in pipeline: LGB Metric: roc_auc Results for LightGBM: Fitting ----------------------------------------- Score on the train set --> roc_auc: 0.9901 Score on the test set --> roc_auc: 0.8793 Time elapsed: 0.305s ------------------------------------------------- Total time: 0.313s Final results ========================= >> Duration: 0.314s ------------------------------------------ LightGBM --> roc_auc: 0.879 atom.plot_feature_importance(show=10) # We can check the feature importance with other plots as well atom.plot_permutation_importance(show=10) atom.dependence_plot()","title":"Feature engineering"},{"location":"examples/feature_engineering/feature_engineering/#feature-engineering","text":"This example shows how to use automated feature generation to improve your model's performance. The data used is a variation on the Australian weather dataset from https://www.kaggle.com/jsphyg/weather-dataset-rattle-package . 
The goal of this dataset is to predict whether or not it will rain tomorrow training a binay classifier on target RainTomorrow .","title":"Feature engineering"},{"location":"examples/feature_engineering/feature_engineering/#load-the-data","text":"# Import packages import pandas as pd from atom import ATOMClassifier # Load data X = pd.read_csv('./datasets/weatherAUS.csv') # Let's have a look at a subset of the data X.sample(frac=1).iloc[:5, :8] .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } Location MinTemp MaxTemp Rainfall Evaporation Sunshine WindGustDir WindGustSpeed 36171 WaggaWagga 14.3 21.4 0.8 10.6 5.8 W 52.0 44425 Canberra 16.0 22.8 0.0 12.4 6.0 E 50.0 126238 Walpole 13.8 20.7 4.8 NaN NaN NW 33.0 54550 Ballarat 3.3 14.7 0.0 NaN NaN N 46.0 85638 Cairns 23.5 31.5 43.8 0.8 8.5 SSE 52.0","title":"Load the data"},{"location":"examples/feature_engineering/feature_engineering/#run-the-pipeline","text":"# Initiate ATOM and apply data cleaning atom = ATOMClassifier(X, n_rows=1e4, test_size=0.2, verbose=0, random_state=1) atom.impute(strat_num='knn', strat_cat='remove', min_frac_rows=0.8) atom.encode(max_onehot=10, frac_to_other=0.04) # Let's see how a LightGBM model performs without adding additional features atom.run('LGB', metric='auc') atom.scoring() is_categorical is deprecated and will be removed in a future version. Use is_categorical_dtype instead Results ===================== >> LightGBM --> roc_auc: 0.878 # What are the most important fetaures? atom.plot_feature_importance(show=10) Now let's create some new fetaures using Deep Feature Synthesis atom.verbose = 2 # Increase verbosity to see the output # Create 100 new features using DFS atom.feature_generation(strategy='dfs', n_features=100, operators=['add', 'sub', 'log', 'sqrt']) Fitting FeatureGenerator... Creating new features... --> 100 new features were added to the dataset. 
divide by zero encountered in log invalid value encountered in log # The warnings warn us that some operators created missing values! # We can see the columns with missing values using the missing attribute atom.missing # We can easily turn off warnings in the future atom.warnings = False # We can use the impute method again atom.impute(strat_num='knn', strat_cat='remove', min_frac_rows=0.8) Fitting Imputer... Imputing missing values... --> Imputing 577 missing values using the KNN imputer in feature LOG(Cloud9am). --> Dropping feature LOG(RainToday_other) for containing 8873 (99%) missing values. --> Imputing 148 missing values using the KNN imputer in feature LOG(Sunshine). --> Imputing 6 missing values using the KNN imputer in feature LOG(Temp9am). --> Imputing 33 missing values using the KNN imputer in feature LOG(WindSpeed3pm). # 100 new features may be to much... # Let's check for multicollinearity and use RFECV to reduce the number even further atom.feature_selection(strategy='RFECV', solver='lgb', n_features=30, scoring='auc', max_correlation=0.98) Fitting FeatureSelector... Performing feature selection ... --> Feature Location was removed due to low variance. Value 0.2077375946173255 repeated in 100% of the rows. --> Feature Cloud3pm + Humidity3pm was removed due to collinearity with another feature. --> Feature Cloud3pm + RainToday_No was removed due to collinearity with another feature. --> Feature Cloud3pm + WindDir9am was removed due to collinearity with another feature. --> Feature Cloud3pm - Location was removed due to collinearity with another feature. --> Feature Cloud3pm - RainToday_No was removed due to collinearity with another feature. --> Feature Cloud9am + WindGustDir was removed due to collinearity with another feature. --> Feature Evaporation + Location was removed due to collinearity with another feature. --> Feature Evaporation + WindGustDir was removed due to collinearity with another feature. 
--> Feature Evaporation - WindDir3pm was removed due to collinearity with another feature. --> Feature Humidity3pm - RainToday_No was removed due to collinearity with another feature. --> Feature Humidity3pm - Sunshine was removed due to collinearity with another feature. --> Feature Humidity9am + RainToday_Yes was removed due to collinearity with another feature. --> Feature Humidity9am - RainToday_No was removed due to collinearity with another feature. --> Feature Humidity9am - Sunshine was removed due to collinearity with another feature. --> Feature LOG(MaxTemp) was removed due to collinearity with another feature. --> Feature Location + MinTemp was removed due to collinearity with another feature. --> Feature Location + RainToday_No was removed due to collinearity with another feature. --> Feature Location + WindDir3pm was removed due to collinearity with another feature. --> Feature Location + WindGustDir was removed due to collinearity with another feature. --> Feature Location + WindSpeed3pm was removed due to collinearity with another feature. --> Feature Location - RainToday_Yes was removed due to collinearity with another feature. --> Feature MaxTemp + RainToday_No was removed due to collinearity with another feature. --> Feature MaxTemp + RainToday_Yes was removed due to collinearity with another feature. --> Feature MinTemp + WindGustDir was removed due to collinearity with another feature. --> Feature Pressure3pm + RainToday_other was removed due to collinearity with another feature. --> Feature Pressure3pm + Temp3pm was removed due to collinearity with another feature. --> Feature Pressure3pm - WindGustDir was removed due to collinearity with another feature. --> Feature Pressure9am - WindGustDir was removed due to collinearity with another feature. --> Feature RainToday_No + Temp9am was removed due to collinearity with another feature. --> Feature RainToday_No + WindGustDir was removed due to collinearity with another feature. 
--> Feature RainToday_No - WindDir9am was removed due to collinearity with another feature. --> Feature RainToday_Yes + Temp9am was removed due to collinearity with another feature. --> Feature RainToday_Yes + WindDir3pm was removed due to collinearity with another feature. --> Feature RainToday_Yes + WindDir9am was removed due to collinearity with another feature. --> Feature RainToday_Yes - WindDir9am was removed due to collinearity with another feature. --> Feature RainToday_other - Temp9am was removed due to collinearity with another feature. --> Feature RainToday_other - WindGustSpeed was removed due to collinearity with another feature. --> Feature RainToday_other - WindSpeed9am was removed due to collinearity with another feature. --> Feature Rainfall + RainToday_No was removed due to collinearity with another feature. --> Feature Rainfall + WindDir9am was removed due to collinearity with another feature. --> Feature Rainfall - WindDir3pm was removed due to collinearity with another feature. --> Feature SQRT(Humidity3pm) was removed due to collinearity with another feature. --> Feature SQRT(Pressure9am) was removed due to collinearity with another feature. --> Feature Sunshine + WindDir9am was removed due to collinearity with another feature. --> Feature Temp3pm + WindDir9am was removed due to collinearity with another feature. --> Feature Temp3pm + WindGustDir was removed due to collinearity with another feature. --> Feature Temp3pm - WindDir3pm was removed due to collinearity with another feature. --> Feature Temp9am - WindDir9am was removed due to collinearity with another feature. --> Feature WindDir3pm - WindSpeed3pm was removed due to collinearity with another feature. --> Feature WindGustDir + WindGustSpeed was removed due to collinearity with another feature. --> Feature WindGustDir - WindSpeed9am was removed due to collinearity with another feature. --> The RFECV selected 64 features from the dataset. >>> Dropping feature RainToday_Yes (rank 3). 
>>> Dropping feature RainToday_No (rank 5). >>> Dropping feature Location - WindSpeed9am (rank 2). >>> Dropping feature SQRT(Cloud9am) (rank 7). >>> Dropping feature SQRT(Rainfall) (rank 6). >>> Dropping feature SQRT(WindSpeed9am) (rank 4). # The collinear attribute shows what features were removed due to multicollinearity atom.collinear .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } drop_feature correlated_feature correlation_value 0 Cloud3pm + Humidity3pm Humidity3pm 0.99578 1 Cloud3pm + RainToday_No Cloud3pm 0.98122 2 Cloud3pm + WindDir9am Cloud3pm, Cloud3pm + RainToday_No 0.99968, 0.98054 3 Cloud3pm - Location Cloud3pm, Cloud3pm + RainToday_No, Cloud3pm + ... 1.0, 0.98122, 0.99968 4 Cloud3pm - RainToday_No Cloud3pm, Cloud3pm + WindDir9am, Cloud3pm - Lo... 0.98405, 0.98408, 0.98405 5 Cloud9am + WindGustDir Cloud9am 0.99979 6 Evaporation + Location Evaporation 1.0 7 Evaporation + WindGustDir Evaporation, Evaporation + Location 0.9999, 0.9999 8 Evaporation - WindDir3pm Evaporation, Evaporation + Location, Evaporati... 0.9999, 0.9999, 0.99969 9 Humidity3pm - RainToday_No Humidity3pm, Cloud3pm + Humidity3pm 0.99983, 0.99572 10 Humidity3pm - Sunshine Humidity3pm, Cloud3pm + Humidity3pm, Humidity3... 0.99347, 0.99405, 0.9935 11 Humidity9am + RainToday_Yes Humidity9am 0.9998 12 Humidity9am - RainToday_No Humidity9am, Humidity9am + RainToday_Yes 0.9998, 0.99999 13 Humidity9am - Sunshine Humidity9am, Humidity9am + RainToday_Yes, Humi... 0.99165, 0.99183, 0.99184 14 LOG(MaxTemp) MaxTemp 0.98395 15 Location + MinTemp MinTemp 1.0 16 Location + RainToday_No RainToday_Yes, RainToday_No -0.98403, 1.0 17 Location + WindDir3pm WindDir3pm 1.0 18 Location + WindGustDir WindGustDir 1.0 19 Location + WindSpeed3pm WindSpeed3pm 1.0 20 Location - RainToday_Yes RainToday_Yes, RainToday_No, Location + RainTo... 
-1.0, 0.98403, 0.98403 21 MaxTemp + RainToday_No MaxTemp, LOG(MaxTemp) 0.99841, 0.9831 22 MaxTemp + RainToday_Yes MaxTemp, LOG(MaxTemp), MaxTemp + RainToday_No 0.99834, 0.98156, 0.99356 23 MinTemp + WindGustDir MinTemp, Location + MinTemp 0.99997, 0.99997 24 Pressure3pm + RainToday_other Pressure3pm 0.99995 25 Pressure3pm + Temp3pm MaxTemp + Pressure3pm 0.98005 26 Pressure3pm - WindGustDir Pressure3pm, Pressure3pm + RainToday_other 0.99998, 0.99992 27 Pressure9am - WindGustDir Pressure9am 0.99998 28 RainToday_No + Temp9am Temp9am 0.99797 29 RainToday_No + WindGustDir RainToday_No, Location + RainToday_No 0.9933, 0.9933 30 RainToday_No - WindDir9am RainToday_No, Location + RainToday_No 0.99169, 0.99169 31 RainToday_Yes + Temp9am Temp9am, RainToday_No + Temp9am, RainToday_No ... 0.99795, 0.99191, -0.99993 32 RainToday_Yes + WindDir3pm RainToday_Yes, Location - RainToday_Yes 0.99334, -0.99334 33 RainToday_Yes + WindDir9am RainToday_Yes, Location - RainToday_Yes, RainT... 0.99154, -0.99154, -0.9847, 0.98993 34 RainToday_Yes - WindDir9am RainToday_Yes, Location - RainToday_Yes 0.9911, -0.9911 35 RainToday_other - Temp9am Temp9am, RainToday_No + Temp9am, RainToday_No ... -0.99993, -0.998, 0.99775, -0.99792 36 RainToday_other - WindGustSpeed WindGustSpeed, Cloud9am - WindGustSpeed -0.99998, 0.98438 37 RainToday_other - WindSpeed9am WindSpeed9am, Location - WindSpeed9am -0.99997, 0.99997 38 Rainfall + RainToday_No Rainfall 0.99907 39 Rainfall + WindDir9am Rainfall, Rainfall + RainToday_No 0.99998, 0.99902 40 Rainfall - WindDir3pm Rainfall, Rainfall + RainToday_No, Rainfall + ... 0.99998, 0.99907, 0.99995 41 SQRT(Humidity3pm) Humidity3pm, Cloud3pm + Humidity3pm, Humidity3... 
0.98722, 0.98193, 0.98674 42 SQRT(Pressure9am) Pressure9am, Pressure9am - WindGustDir 1.0, 0.99998 43 Sunshine + WindDir9am Sunshine, RainToday_other - Sunshine 0.99982, -0.99948 44 Temp3pm + WindDir9am Temp3pm 0.99997 45 Temp3pm + WindGustDir Temp3pm, Temp3pm + WindDir9am 0.99998, 0.99997 46 Temp3pm - WindDir3pm Temp3pm, Temp3pm + WindDir9am, Temp3pm + WindG... 0.99998, 0.99993, 0.99993 47 Temp9am - WindDir9am Temp9am, RainToday_No + Temp9am, RainToday_No ... 0.99996, 0.99798, -0.99783, 0.99787, -0.9999 48 WindDir3pm - WindSpeed3pm WindSpeed3pm, Location + WindSpeed3pm, Locatio... -0.99998, -0.99998, 0.99998 49 WindGustDir + WindGustSpeed WindGustSpeed, Cloud9am - WindGustSpeed, RainT... 0.99999, -0.9843, -0.99998 50 WindGustDir - WindSpeed9am WindSpeed9am, Location - WindSpeed9am, RainTod... -0.99999, 0.99999, 0.99995 # After applying RFECV, we can plot the score per number of features atom.plot_rfecv() # Let's see how the model performs now atom.run('LGB') Running pipeline ============================= >> Models in pipeline: LGB Metric: roc_auc Results for LightGBM: Fitting ----------------------------------------- Score on the train set --> roc_auc: 0.9962 Score on the test set --> roc_auc: 0.8787 Time elapsed: 0.708s ------------------------------------------------- Total time: 0.722s Final results ========================= >> Duration: 0.723s ------------------------------------------ LightGBM --> roc_auc: 0.879 # Did the feature importance change? 
atom.plot_feature_importance(show=10) Lets try the same using Genetic Feature Generation atom = ATOMClassifier(X, n_rows=1e4, test_size=0.2, verbose=0, warnings=False, random_state=1) atom.impute(strat_num='knn', strat_cat='remove', min_frac_rows=0.8) atom.encode(max_onehot=10, frac_to_other=0.04) # Change verbosity to print extended info atom.verbose = 2 # Create new features using Genetic Programming atom.feature_generation(strategy='genetic', n_features=20, generations=10, population=2000) Fitting FeatureGenerator... | Population Average | Best Individual | ---- ------------------------- ------------------------------------------ ---------- Gen Length Fitness Length Fitness OOB Fitness Time Left 0 3.17 0.127531 3 0.50405 N/A 9.52s 1 3.10 0.338627 5 0.536586 N/A 9.04s 2 3.50 0.443734 9 0.541692 N/A 7.65s 3 4.44 0.47684 7 0.54494 N/A 6.89s 4 6.25 0.512037 13 0.546193 N/A 5.76s 5 7.47 0.507736 9 0.550266 N/A 4.62s 6 7.73 0.500405 11 0.55324 N/A 3.56s 7 7.99 0.497944 11 0.553398 N/A 2.38s 8 9.29 0.494223 13 0.554965 N/A 1.29s 9 10.68 0.493684 11 0.553398 N/A 0.00s Creating new features... --> 5 new features were added to the dataset. # We can see the feature's fitness and description through the genetic_features attribute atom.genetic_features .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } name description fitness 0 Feature 24 mul(sub(sub(sub(Humidity3pm, Sunshine), Sunshi... 0.542398 1 Feature 25 mul(sub(sub(Humidity3pm, Sunshine), Sunshine),... 0.542240 2 Feature 26 mul(sub(Humidity3pm, Sunshine), mul(sub(sub(Hu... 0.542240 3 Feature 27 mul(mul(sub(Humidity3pm, Sunshine), WindGustSp... 0.542240 4 Feature 28 mul(mul(sub(sub(Humidity3pm, Sunshine), Sunshi... 
0.542240 # And fit the model again atom.run('LGB', metric='auc') Running pipeline ============================= >> Models in pipeline: LGB Metric: roc_auc Results for LightGBM: Fitting ----------------------------------------- Score on the train set --> roc_auc: 0.9901 Score on the test set --> roc_auc: 0.8793 Time elapsed: 0.305s ------------------------------------------------- Total time: 0.313s Final results ========================= >> Duration: 0.314s ------------------------------------------ LightGBM --> roc_auc: 0.879 atom.plot_feature_importance(show=10) # We can check the feature importance with other plots as well atom.plot_permutation_importance(show=10) atom.dependence_plot()","title":"Run the pipeline"},{"location":"examples/multi_metric/multi_metric/","text":"Multi-metric This example shows how we can evaluate an ATOM pipeline on multiple metrics. Import the breast cancer dataset from sklearn.datasets . This is a small and easy to train dataset whose goal is to predict whether a patient has breast cancer or not. Load the data # Import packages from sklearn.datasets import load_breast_cancer from atom import ATOMClassifier # Get the dataset's features and targets X, y = load_breast_cancer(return_X_y=True) Run the pipeline # Call ATOM and run the pipeline using multipe metrics # Note that for every step of the BO, both metrics are calculated, but only the first is used for optimization! atom = ATOMClassifier(X, y, n_jobs=2, verbose=2, warnings=False, random_state=1) atom.run(['MNB', 'QDA'], metric=('f1', 'recall'), n_calls=3, n_initial_points=1, bagging=4) << ================== ATOM ================== >> Algorithm task: binary classification. Parallel processing with 2 cores. Applying data cleaning... 
Dataset stats ================= >> Shape: (569, 31) Scaled: False ---------------------------------- Train set size: 456 Test set size: 113 ---------------------------------- Train set balance: 0:1 <==> 0.6:1.0 Test set balance: 0:1 <==> 0.7:1.0 ---------------------------------- Instances in target per class: | | total | train_set | test_set | |---:|---------:|-------------:|------------:| | 0 | 212 | 167 | 45 | | 1 | 357 | 289 | 68 | Running pipeline ============================= >> Models in pipeline: MNB, QDA Metric: f1, recall Running BO for Multinomial Naive Bayes... Random start 1 ---------------------------------- Parameters --> {'alpha': 1, 'fit_prior': True} Evaluation --> f1: 0.9260 Best f1: 0.9260 recall: 0.9722 Best recall: 0.9722 Time iteration: 3.108s Total time: 3.124s Iteration 2 ------------------------------------- Parameters --> {'alpha': 9.744, 'fit_prior': True} Evaluation --> f1: 0.9225 Best f1: 0.9260 recall: 0.9688 Best recall: 0.9722 Time iteration: 0.048s Total time: 3.172s Iteration 3 ------------------------------------- Parameters --> {'alpha': 0.66, 'fit_prior': False} Evaluation --> f1: 0.9223 Best f1: 0.9260 recall: 0.9655 Best recall: 0.9722 Time iteration: 0.044s Total time: 3.357s Results for Multinomial Naive Bayes: Bayesian Optimization --------------------------- Best parameters --> {'alpha': 1, 'fit_prior': True} Best evaluation --> f1: 0.9260 recall: 0.9722 Time elapsed: 3.494s Fitting ----------------------------------------- Score on the train set --> f1: 0.9243 recall: 0.9723 Score on the test set --> f1: 0.9103 recall: 0.9706 Time elapsed: 0.004s Bagging ----------------------------------------- Score --> f1: 0.9100 \u00b1 0.0005 recall: 0.9669 \u00b1 0.0064 Time elapsed: 0.031s ------------------------------------------------- Total time: 3.531s Running BO for Quadratic Discriminant Analysis... 
Random start 1 ---------------------------------- Parameters --> {'reg_param': 0} Evaluation --> f1: 0.9654 Best f1: 0.9654 recall: 0.9619 Best recall: 0.9619 Time iteration: 0.031s Total time: 0.031s Iteration 2 ------------------------------------- Parameters --> {'reg_param': 1.0} Evaluation --> f1: 0.9245 Best f1: 0.9654 recall: 0.9897 Best recall: 0.9897 Time iteration: 0.031s Total time: 0.063s Iteration 3 ------------------------------------- Parameters --> {'reg_param': 0.0} Evaluation --> f1: 0.9633 Best f1: 0.9654 recall: 0.9549 Best recall: 0.9897 Time iteration: 0.031s Total time: 0.188s Results for Quadratic Discriminant Analysis: Bayesian Optimization --------------------------- Best parameters --> {'reg_param': 0} Best evaluation --> f1: 0.9654 recall: 0.9619 Time elapsed: 0.297s Fitting ----------------------------------------- Score on the train set --> f1: 0.9828 recall: 0.9896 Score on the test set --> f1: 0.9710 recall: 0.9853 Time elapsed: 0.016s Bagging ----------------------------------------- Score --> f1: 0.9606 \u00b1 0.0081 recall: 0.9853 \u00b1 0.0104 Time elapsed: 0.031s ------------------------------------------------- Total time: 0.344s Final results ========================= >> Duration: 3.875s ------------------------------------------ Multinomial Naive Bayes --> f1: 0.910 \u00b1 0.001 recall: 0.967 \u00b1 0.006 Quadratic Discriminant Analysis --> f1: 0.961 \u00b1 0.008 recall: 0.985 \u00b1 0.010 ! 
Analyze the results # Note that some columns in the results dataframe now contain a list of scores, # one for each metric, in the same order as you called them atom.results[['metric_bo', 'metric_train', 'metric_test']] .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } metric_bo metric_train metric_test model MNB [0.9259597646215939, 0.9722323049001815] [0.924342105263158, 0.972318339100346] [0.9103448275862068, 0.9705882352941176] QDA [0.965402611638704, 0.9618874773139746] [0.9828178694158075, 0.9896193771626297] [0.9710144927536232, 0.9852941176470589] # Some plots allow us to choose the metric we want to show atom.plot_bagging(metric='recall')","title":"Multi-metric"},{"location":"examples/multi_metric/multi_metric/#multi-metric","text":"This example shows how we can evaluate an ATOM pipeline on multiple metrics. Import the breast cancer dataset from sklearn.datasets . This is a small and easy to train dataset whose goal is to predict whether a patient has breast cancer or not.","title":"Multi-metric"},{"location":"examples/multi_metric/multi_metric/#load-the-data","text":"# Import packages from sklearn.datasets import load_breast_cancer from atom import ATOMClassifier # Get the dataset's features and targets X, y = load_breast_cancer(return_X_y=True)","title":"Load the data"},{"location":"examples/multi_metric/multi_metric/#run-the-pipeline","text":"# Call ATOM and run the pipeline using multipe metrics # Note that for every step of the BO, both metrics are calculated, but only the first is used for optimization! atom = ATOMClassifier(X, y, n_jobs=2, verbose=2, warnings=False, random_state=1) atom.run(['MNB', 'QDA'], metric=('f1', 'recall'), n_calls=3, n_initial_points=1, bagging=4) << ================== ATOM ================== >> Algorithm task: binary classification. Parallel processing with 2 cores. Applying data cleaning... 
Dataset stats ================= >> Shape: (569, 31) Scaled: False ---------------------------------- Train set size: 456 Test set size: 113 ---------------------------------- Train set balance: 0:1 <==> 0.6:1.0 Test set balance: 0:1 <==> 0.7:1.0 ---------------------------------- Instances in target per class: | | total | train_set | test_set | |---:|---------:|-------------:|------------:| | 0 | 212 | 167 | 45 | | 1 | 357 | 289 | 68 | Running pipeline ============================= >> Models in pipeline: MNB, QDA Metric: f1, recall Running BO for Multinomial Naive Bayes... Random start 1 ---------------------------------- Parameters --> {'alpha': 1, 'fit_prior': True} Evaluation --> f1: 0.9260 Best f1: 0.9260 recall: 0.9722 Best recall: 0.9722 Time iteration: 3.108s Total time: 3.124s Iteration 2 ------------------------------------- Parameters --> {'alpha': 9.744, 'fit_prior': True} Evaluation --> f1: 0.9225 Best f1: 0.9260 recall: 0.9688 Best recall: 0.9722 Time iteration: 0.048s Total time: 3.172s Iteration 3 ------------------------------------- Parameters --> {'alpha': 0.66, 'fit_prior': False} Evaluation --> f1: 0.9223 Best f1: 0.9260 recall: 0.9655 Best recall: 0.9722 Time iteration: 0.044s Total time: 3.357s Results for Multinomial Naive Bayes: Bayesian Optimization --------------------------- Best parameters --> {'alpha': 1, 'fit_prior': True} Best evaluation --> f1: 0.9260 recall: 0.9722 Time elapsed: 3.494s Fitting ----------------------------------------- Score on the train set --> f1: 0.9243 recall: 0.9723 Score on the test set --> f1: 0.9103 recall: 0.9706 Time elapsed: 0.004s Bagging ----------------------------------------- Score --> f1: 0.9100 \u00b1 0.0005 recall: 0.9669 \u00b1 0.0064 Time elapsed: 0.031s ------------------------------------------------- Total time: 3.531s Running BO for Quadratic Discriminant Analysis... 
Random start 1 ---------------------------------- Parameters --> {'reg_param': 0} Evaluation --> f1: 0.9654 Best f1: 0.9654 recall: 0.9619 Best recall: 0.9619 Time iteration: 0.031s Total time: 0.031s Iteration 2 ------------------------------------- Parameters --> {'reg_param': 1.0} Evaluation --> f1: 0.9245 Best f1: 0.9654 recall: 0.9897 Best recall: 0.9897 Time iteration: 0.031s Total time: 0.063s Iteration 3 ------------------------------------- Parameters --> {'reg_param': 0.0} Evaluation --> f1: 0.9633 Best f1: 0.9654 recall: 0.9549 Best recall: 0.9897 Time iteration: 0.031s Total time: 0.188s Results for Quadratic Discriminant Analysis: Bayesian Optimization --------------------------- Best parameters --> {'reg_param': 0} Best evaluation --> f1: 0.9654 recall: 0.9619 Time elapsed: 0.297s Fitting ----------------------------------------- Score on the train set --> f1: 0.9828 recall: 0.9896 Score on the test set --> f1: 0.9710 recall: 0.9853 Time elapsed: 0.016s Bagging ----------------------------------------- Score --> f1: 0.9606 \u00b1 0.0081 recall: 0.9853 \u00b1 0.0104 Time elapsed: 0.031s ------------------------------------------------- Total time: 0.344s Final results ========================= >> Duration: 3.875s ------------------------------------------ Multinomial Naive Bayes --> f1: 0.910 \u00b1 0.001 recall: 0.967 \u00b1 0.006 Quadratic Discriminant Analysis --> f1: 0.961 \u00b1 0.008 recall: 0.985 \u00b1 0.010 !","title":"Run the pipeline"},{"location":"examples/multi_metric/multi_metric/#analyze-the-results","text":"# Note that some columns in the results dataframe now contain a list of scores, # one for each metric, in the same order as you called them atom.results[['metric_bo', 'metric_train', 'metric_test']] .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } metric_bo metric_train metric_test model MNB [0.9259597646215939, 
0.9722323049001815] [0.924342105263158, 0.972318339100346] [0.9103448275862068, 0.9705882352941176] QDA [0.965402611638704, 0.9618874773139746] [0.9828178694158075, 0.9896193771626297] [0.9710144927536232, 0.9852941176470589] # Some plots allow us to choose the metric we want to show atom.plot_bagging(metric='recall')","title":"Analyze the results"},{"location":"examples/multiclass_classification/multiclass_classification/","text":"Multiclass classification This example shows how to compare the performance of three models on a multiclass classification task. Import the wine dataset from sklearn.datasets . This is a small and easy to train dataset whose goal is to predict wines into three groups (which cultivator it's from) using features based on the results of chemical analysis. Load the data # Import packages from sklearn.datasets import load_wine from atom import ATOMClassifier # Load the dataset's features and targets X, y = load_wine(return_X_y=True, as_frame=True) # Let's have a look at a subsample of the data X.sample(frac=1).iloc[:5, :8] .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } alcohol malic_acid ash alcalinity_of_ash magnesium total_phenols flavanoids nonflavanoid_phenols 101 12.60 1.34 1.90 18.5 88.0 1.45 1.36 0.29 133 12.70 3.55 2.36 21.5 106.0 1.70 1.20 0.17 86 12.16 1.61 2.31 22.8 90.0 1.78 1.69 0.43 93 12.29 2.83 2.22 18.0 88.0 2.45 2.25 0.25 92 12.69 1.53 2.26 20.7 80.0 1.38 1.46 0.58 Run the pipeline atom = ATOMClassifier(X, y, n_jobs=-1, warnings='ignore', verbose=2, random_state=1) # Fit the pipeline with the selected models atom.run(models=['LR','LDA', 'RF'], metric='roc_auc_ovr', n_calls=4, n_initial_points=3, bo_params={'base_estimator': 'rf', 'max_time': 100}, bagging=5) << ================== ATOM ================== >> Algorithm task: multiclass classification. Parallel processing with 16 cores. Applying data cleaning... 
Dataset stats ================= >> Shape: (178, 14) Scaled: False ---------------------------------- Train set size: 143 Test set size: 35 ---------------------------------- Train set balance: 0:1:2 <==> 1.4:1.7:1.0 Test set balance: 0:1:2 <==> 0.7:1.0:1.0 ---------------------------------- Instances in target per class: | | total | train_set | test_set | |---:|---------:|-------------:|------------:| | 0 | 59 | 50 | 9 | | 1 | 71 | 58 | 13 | | 2 | 48 | 35 | 13 | Running pipeline ============================= >> Models in pipeline: LR, LDA, RF Metric: roc_auc_ovr Running BO for Logistic Regression... Random start 1 ---------------------------------- Parameters --> {'max_iter': 335, 'solver': 'sag', 'penalty': 'l2', 'C': 0.001} Evaluation --> roc_auc_ovr: 0.9970 Best roc_auc_ovr: 0.9970 Time iteration: 3.971s Total time: 3.975s Random start 2 ---------------------------------- Parameters --> {'max_iter': 244, 'solver': 'lbfgs', 'penalty': 'l2', 'C': 0.087} Evaluation --> roc_auc_ovr: 1.0000 Best roc_auc_ovr: 1.0000 Time iteration: 3.769s Total time: 7.748s Random start 3 ---------------------------------- Parameters --> {'max_iter': 376, 'solver': 'liblinear', 'penalty': 'l2', 'C': 2.667} Evaluation --> roc_auc_ovr: 1.0000 Best roc_auc_ovr: 1.0000 Time iteration: 3.589s Total time: 11.342s Iteration 4 ------------------------------------- Parameters --> {'max_iter': 498, 'solver': 'sag', 'penalty': 'l2', 'C': 0.882} Evaluation --> roc_auc_ovr: 1.0000 Best roc_auc_ovr: 1.0000 Time iteration: 4.328s Total time: 15.920s Results for Logistic Regression: Bayesian Optimization --------------------------- Best parameters --> {'max_iter': 244, 'solver': 'lbfgs', 'penalty': 'l2', 'C': 0.087} Best evaluation --> roc_auc_ovr: 1.0000 Time elapsed: 16.151s Fitting ----------------------------------------- Score on the train set --> roc_auc_ovr: 1.0000 Score on the test set --> roc_auc_ovr: 0.9988 Time elapsed: 0.020s Bagging ----------------------------------------- Score --> 
roc_auc_ovr: 0.9991 \u00b1 0.0009 Time elapsed: 0.072s ------------------------------------------------- Total time: 16.249s Running BO for Linear Discriminant Analysis... Random start 1 ---------------------------------- Parameters --> {'solver': 'eigen', 'shrinkage': 1.0} Evaluation --> roc_auc_ovr: 0.8975 Best roc_auc_ovr: 0.8975 Time iteration: 0.021s Total time: 0.022s Random start 2 ---------------------------------- Parameters --> {'solver': 'svd'} Evaluation --> roc_auc_ovr: 1.0000 Best roc_auc_ovr: 1.0000 Time iteration: 0.021s Total time: 0.047s Random start 3 ---------------------------------- Parameters --> {'solver': 'svd'} Evaluation --> roc_auc_ovr: 1.0000 Best roc_auc_ovr: 1.0000 Time iteration: 0.018s Total time: 0.068s Iteration 4 ------------------------------------- Parameters --> {'solver': 'lsqr', 'shrinkage': 0.7} Evaluation --> roc_auc_ovr: 0.8996 Best roc_auc_ovr: 1.0000 Time iteration: 0.020s Total time: 0.279s Results for Linear Discriminant Analysis: Bayesian Optimization --------------------------- Best parameters --> {'solver': 'svd'} Best evaluation --> roc_auc_ovr: 1.0000 Time elapsed: 0.474s Fitting ----------------------------------------- Score on the train set --> roc_auc_ovr: 1.0000 Score on the test set --> roc_auc_ovr: 1.0000 Time elapsed: 0.010s Bagging ----------------------------------------- Score --> roc_auc_ovr: 0.9998 \u00b1 0.0005 Time elapsed: 0.024s ------------------------------------------------- Total time: 0.510s Running BO for Random Forest... 
Random start 1 ---------------------------------- Parameters --> {'n_estimators': 245, 'max_depth': 7, 'max_features': 1.0, 'criterion': 'gini', 'min_samples_split': 7, 'min_samples_leaf': 16, 'ccp_alpha': 0.008, 'bootstrap': True, 'max_samples': 0.6} Evaluation --> roc_auc_ovr: 0.9853 Best roc_auc_ovr: 0.9853 Time iteration: 0.412s Total time: 0.418s Random start 2 ---------------------------------- Parameters --> {'n_estimators': 400, 'max_depth': 4, 'max_features': 0.8, 'criterion': 'gini', 'min_samples_split': 20, 'min_samples_leaf': 12, 'ccp_alpha': 0.016, 'bootstrap': True, 'max_samples': 0.7} Evaluation --> roc_auc_ovr: 0.9937 Best roc_auc_ovr: 0.9937 Time iteration: 0.642s Total time: 1.063s Random start 3 ---------------------------------- Parameters --> {'n_estimators': 78, 'max_depth': 10, 'max_features': 0.7, 'criterion': 'gini', 'min_samples_split': 2, 'min_samples_leaf': 14, 'ccp_alpha': 0.025, 'bootstrap': False} Evaluation --> roc_auc_ovr: 0.9865 Best roc_auc_ovr: 0.9937 Time iteration: 0.122s Total time: 1.190s Iteration 4 ------------------------------------- Parameters --> {'n_estimators': 323, 'max_depth': 7, 'max_features': 1.0, 'criterion': 'gini', 'min_samples_split': 16, 'min_samples_leaf': 1, 'ccp_alpha': 0.007, 'bootstrap': False} Evaluation --> roc_auc_ovr: 0.9315 Best roc_auc_ovr: 0.9937 Time iteration: 0.405s Total time: 1.823s Results for Random Forest: Bayesian Optimization --------------------------- Best parameters --> {'n_estimators': 400, 'max_depth': 4, 'max_features': 0.8, 'criterion': 'gini', 'min_samples_split': 20, 'min_samples_leaf': 12, 'ccp_alpha': 0.016, 'bootstrap': True, 'max_samples': 0.7} Best evaluation --> roc_auc_ovr: 0.9937 Time elapsed: 2.056s Fitting ----------------------------------------- Score on the train set --> roc_auc_ovr: 0.9997 Score on the test set --> roc_auc_ovr: 0.9825 Time elapsed: 0.588s Bagging ----------------------------------------- Score --> roc_auc_ovr: 0.9737 \u00b1 0.0116 Time elapsed: 
2.716s ------------------------------------------------- Total time: 5.363s Final results ========================= >> Duration: 22.125s ------------------------------------------ Logistic Regression --> roc_auc_ovr: 0.999 \u00b1 0.001 Linear Discriminant Analysis --> roc_auc_ovr: 1.000 \u00b1 0.000 ! Random Forest --> roc_auc_ovr: 0.974 \u00b1 0.012 Analyze the results # We can access the pipeline's results via the results attribute atom.results .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } name metric_bo time_bo metric_train metric_test time_fit mean_bagging std_bagging time_bagging time model LR Logistic Regression 1.000000 16.151s 1.000000 0.998834 0.020s 0.999068 0.000872 0.072s 16.249s LDA Linear Discriminant Analysis 1.000000 0.474s 1.000000 1.000000 0.010s 0.999767 0.000466 0.024s 0.510s RF Random Forest 0.993712 2.056s 0.999725 0.982517 0.588s 0.973686 0.011577 2.716s 5.363s # Show the scoring for a different metric than the one we trained on atom.scoring('precision_macro') Results ===================== >> Logistic Regression --> precision_macro: 1.0 Linear Discriminant Analysis --> precision_macro: 0.976 Random Forest --> precision_macro: 0.9 Let's have a closer look at the Random Forest # Get the results on some other metrics print('Jaccard score:', atom.rf.scoring('jaccard_weighted')) print('Recall score:', atom.rf.scoring('recall_macro')) Jaccard score: 0.7957142857142857 Recall score: 0.8974358974358975 # Plot the confusion matrix atom.RF.plot_confusion_matrix(figsize=(9, 9)) # Save the estimator as a pickle file atom.RF.save_estimator('Random_Forest_model') Random Forest estimator saved successfully!","title":"Multiclass_classification"},{"location":"examples/multiclass_classification/multiclass_classification/#multiclass-classification","text":"This example shows how to compare the performance of three models on a multiclass classification 
task. Import the wine dataset from sklearn.datasets . This is a small and easy to train dataset whose goal is to predict wines into three groups (which cultivator it's from) using features based on the results of chemical analysis.","title":"Multiclass classification"},{"location":"examples/multiclass_classification/multiclass_classification/#load-the-data","text":"# Import packages from sklearn.datasets import load_wine from atom import ATOMClassifier # Load the dataset's features and targets X, y = load_wine(return_X_y=True, as_frame=True) # Let's have a look at a subsample of the data X.sample(frac=1).iloc[:5, :8] .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } alcohol malic_acid ash alcalinity_of_ash magnesium total_phenols flavanoids nonflavanoid_phenols 101 12.60 1.34 1.90 18.5 88.0 1.45 1.36 0.29 133 12.70 3.55 2.36 21.5 106.0 1.70 1.20 0.17 86 12.16 1.61 2.31 22.8 90.0 1.78 1.69 0.43 93 12.29 2.83 2.22 18.0 88.0 2.45 2.25 0.25 92 12.69 1.53 2.26 20.7 80.0 1.38 1.46 0.58","title":"Load the data"},{"location":"examples/multiclass_classification/multiclass_classification/#run-the-pipeline","text":"atom = ATOMClassifier(X, y, n_jobs=-1, warnings='ignore', verbose=2, random_state=1) # Fit the pipeline with the selected models atom.run(models=['LR','LDA', 'RF'], metric='roc_auc_ovr', n_calls=4, n_initial_points=3, bo_params={'base_estimator': 'rf', 'max_time': 100}, bagging=5) << ================== ATOM ================== >> Algorithm task: multiclass classification. Parallel processing with 16 cores. Applying data cleaning... 
Dataset stats ================= >> Shape: (178, 14) Scaled: False ---------------------------------- Train set size: 143 Test set size: 35 ---------------------------------- Train set balance: 0:1:2 <==> 1.4:1.7:1.0 Test set balance: 0:1:2 <==> 0.7:1.0:1.0 ---------------------------------- Instances in target per class: | | total | train_set | test_set | |---:|---------:|-------------:|------------:| | 0 | 59 | 50 | 9 | | 1 | 71 | 58 | 13 | | 2 | 48 | 35 | 13 | Running pipeline ============================= >> Models in pipeline: LR, LDA, RF Metric: roc_auc_ovr Running BO for Logistic Regression... Random start 1 ---------------------------------- Parameters --> {'max_iter': 335, 'solver': 'sag', 'penalty': 'l2', 'C': 0.001} Evaluation --> roc_auc_ovr: 0.9970 Best roc_auc_ovr: 0.9970 Time iteration: 3.971s Total time: 3.975s Random start 2 ---------------------------------- Parameters --> {'max_iter': 244, 'solver': 'lbfgs', 'penalty': 'l2', 'C': 0.087} Evaluation --> roc_auc_ovr: 1.0000 Best roc_auc_ovr: 1.0000 Time iteration: 3.769s Total time: 7.748s Random start 3 ---------------------------------- Parameters --> {'max_iter': 376, 'solver': 'liblinear', 'penalty': 'l2', 'C': 2.667} Evaluation --> roc_auc_ovr: 1.0000 Best roc_auc_ovr: 1.0000 Time iteration: 3.589s Total time: 11.342s Iteration 4 ------------------------------------- Parameters --> {'max_iter': 498, 'solver': 'sag', 'penalty': 'l2', 'C': 0.882} Evaluation --> roc_auc_ovr: 1.0000 Best roc_auc_ovr: 1.0000 Time iteration: 4.328s Total time: 15.920s Results for Logistic Regression: Bayesian Optimization --------------------------- Best parameters --> {'max_iter': 244, 'solver': 'lbfgs', 'penalty': 'l2', 'C': 0.087} Best evaluation --> roc_auc_ovr: 1.0000 Time elapsed: 16.151s Fitting ----------------------------------------- Score on the train set --> roc_auc_ovr: 1.0000 Score on the test set --> roc_auc_ovr: 0.9988 Time elapsed: 0.020s Bagging ----------------------------------------- Score --> 
roc_auc_ovr: 0.9991 \u00b1 0.0009 Time elapsed: 0.072s ------------------------------------------------- Total time: 16.249s Running BO for Linear Discriminant Analysis... Random start 1 ---------------------------------- Parameters --> {'solver': 'eigen', 'shrinkage': 1.0} Evaluation --> roc_auc_ovr: 0.8975 Best roc_auc_ovr: 0.8975 Time iteration: 0.021s Total time: 0.022s Random start 2 ---------------------------------- Parameters --> {'solver': 'svd'} Evaluation --> roc_auc_ovr: 1.0000 Best roc_auc_ovr: 1.0000 Time iteration: 0.021s Total time: 0.047s Random start 3 ---------------------------------- Parameters --> {'solver': 'svd'} Evaluation --> roc_auc_ovr: 1.0000 Best roc_auc_ovr: 1.0000 Time iteration: 0.018s Total time: 0.068s Iteration 4 ------------------------------------- Parameters --> {'solver': 'lsqr', 'shrinkage': 0.7} Evaluation --> roc_auc_ovr: 0.8996 Best roc_auc_ovr: 1.0000 Time iteration: 0.020s Total time: 0.279s Results for Linear Discriminant Analysis: Bayesian Optimization --------------------------- Best parameters --> {'solver': 'svd'} Best evaluation --> roc_auc_ovr: 1.0000 Time elapsed: 0.474s Fitting ----------------------------------------- Score on the train set --> roc_auc_ovr: 1.0000 Score on the test set --> roc_auc_ovr: 1.0000 Time elapsed: 0.010s Bagging ----------------------------------------- Score --> roc_auc_ovr: 0.9998 \u00b1 0.0005 Time elapsed: 0.024s ------------------------------------------------- Total time: 0.510s Running BO for Random Forest... 
Random start 1 ---------------------------------- Parameters --> {'n_estimators': 245, 'max_depth': 7, 'max_features': 1.0, 'criterion': 'gini', 'min_samples_split': 7, 'min_samples_leaf': 16, 'ccp_alpha': 0.008, 'bootstrap': True, 'max_samples': 0.6} Evaluation --> roc_auc_ovr: 0.9853 Best roc_auc_ovr: 0.9853 Time iteration: 0.412s Total time: 0.418s Random start 2 ---------------------------------- Parameters --> {'n_estimators': 400, 'max_depth': 4, 'max_features': 0.8, 'criterion': 'gini', 'min_samples_split': 20, 'min_samples_leaf': 12, 'ccp_alpha': 0.016, 'bootstrap': True, 'max_samples': 0.7} Evaluation --> roc_auc_ovr: 0.9937 Best roc_auc_ovr: 0.9937 Time iteration: 0.642s Total time: 1.063s Random start 3 ---------------------------------- Parameters --> {'n_estimators': 78, 'max_depth': 10, 'max_features': 0.7, 'criterion': 'gini', 'min_samples_split': 2, 'min_samples_leaf': 14, 'ccp_alpha': 0.025, 'bootstrap': False} Evaluation --> roc_auc_ovr: 0.9865 Best roc_auc_ovr: 0.9937 Time iteration: 0.122s Total time: 1.190s Iteration 4 ------------------------------------- Parameters --> {'n_estimators': 323, 'max_depth': 7, 'max_features': 1.0, 'criterion': 'gini', 'min_samples_split': 16, 'min_samples_leaf': 1, 'ccp_alpha': 0.007, 'bootstrap': False} Evaluation --> roc_auc_ovr: 0.9315 Best roc_auc_ovr: 0.9937 Time iteration: 0.405s Total time: 1.823s Results for Random Forest: Bayesian Optimization --------------------------- Best parameters --> {'n_estimators': 400, 'max_depth': 4, 'max_features': 0.8, 'criterion': 'gini', 'min_samples_split': 20, 'min_samples_leaf': 12, 'ccp_alpha': 0.016, 'bootstrap': True, 'max_samples': 0.7} Best evaluation --> roc_auc_ovr: 0.9937 Time elapsed: 2.056s Fitting ----------------------------------------- Score on the train set --> roc_auc_ovr: 0.9997 Score on the test set --> roc_auc_ovr: 0.9825 Time elapsed: 0.588s Bagging ----------------------------------------- Score --> roc_auc_ovr: 0.9737 \u00b1 0.0116 Time elapsed: 
2.716s ------------------------------------------------- Total time: 5.363s Final results ========================= >> Duration: 22.125s ------------------------------------------ Logistic Regression --> roc_auc_ovr: 0.999 \u00b1 0.001 Linear Discriminant Analysis --> roc_auc_ovr: 1.000 \u00b1 0.000 ! Random Forest --> roc_auc_ovr: 0.974 \u00b1 0.012","title":"Run the pipeline"},{"location":"examples/multiclass_classification/multiclass_classification/#analyze-the-results","text":"# We can access the pipeline's results via the results attribute atom.results .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } name metric_bo time_bo metric_train metric_test time_fit mean_bagging std_bagging time_bagging time model LR Logistic Regression 1.000000 16.151s 1.000000 0.998834 0.020s 0.999068 0.000872 0.072s 16.249s LDA Linear Discriminant Analysis 1.000000 0.474s 1.000000 1.000000 0.010s 0.999767 0.000466 0.024s 0.510s RF Random Forest 0.993712 2.056s 0.999725 0.982517 0.588s 0.973686 0.011577 2.716s 5.363s # Show the scoring for a different metric than the one we trained on atom.scoring('precision_macro') Results ===================== >> Logistic Regression --> precision_macro: 1.0 Linear Discriminant Analysis --> precision_macro: 0.976 Random Forest --> precision_macro: 0.9 Let's have a closer look at the Random Forest # Get the results on some other metrics print('Jaccard score:', atom.rf.scoring('jaccard_weighted')) print('Recall score:', atom.rf.scoring('recall_macro')) Jaccard score: 0.7957142857142857 Recall score: 0.8974358974358975 # Plot the confusion matrix atom.RF.plot_confusion_matrix(figsize=(9, 9)) # Save the estimator as a pickle file atom.RF.save_estimator('Random_Forest_model') Random Forest estimator saved successfully!","title":"Analyze the results"},{"location":"examples/regression/regression/","text":"Regression This example shows how to use ATOM to 
apply PCA on the data and run a regression pipeline. Download the abalone dataset from https://archive.ics.uci.edu/ml/datasets/Abalone . The goal of this dataset is to predict the rings (age) of abalone shells from physical measurements. Load the data # Import packages import pandas as pd from atom import ATOMRegressor # Load the abalone dataset X = pd.read_csv('./datasets/abalone.csv') # Let's have a look at the data X.head() .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } Sex Length Diameter Height Whole weight Shucked weight Viscera weight Shell weight Rings 0 M 0.455 0.365 0.095 0.5140 0.2245 0.1010 0.150 15 1 M 0.350 0.265 0.090 0.2255 0.0995 0.0485 0.070 7 2 F 0.530 0.420 0.135 0.6770 0.2565 0.1415 0.210 9 3 M 0.440 0.365 0.125 0.5160 0.2155 0.1140 0.155 10 4 I 0.330 0.255 0.080 0.2050 0.0895 0.0395 0.055 7 # Initialize ATOM for regression tasks and encode the categorical features atom = ATOMRegressor(X, y=\"Rings\", verbose=2, random_state=42) atom.encode() << ================== ATOM ================== >> Algorithm task: regression. Applying data cleaning... Dataset stats ================= >> Shape: (4177, 9) Categorical columns: 1 Scaled: False ---------------------------------- Train set size: 3342 Test set size: 835 Fitting Encoder... Encoding categorical columns... --> OneHot-encoding feature Sex. Contains 3 unique categories. is_categorical is deprecated and will be removed in a future version. Use is_categorical_dtype instead # Plot the dataset's correlation matrix atom.plot_correlation() # Apply PCA for dimensionality reduction atom.feature_selection(strategy=\"pca\", n_features=6) Fitting FeatureSelector... Performing feature selection ... --> Applying Principal Component Analysis... >>> Scaling features... 
>>> Total explained variance: 0.976 # Use the plotting methods to see the retained variance ratio atom.plot_pca() atom.plot_components(figsize=(8, 6), filename='atom_PCA_plot') Run the pipeline atom.run(['Tree', 'Bag', 'ET'], metric='MSE', n_calls=5, n_initial_points=2, bo_params={'base_estimator': 'GBRT', 'cv': 1}, bagging=5) Running pipeline ============================= >> Models in pipeline: Tree, Bag, ET Metric: neg_mean_squared_error Running BO for Decision Tree... Random start 1 ---------------------------------- Parameters --> {'criterion': 'mae', 'splitter': 'random', 'max_depth': 5, 'max_features': 0.9, 'min_samples_split': 8, 'min_samples_leaf': 19, 'ccp_alpha': 0.003} Evaluation --> neg_mean_squared_error: -7.8759 Best neg_mean_squared_error: -7.8759 Time iteration: 0.043s Total time: 0.048s Random start 2 ---------------------------------- Parameters --> {'criterion': 'mae', 'splitter': 'best', 'max_depth': 10, 'max_features': 0.9, 'min_samples_split': 3, 'min_samples_leaf': 12, 'ccp_alpha': 0.033} Evaluation --> neg_mean_squared_error: -9.1854 Best neg_mean_squared_error: -7.8759 Time iteration: 0.181s Total time: 0.233s Iteration 3 ------------------------------------- Parameters --> {'criterion': 'friedman_mse', 'splitter': 'random', 'max_depth': 7, 'max_features': 0.6, 'min_samples_split': 17, 'min_samples_leaf': 19, 'ccp_alpha': 0.015} Evaluation --> neg_mean_squared_error: -8.2130 Best neg_mean_squared_error: -7.8759 Time iteration: 0.007s Total time: 0.428s Iteration 4 ------------------------------------- Parameters --> {'criterion': 'friedman_mse', 'splitter': 'best', 'max_depth': 4, 'max_features': 0.9, 'min_samples_split': 3, 'min_samples_leaf': 12, 'ccp_alpha': 0.006} Evaluation --> neg_mean_squared_error: -6.7540 Best neg_mean_squared_error: -6.7540 Time iteration: 0.010s Total time: 0.533s Iteration 5 ------------------------------------- Parameters --> {'criterion': 'mae', 'splitter': 'best', 'max_depth': 3, 'max_features': 0.9, 
'min_samples_split': 7, 'min_samples_leaf': 6, 'ccp_alpha': 0.007} Evaluation --> neg_mean_squared_error: -7.2855 Best neg_mean_squared_error: -6.7540 Time iteration: 0.132s Total time: 0.757s Results for Decision Tree: Bayesian Optimization --------------------------- Best parameters --> {'criterion': 'friedman_mse', 'splitter': 'best', 'max_depth': 4, 'max_features': 0.9, 'min_samples_split': 3, 'min_samples_leaf': 12, 'ccp_alpha': 0.006} Best evaluation --> neg_mean_squared_error: -6.7540 Time elapsed: 0.855s Fitting ----------------------------------------- Score on the train set --> neg_mean_squared_error: -6.3636 Score on the test set --> neg_mean_squared_error: -5.4433 Time elapsed: 0.011s Bagging ----------------------------------------- Score --> neg_mean_squared_error: -5.5541 \u00b1 0.1150 Time elapsed: 0.039s ------------------------------------------------- Total time: 0.910s Running BO for Bagging Regressor... Random start 1 ---------------------------------- Parameters --> {'n_estimators': 112, 'max_samples': 0.9, 'max_features': 0.6, 'bootstrap': False, 'bootstrap_features': False} Evaluation --> neg_mean_squared_error: -5.7680 Best neg_mean_squared_error: -5.7680 Time iteration: 0.877s Total time: 0.881s Random start 2 ---------------------------------- Parameters --> {'n_estimators': 131, 'max_samples': 0.5, 'max_features': 0.5, 'bootstrap': False, 'bootstrap_features': False} Evaluation --> neg_mean_squared_error: -6.8254 Best neg_mean_squared_error: -5.7680 Time iteration: 0.585s Total time: 1.471s Iteration 3 ------------------------------------- Parameters --> {'n_estimators': 50, 'max_samples': 0.9, 'max_features': 0.6, 'bootstrap': False, 'bootstrap_features': True} Evaluation --> neg_mean_squared_error: -5.4895 Best neg_mean_squared_error: -5.4895 Time iteration: 0.389s Total time: 1.953s Iteration 4 ------------------------------------- Parameters --> {'n_estimators': 74, 'max_samples': 0.5, 'max_features': 0.5, 'bootstrap': False, 
'bootstrap_features': True} Evaluation --> neg_mean_squared_error: -6.0363 Best neg_mean_squared_error: -5.4895 Time iteration: 0.330s Total time: 2.381s Iteration 5 ------------------------------------- Parameters --> {'n_estimators': 36, 'max_samples': 0.9, 'max_features': 0.6, 'bootstrap': True, 'bootstrap_features': False} Evaluation --> neg_mean_squared_error: -6.0037 Best neg_mean_squared_error: -5.4895 Time iteration: 0.194s Total time: 2.668s Results for Bagging Regressor: Bayesian Optimization --------------------------- Best parameters --> {'n_estimators': 50, 'max_samples': 0.9, 'max_features': 0.6, 'bootstrap': False, 'bootstrap_features': True} Best evaluation --> neg_mean_squared_error: -5.4895 Time elapsed: 2.764s Fitting ----------------------------------------- Score on the train set --> neg_mean_squared_error: -0.0867 Score on the test set --> neg_mean_squared_error: -4.9533 Time elapsed: 0.571s Bagging ----------------------------------------- Score --> neg_mean_squared_error: -5.2363 \u00b1 0.1099 Time elapsed: 2.325s ------------------------------------------------- Total time: 5.662s Running BO for Extra-Trees... 
Random start 1 ---------------------------------- Parameters --> {'n_estimators': 112, 'max_depth': 6, 'max_features': 1.0, 'criterion': 'mae', 'min_samples_split': 8, 'min_samples_leaf': 19, 'ccp_alpha': 0.003, 'bootstrap': True, 'max_samples': 0.6} Evaluation --> neg_mean_squared_error: -7.1995 Best neg_mean_squared_error: -7.1995 Time iteration: 1.034s Total time: 1.040s Random start 2 ---------------------------------- Parameters --> {'n_estimators': 369, 'max_depth': 10, 'max_features': 0.8, 'criterion': 'mse', 'min_samples_split': 13, 'min_samples_leaf': 6, 'ccp_alpha': 0.0, 'bootstrap': False} Evaluation --> neg_mean_squared_error: -6.9525 Best neg_mean_squared_error: -6.9525 Time iteration: 0.495s Total time: 1.538s Iteration 3 ------------------------------------- Parameters --> {'n_estimators': 481, 'max_depth': 10, 'max_features': 0.8, 'criterion': 'mse', 'min_samples_split': 7, 'min_samples_leaf': 2, 'ccp_alpha': 0.001, 'bootstrap': False} Evaluation --> neg_mean_squared_error: -5.0279 Best neg_mean_squared_error: -5.0279 Time iteration: 0.744s Total time: 2.388s Iteration 4 ------------------------------------- Parameters --> {'n_estimators': 460, 'max_depth': 5, 'max_features': 1.0, 'criterion': 'mae', 'min_samples_split': 5, 'min_samples_leaf': 4, 'ccp_alpha': 0.034, 'bootstrap': True, 'max_samples': 0.6} Evaluation --> neg_mean_squared_error: -7.3319 Best neg_mean_squared_error: -5.0279 Time iteration: 5.020s Total time: 7.517s Iteration 5 ------------------------------------- Parameters --> {'n_estimators': 474, 'max_depth': 4, 'max_features': 0.8, 'criterion': 'mae', 'min_samples_split': 20, 'min_samples_leaf': 1, 'ccp_alpha': 0.018, 'bootstrap': True, 'max_samples': 0.6} Evaluation --> neg_mean_squared_error: -7.5183 Best neg_mean_squared_error: -5.0279 Time iteration: 4.067s Total time: 11.690s Results for Extra-Trees: Bayesian Optimization --------------------------- Best parameters --> {'n_estimators': 481, 'max_depth': 10, 'max_features': 
0.8, 'criterion': 'mse', 'min_samples_split': 7, 'min_samples_leaf': 2, 'ccp_alpha': 0.001, 'bootstrap': False} Best evaluation --> neg_mean_squared_error: -5.0279 Time elapsed: 11.801s Fitting ----------------------------------------- Score on the train set --> neg_mean_squared_error: -4.5366 Score on the test set --> neg_mean_squared_error: -4.4905 Time elapsed: 0.968s Bagging ----------------------------------------- Score --> neg_mean_squared_error: -4.5803 \u00b1 0.0691 Time elapsed: 4.259s ------------------------------------------------- Total time: 17.032s Final results ========================= >> Duration: 23.606s ------------------------------------------ Decision Tree --> neg_mean_squared_error: -5.554 \u00b1 0.115 ~ Bagging Regressor --> neg_mean_squared_error: -5.236 \u00b1 0.110 ~ Extra-Trees --> neg_mean_squared_error: -4.580 \u00b1 0.069 ~ ! Analyze the results # For regression tasks, use the errors or residuals plots to check the model performances atom.plot_residuals() # Use the partial dependence plot to analyze the relation between the target response and the features atom.n_jobs = 8 # The method can be slow... atom.ET.plot_partial_dependence(features=(0, 1, (2, 3)), figsize=(12, 8))","title":"Regression"},{"location":"examples/regression/regression/#regression","text":"This example shows how to use ATOM to apply PCA on the data and run a regression pipeline. Download the abalone dataset from https://archive.ics.uci.edu/ml/datasets/Abalone . 
The goal of this dataset is to predict the rings (age) of abalone shells from physical measurements.","title":"Regression"},{"location":"examples/regression/regression/#load-the-data","text":"# Import packages import pandas as pd from atom import ATOMRegressor # Load the abalone dataset X = pd.read_csv('./datasets/abalone.csv') # Let's have a look at the data X.head() .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } Sex Length Diameter Height Whole weight Shucked weight Viscera weight Shell weight Rings 0 M 0.455 0.365 0.095 0.5140 0.2245 0.1010 0.150 15 1 M 0.350 0.265 0.090 0.2255 0.0995 0.0485 0.070 7 2 F 0.530 0.420 0.135 0.6770 0.2565 0.1415 0.210 9 3 M 0.440 0.365 0.125 0.5160 0.2155 0.1140 0.155 10 4 I 0.330 0.255 0.080 0.2050 0.0895 0.0395 0.055 7 # Initialize ATOM for regression tasks and encode the categorical features atom = ATOMRegressor(X, y=\"Rings\", verbose=2, random_state=42) atom.encode() << ================== ATOM ================== >> Algorithm task: regression. Applying data cleaning... Dataset stats ================= >> Shape: (4177, 9) Categorical columns: 1 Scaled: False ---------------------------------- Train set size: 3342 Test set size: 835 Fitting Encoder... Encoding categorical columns... --> OneHot-encoding feature Sex. Contains 3 unique categories. is_categorical is deprecated and will be removed in a future version. Use is_categorical_dtype instead # Plot the dataset's correlation matrix atom.plot_correlation() # Apply PCA for dimensionality reduction atom.feature_selection(strategy=\"pca\", n_features=6) Fitting FeatureSelector... Performing feature selection ... --> Applying Principal Component Analysis... >>> Scaling features... 
>>> Total explained variance: 0.976 # Use the plotting methods to see the retained variance ratio atom.plot_pca() atom.plot_components(figsize=(8, 6), filename='atom_PCA_plot')","title":"Load the data"},{"location":"examples/regression/regression/#run-the-pipeline","text":"atom.run(['Tree', 'Bag', 'ET'], metric='MSE', n_calls=5, n_initial_points=2, bo_params={'base_estimator': 'GBRT', 'cv': 1}, bagging=5) Running pipeline ============================= >> Models in pipeline: Tree, Bag, ET Metric: neg_mean_squared_error Running BO for Decision Tree... Random start 1 ---------------------------------- Parameters --> {'criterion': 'mae', 'splitter': 'random', 'max_depth': 5, 'max_features': 0.9, 'min_samples_split': 8, 'min_samples_leaf': 19, 'ccp_alpha': 0.003} Evaluation --> neg_mean_squared_error: -7.8759 Best neg_mean_squared_error: -7.8759 Time iteration: 0.043s Total time: 0.048s Random start 2 ---------------------------------- Parameters --> {'criterion': 'mae', 'splitter': 'best', 'max_depth': 10, 'max_features': 0.9, 'min_samples_split': 3, 'min_samples_leaf': 12, 'ccp_alpha': 0.033} Evaluation --> neg_mean_squared_error: -9.1854 Best neg_mean_squared_error: -7.8759 Time iteration: 0.181s Total time: 0.233s Iteration 3 ------------------------------------- Parameters --> {'criterion': 'friedman_mse', 'splitter': 'random', 'max_depth': 7, 'max_features': 0.6, 'min_samples_split': 17, 'min_samples_leaf': 19, 'ccp_alpha': 0.015} Evaluation --> neg_mean_squared_error: -8.2130 Best neg_mean_squared_error: -7.8759 Time iteration: 0.007s Total time: 0.428s Iteration 4 ------------------------------------- Parameters --> {'criterion': 'friedman_mse', 'splitter': 'best', 'max_depth': 4, 'max_features': 0.9, 'min_samples_split': 3, 'min_samples_leaf': 12, 'ccp_alpha': 0.006} Evaluation --> neg_mean_squared_error: -6.7540 Best neg_mean_squared_error: -6.7540 Time iteration: 0.010s Total time: 0.533s Iteration 5 ------------------------------------- Parameters --> 
{'criterion': 'mae', 'splitter': 'best', 'max_depth': 3, 'max_features': 0.9, 'min_samples_split': 7, 'min_samples_leaf': 6, 'ccp_alpha': 0.007} Evaluation --> neg_mean_squared_error: -7.2855 Best neg_mean_squared_error: -6.7540 Time iteration: 0.132s Total time: 0.757s Results for Decision Tree: Bayesian Optimization --------------------------- Best parameters --> {'criterion': 'friedman_mse', 'splitter': 'best', 'max_depth': 4, 'max_features': 0.9, 'min_samples_split': 3, 'min_samples_leaf': 12, 'ccp_alpha': 0.006} Best evaluation --> neg_mean_squared_error: -6.7540 Time elapsed: 0.855s Fitting ----------------------------------------- Score on the train set --> neg_mean_squared_error: -6.3636 Score on the test set --> neg_mean_squared_error: -5.4433 Time elapsed: 0.011s Bagging ----------------------------------------- Score --> neg_mean_squared_error: -5.5541 \u00b1 0.1150 Time elapsed: 0.039s ------------------------------------------------- Total time: 0.910s Running BO for Bagging Regressor... 
Random start 1 ---------------------------------- Parameters --> {'n_estimators': 112, 'max_samples': 0.9, 'max_features': 0.6, 'bootstrap': False, 'bootstrap_features': False} Evaluation --> neg_mean_squared_error: -5.7680 Best neg_mean_squared_error: -5.7680 Time iteration: 0.877s Total time: 0.881s Random start 2 ---------------------------------- Parameters --> {'n_estimators': 131, 'max_samples': 0.5, 'max_features': 0.5, 'bootstrap': False, 'bootstrap_features': False} Evaluation --> neg_mean_squared_error: -6.8254 Best neg_mean_squared_error: -5.7680 Time iteration: 0.585s Total time: 1.471s Iteration 3 ------------------------------------- Parameters --> {'n_estimators': 50, 'max_samples': 0.9, 'max_features': 0.6, 'bootstrap': False, 'bootstrap_features': True} Evaluation --> neg_mean_squared_error: -5.4895 Best neg_mean_squared_error: -5.4895 Time iteration: 0.389s Total time: 1.953s Iteration 4 ------------------------------------- Parameters --> {'n_estimators': 74, 'max_samples': 0.5, 'max_features': 0.5, 'bootstrap': False, 'bootstrap_features': True} Evaluation --> neg_mean_squared_error: -6.0363 Best neg_mean_squared_error: -5.4895 Time iteration: 0.330s Total time: 2.381s Iteration 5 ------------------------------------- Parameters --> {'n_estimators': 36, 'max_samples': 0.9, 'max_features': 0.6, 'bootstrap': True, 'bootstrap_features': False} Evaluation --> neg_mean_squared_error: -6.0037 Best neg_mean_squared_error: -5.4895 Time iteration: 0.194s Total time: 2.668s Results for Bagging Regressor: Bayesian Optimization --------------------------- Best parameters --> {'n_estimators': 50, 'max_samples': 0.9, 'max_features': 0.6, 'bootstrap': False, 'bootstrap_features': True} Best evaluation --> neg_mean_squared_error: -5.4895 Time elapsed: 2.764s Fitting ----------------------------------------- Score on the train set --> neg_mean_squared_error: -0.0867 Score on the test set --> neg_mean_squared_error: -4.9533 Time elapsed: 0.571s Bagging 
----------------------------------------- Score --> neg_mean_squared_error: -5.2363 \u00b1 0.1099 Time elapsed: 2.325s ------------------------------------------------- Total time: 5.662s Running BO for Extra-Trees... Random start 1 ---------------------------------- Parameters --> {'n_estimators': 112, 'max_depth': 6, 'max_features': 1.0, 'criterion': 'mae', 'min_samples_split': 8, 'min_samples_leaf': 19, 'ccp_alpha': 0.003, 'bootstrap': True, 'max_samples': 0.6} Evaluation --> neg_mean_squared_error: -7.1995 Best neg_mean_squared_error: -7.1995 Time iteration: 1.034s Total time: 1.040s Random start 2 ---------------------------------- Parameters --> {'n_estimators': 369, 'max_depth': 10, 'max_features': 0.8, 'criterion': 'mse', 'min_samples_split': 13, 'min_samples_leaf': 6, 'ccp_alpha': 0.0, 'bootstrap': False} Evaluation --> neg_mean_squared_error: -6.9525 Best neg_mean_squared_error: -6.9525 Time iteration: 0.495s Total time: 1.538s Iteration 3 ------------------------------------- Parameters --> {'n_estimators': 481, 'max_depth': 10, 'max_features': 0.8, 'criterion': 'mse', 'min_samples_split': 7, 'min_samples_leaf': 2, 'ccp_alpha': 0.001, 'bootstrap': False} Evaluation --> neg_mean_squared_error: -5.0279 Best neg_mean_squared_error: -5.0279 Time iteration: 0.744s Total time: 2.388s Iteration 4 ------------------------------------- Parameters --> {'n_estimators': 460, 'max_depth': 5, 'max_features': 1.0, 'criterion': 'mae', 'min_samples_split': 5, 'min_samples_leaf': 4, 'ccp_alpha': 0.034, 'bootstrap': True, 'max_samples': 0.6} Evaluation --> neg_mean_squared_error: -7.3319 Best neg_mean_squared_error: -5.0279 Time iteration: 5.020s Total time: 7.517s Iteration 5 ------------------------------------- Parameters --> {'n_estimators': 474, 'max_depth': 4, 'max_features': 0.8, 'criterion': 'mae', 'min_samples_split': 20, 'min_samples_leaf': 1, 'ccp_alpha': 0.018, 'bootstrap': True, 'max_samples': 0.6} Evaluation --> neg_mean_squared_error: -7.5183 Best 
neg_mean_squared_error: -5.0279 Time iteration: 4.067s Total time: 11.690s Results for Extra-Trees: Bayesian Optimization --------------------------- Best parameters --> {'n_estimators': 481, 'max_depth': 10, 'max_features': 0.8, 'criterion': 'mse', 'min_samples_split': 7, 'min_samples_leaf': 2, 'ccp_alpha': 0.001, 'bootstrap': False} Best evaluation --> neg_mean_squared_error: -5.0279 Time elapsed: 11.801s Fitting ----------------------------------------- Score on the train set --> neg_mean_squared_error: -4.5366 Score on the test set --> neg_mean_squared_error: -4.4905 Time elapsed: 0.968s Bagging ----------------------------------------- Score --> neg_mean_squared_error: -4.5803 \u00b1 0.0691 Time elapsed: 4.259s ------------------------------------------------- Total time: 17.032s Final results ========================= >> Duration: 23.606s ------------------------------------------ Decision Tree --> neg_mean_squared_error: -5.554 \u00b1 0.115 ~ Bagging Regressor --> neg_mean_squared_error: -5.236 \u00b1 0.110 ~ Extra-Trees --> neg_mean_squared_error: -4.580 \u00b1 0.069 ~ !","title":"Run the pipeline"},{"location":"examples/regression/regression/#analyze-the-results","text":"# For regression tasks, use the errors or residuals plots to check the model performances atom.plot_residuals() # Use the partial dependence plot to analyze the relation between the target response and the features atom.n_jobs = 8 # The method can be slow... atom.ET.plot_partial_dependence(features=(0, 1, (2, 3)), figsize=(12, 8))","title":"Analyze the results"},{"location":"examples/successive_halving/successive_halving/","text":"Successive halving This example shows how to compare multiple tree-based models using successive halving. Import the boston dataset from sklearn.datasets . This is a small and easy to train dataset whose goal is to predict house prices. 
Load the data # Import packages from sklearn.datasets import load_boston from atom import ATOMRegressor # Load the dataset's features and targets X, y = load_boston(return_X_y=True) Run the pipeline atom = ATOMRegressor(X, y, verbose=1, random_state=1) << ================== ATOM ================== >> Algorithm task: regression. Applying data cleaning... Dataset stats ================= >> Shape: (506, 14) Scaled: False ---------------------------------- Train set size: 405 Test set size: 101 # We can compare tree-based models via successive halving atom.successive_halving(['tree', 'bag', 'et', 'rf', 'lgb', 'catb'], metric='mae', bagging=5) Running pipeline ============================= >> Metric: neg_mean_absolute_error Run 0 (17% of set) ============================>> Models in pipeline: Tree, Bag, ET, RF, LGB, CatB Size of training set: 67 Size of test set: 101 Results for Decision Tree: Fitting ----------------------------------------- Score on the train set --> neg_mean_absolute_error: -0.0000 Score on the test set --> neg_mean_absolute_error: -3.3257 Time elapsed: 0.007s Bagging ----------------------------------------- Score --> neg_mean_absolute_error: -4.3307 \u00b1 0.5250 Time elapsed: 0.018s ------------------------------------------------- Total time: 0.027s Results for Bagging Regressor: Fitting ----------------------------------------- Score on the train set --> neg_mean_absolute_error: -1.3054 Score on the test set --> neg_mean_absolute_error: -2.6950 Time elapsed: 0.018s Bagging ----------------------------------------- Score --> neg_mean_absolute_error: -3.0957 \u00b1 0.2677 Time elapsed: 0.079s ------------------------------------------------- Total time: 0.100s Results for Extra-Trees: Fitting ----------------------------------------- Score on the train set --> neg_mean_absolute_error: -0.0000 Score on the test set --> neg_mean_absolute_error: -2.1541 Time elapsed: 0.084s Bagging ----------------------------------------- Score --> 
neg_mean_absolute_error: -2.5554 \u00b1 0.1708 Time elapsed: 0.357s ------------------------------------------------- Total time: 0.443s Results for Random Forest: Fitting ----------------------------------------- Score on the train set --> neg_mean_absolute_error: -1.1509 Score on the test set --> neg_mean_absolute_error: -2.4143 Time elapsed: 0.109s Bagging ----------------------------------------- Score --> neg_mean_absolute_error: -2.9574 \u00b1 0.2253 Time elapsed: 0.509s ------------------------------------------------- Total time: 0.621s Results for LightGBM: Fitting ----------------------------------------- Score on the train set --> neg_mean_absolute_error: -3.4205 Score on the test set --> neg_mean_absolute_error: -4.5600 Time elapsed: 0.027s Bagging ----------------------------------------- Score --> neg_mean_absolute_error: -4.8393 \u00b1 0.2682 Time elapsed: 0.060s ------------------------------------------------- Total time: 0.091s Results for CatBoost: Fitting ----------------------------------------- Score on the train set --> neg_mean_absolute_error: -0.0806 Score on the test set --> neg_mean_absolute_error: -2.3984 Time elapsed: 0.846s Bagging ----------------------------------------- Score --> neg_mean_absolute_error: -2.9165 \u00b1 0.2564 Time elapsed: 2.764s ------------------------------------------------- Total time: 3.611s Final results ========================= >> Duration: 4.894s ------------------------------------------ Decision Tree --> neg_mean_absolute_error: -4.331 \u00b1 0.525 ~ Bagging Regressor --> neg_mean_absolute_error: -3.096 \u00b1 0.268 ~ Extra-Trees --> neg_mean_absolute_error: -2.555 \u00b1 0.171 ~ ! 
Random Forest --> neg_mean_absolute_error: -2.957 \u00b1 0.225 ~ LightGBM --> neg_mean_absolute_error: -4.839 \u00b1 0.268 ~ CatBoost --> neg_mean_absolute_error: -2.916 \u00b1 0.256 ~ Run 1 (33% of set) ============================>> Models in pipeline: ET, CatB, RF Size of training set: 135 Size of test set: 101 Results for Extra-Trees: Fitting ----------------------------------------- Score on the train set --> neg_mean_absolute_error: -0.0000 Score on the test set --> neg_mean_absolute_error: -2.2361 Time elapsed: 0.098s Bagging ----------------------------------------- Score --> neg_mean_absolute_error: -2.6016 \u00b1 0.2890 Time elapsed: 0.414s ------------------------------------------------- Total time: 0.514s Results for CatBoost: Fitting ----------------------------------------- Score on the train set --> neg_mean_absolute_error: -0.2835 Score on the test set --> neg_mean_absolute_error: -2.4196 Time elapsed: 0.815s Bagging ----------------------------------------- Score --> neg_mean_absolute_error: -2.5681 \u00b1 0.2119 Time elapsed: 3.124s ------------------------------------------------- Total time: 3.942s Results for Random Forest: Fitting ----------------------------------------- Score on the train set --> neg_mean_absolute_error: -0.9820 Score on the test set --> neg_mean_absolute_error: -2.5055 Time elapsed: 0.129s Bagging ----------------------------------------- Score --> neg_mean_absolute_error: -2.6144 \u00b1 0.1188 Time elapsed: 0.590s ------------------------------------------------- Total time: 0.721s Final results ========================= >> Duration: 5.178s ------------------------------------------ Extra-Trees --> neg_mean_absolute_error: -2.602 \u00b1 0.289 ~ CatBoost --> neg_mean_absolute_error: -2.568 \u00b1 0.212 ~ ! 
Random Forest --> neg_mean_absolute_error: -2.614 \u00b1 0.119 ~ Run 2 (100% of set) ===========================>> Models in pipeline: CatB Size of training set: 405 Size of test set: 101 Results for CatBoost: Fitting ----------------------------------------- Score on the train set --> neg_mean_absolute_error: -0.3978 Score on the test set --> neg_mean_absolute_error: -1.8772 Time elapsed: 1.207s Bagging ----------------------------------------- Score --> neg_mean_absolute_error: -2.0501 \u00b1 0.0892 Time elapsed: 5.234s ------------------------------------------------- Total time: 6.444s Final results ========================= >> Duration: 6.445s ------------------------------------------ CatBoost --> neg_mean_absolute_error: -2.050 \u00b1 0.089 ~ Analyze results # Note that the results dataframe now is multi-index atom.results .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } name metric_train metric_test time_fit mean_bagging std_bagging time_bagging time run model 0 Tree Decision Tree -0.000000e+00 -3.325743 0.007s -4.330693 0.525026 0.018s 0.027s Bag Bagging Regressor -1.305373e+00 -2.695050 0.018s -3.095663 0.267668 0.079s 0.100s ET Extra-Trees -2.256238e-14 -2.154089 0.084s -2.555434 0.170823 0.357s 0.443s RF Random Forest -1.150866e+00 -2.414297 0.109s -2.957400 0.225311 0.509s 0.621s LGB LightGBM -3.420518e+00 -4.559962 0.027s -4.839315 0.268167 0.060s 0.091s CatB CatBoost -8.055503e-02 -2.398431 0.846s -2.916470 0.256428 2.764s 3.611s 1 ET Extra-Trees -2.315185e-14 -2.236079 0.098s -2.601648 0.289034 0.414s 0.514s CatB CatBoost -2.835499e-01 -2.419625 0.815s -2.568085 0.211868 3.124s 3.942s RF Random Forest -9.819778e-01 -2.505465 0.129s -2.614416 0.118758 0.590s 0.721s 2 CatB CatBoost -3.977985e-01 -1.877205 1.207s -2.050118 0.089185 5.234s 6.444s # Plot the successive halving's results atom.plot_successive_halving()","title":"Successive 
halving"},{"location":"examples/successive_halving/successive_halving/#successive-halving","text":"This example shows how to compare multiple tree-based models using successive halving. Import the boston dataset from sklearn.datasets . This is a small and easy to train dataset whose goal is to predict house prices.","title":"Successive halving"},{"location":"examples/successive_halving/successive_halving/#load-the-data","text":"# Import packages from sklearn.datasets import load_boston from atom import ATOMRegressor # Load the dataset's features and targets X, y = load_boston(return_X_y=True)","title":"Load the data"},{"location":"examples/successive_halving/successive_halving/#run-the-pipeline","text":"atom = ATOMRegressor(X, y, verbose=1, random_state=1) << ================== ATOM ================== >> Algorithm task: regression. Applying data cleaning... Dataset stats ================= >> Shape: (506, 14) Scaled: False ---------------------------------- Train set size: 405 Test set size: 101 # We can compare tree-based models via successive halving atom.successive_halving(['tree', 'bag', 'et', 'rf', 'lgb', 'catb'], metric='mae', bagging=5) Running pipeline ============================= >> Metric: neg_mean_absolute_error Run 0 (17% of set) ============================>> Models in pipeline: Tree, Bag, ET, RF, LGB, CatB Size of training set: 67 Size of test set: 101 Results for Decision Tree: Fitting ----------------------------------------- Score on the train set --> neg_mean_absolute_error: -0.0000 Score on the test set --> neg_mean_absolute_error: -3.3257 Time elapsed: 0.007s Bagging ----------------------------------------- Score --> neg_mean_absolute_error: -4.3307 \u00b1 0.5250 Time elapsed: 0.018s ------------------------------------------------- Total time: 0.027s Results for Bagging Regressor: Fitting ----------------------------------------- Score on the train set --> neg_mean_absolute_error: -1.3054 Score on the test set --> neg_mean_absolute_error: 
-2.6950 Time elapsed: 0.018s Bagging ----------------------------------------- Score --> neg_mean_absolute_error: -3.0957 \u00b1 0.2677 Time elapsed: 0.079s ------------------------------------------------- Total time: 0.100s Results for Extra-Trees: Fitting ----------------------------------------- Score on the train set --> neg_mean_absolute_error: -0.0000 Score on the test set --> neg_mean_absolute_error: -2.1541 Time elapsed: 0.084s Bagging ----------------------------------------- Score --> neg_mean_absolute_error: -2.5554 \u00b1 0.1708 Time elapsed: 0.357s ------------------------------------------------- Total time: 0.443s Results for Random Forest: Fitting ----------------------------------------- Score on the train set --> neg_mean_absolute_error: -1.1509 Score on the test set --> neg_mean_absolute_error: -2.4143 Time elapsed: 0.109s Bagging ----------------------------------------- Score --> neg_mean_absolute_error: -2.9574 \u00b1 0.2253 Time elapsed: 0.509s ------------------------------------------------- Total time: 0.621s Results for LightGBM: Fitting ----------------------------------------- Score on the train set --> neg_mean_absolute_error: -3.4205 Score on the test set --> neg_mean_absolute_error: -4.5600 Time elapsed: 0.027s Bagging ----------------------------------------- Score --> neg_mean_absolute_error: -4.8393 \u00b1 0.2682 Time elapsed: 0.060s ------------------------------------------------- Total time: 0.091s Results for CatBoost: Fitting ----------------------------------------- Score on the train set --> neg_mean_absolute_error: -0.0806 Score on the test set --> neg_mean_absolute_error: -2.3984 Time elapsed: 0.846s Bagging ----------------------------------------- Score --> neg_mean_absolute_error: -2.9165 \u00b1 0.2564 Time elapsed: 2.764s ------------------------------------------------- Total time: 3.611s Final results ========================= >> Duration: 4.894s ------------------------------------------ Decision Tree --> 
neg_mean_absolute_error: -4.331 \u00b1 0.525 ~ Bagging Regressor --> neg_mean_absolute_error: -3.096 \u00b1 0.268 ~ Extra-Trees --> neg_mean_absolute_error: -2.555 \u00b1 0.171 ~ ! Random Forest --> neg_mean_absolute_error: -2.957 \u00b1 0.225 ~ LightGBM --> neg_mean_absolute_error: -4.839 \u00b1 0.268 ~ CatBoost --> neg_mean_absolute_error: -2.916 \u00b1 0.256 ~ Run 1 (33% of set) ============================>> Models in pipeline: ET, CatB, RF Size of training set: 135 Size of test set: 101 Results for Extra-Trees: Fitting ----------------------------------------- Score on the train set --> neg_mean_absolute_error: -0.0000 Score on the test set --> neg_mean_absolute_error: -2.2361 Time elapsed: 0.098s Bagging ----------------------------------------- Score --> neg_mean_absolute_error: -2.6016 \u00b1 0.2890 Time elapsed: 0.414s ------------------------------------------------- Total time: 0.514s Results for CatBoost: Fitting ----------------------------------------- Score on the train set --> neg_mean_absolute_error: -0.2835 Score on the test set --> neg_mean_absolute_error: -2.4196 Time elapsed: 0.815s Bagging ----------------------------------------- Score --> neg_mean_absolute_error: -2.5681 \u00b1 0.2119 Time elapsed: 3.124s ------------------------------------------------- Total time: 3.942s Results for Random Forest: Fitting ----------------------------------------- Score on the train set --> neg_mean_absolute_error: -0.9820 Score on the test set --> neg_mean_absolute_error: -2.5055 Time elapsed: 0.129s Bagging ----------------------------------------- Score --> neg_mean_absolute_error: -2.6144 \u00b1 0.1188 Time elapsed: 0.590s ------------------------------------------------- Total time: 0.721s Final results ========================= >> Duration: 5.178s ------------------------------------------ Extra-Trees --> neg_mean_absolute_error: -2.602 \u00b1 0.289 ~ CatBoost --> neg_mean_absolute_error: -2.568 \u00b1 0.212 ~ ! 
Random Forest --> neg_mean_absolute_error: -2.614 \u00b1 0.119 ~ Run 2 (100% of set) ===========================>> Models in pipeline: CatB Size of training set: 405 Size of test set: 101 Results for CatBoost: Fitting ----------------------------------------- Score on the train set --> neg_mean_absolute_error: -0.3978 Score on the test set --> neg_mean_absolute_error: -1.8772 Time elapsed: 1.207s Bagging ----------------------------------------- Score --> neg_mean_absolute_error: -2.0501 \u00b1 0.0892 Time elapsed: 5.234s ------------------------------------------------- Total time: 6.444s Final results ========================= >> Duration: 6.445s ------------------------------------------ CatBoost --> neg_mean_absolute_error: -2.050 \u00b1 0.089 ~","title":"Run the pipeline"},{"location":"examples/successive_halving/successive_halving/#analyze-results","text":"# Note that the results dataframe now is multi-index atom.results .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } name metric_train metric_test time_fit mean_bagging std_bagging time_bagging time run model 0 Tree Decision Tree -0.000000e+00 -3.325743 0.007s -4.330693 0.525026 0.018s 0.027s Bag Bagging Regressor -1.305373e+00 -2.695050 0.018s -3.095663 0.267668 0.079s 0.100s ET Extra-Trees -2.256238e-14 -2.154089 0.084s -2.555434 0.170823 0.357s 0.443s RF Random Forest -1.150866e+00 -2.414297 0.109s -2.957400 0.225311 0.509s 0.621s LGB LightGBM -3.420518e+00 -4.559962 0.027s -4.839315 0.268167 0.060s 0.091s CatB CatBoost -8.055503e-02 -2.398431 0.846s -2.916470 0.256428 2.764s 3.611s 1 ET Extra-Trees -2.315185e-14 -2.236079 0.098s -2.601648 0.289034 0.414s 0.514s CatB CatBoost -2.835499e-01 -2.419625 0.815s -2.568085 0.211868 3.124s 3.942s RF Random Forest -9.819778e-01 -2.505465 0.129s -2.614416 0.118758 0.590s 0.721s 2 CatB CatBoost -3.977985e-01 -1.877205 1.207s -2.050118 0.089185 5.234s 6.444s # 
Plot the successive halving's results atom.plot_successive_halving()","title":"Analyze results"},{"location":"examples/train_sizing/train_sizing/","text":"Train sizing This example shows how to asses a model's performance based on the size of the training set. The data used is a variation on the Australian weather dataset from https://www.kaggle.com/jsphyg/weather-dataset-rattle-package. The goal of this dataset is to predict whether or not it will rain tomorrow training a binay classifier on target RainTomorrow. Load the data # Import packages import numpy as np import pandas as pd from atom import ATOMClassifier # Load the Australian weather dataset X = pd.read_csv('./datasets/weatherAUS.csv') # Let's have a look at a subset of the data X.sample(frac=1).iloc[:5, :8] .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } Location MinTemp MaxTemp Rainfall Evaporation Sunshine WindGustDir WindGustSpeed 3118 BadgerysCreek 11.7 23.2 0.0 NaN NaN SW 28.0 18965 NorahHead 10.2 19.4 0.0 NaN NaN SSE 30.0 11196 CoffsHarbour 9.7 21.2 0.0 NaN NaN NW 26.0 62283 Sale 8.4 21.7 0.0 NaN NaN WSW 41.0 92461 Townsville 11.1 27.1 0.0 7.6 10.7 ENE 37.0 Run the pipeline # Initialize ATOM and prepare the data atom = ATOMClassifier(X, verbose=2, random_state=1) atom.impute(strat_num='median', strat_cat='most_frequent', min_frac_rows=0.8) atom.encode() << ================== ATOM ================== >> Algorithm task: binary classification. Applying data cleaning... 
Dataset stats ================= >> Shape: (142193, 22) Missing values: 292032 Categorical columns: 5 Scaled: False ---------------------------------- Size of training set: 113755 Size of test set: 28438 ---------------------------------- Class balance: No:Yes <==> 3.5:1.0 Instances in RainTomorrow per class: | | total | train_set | test_set | |:-------|---------:|-------------:|------------:| | 0: No | 110316 | 88263 | 22053 | | 1: Yes | 31877 | 25492 | 6385 | Fitting Imputer... Imputing missing values... --> Dropping 15182 rows for containing less than 80% non-missing values. --> Imputing 100 missing values with median in feature MinTemp. --> Imputing 57 missing values with median in feature MaxTemp. --> Imputing 640 missing values with median in feature Rainfall. --> Imputing 46535 missing values with median in feature Evaporation. --> Imputing 53034 missing values with median in feature Sunshine. --> Imputing 4381 missing values with most_frequent in feature WindGustDir. --> Imputing 4359 missing values with median in feature WindGustSpeed. --> Imputing 6624 missing values with most_frequent in feature WindDir9am. --> Imputing 612 missing values with most_frequent in feature WindDir3pm. --> Imputing 80 missing values with median in feature WindSpeed9am. --> Imputing 49 missing values with median in feature WindSpeed3pm. --> Imputing 532 missing values with median in feature Humidity9am. --> Imputing 1168 missing values with median in feature Humidity3pm. --> Imputing 1028 missing values with median in feature Pressure9am. --> Imputing 972 missing values with median in feature Pressure3pm. --> Imputing 42172 missing values with median in feature Cloud9am. --> Imputing 44251 missing values with median in feature Cloud3pm. --> Imputing 98 missing values with median in feature Temp9am. --> Imputing 702 missing values with median in feature Temp3pm. --> Imputing 640 missing values with most_frequent in feature RainToday. Fitting Encoder... 
Encoding categorical columns... --> Target-encoding feature Location. Contains 45 unique categories. --> Target-encoding feature WindGustDir. Contains 16 unique categories. --> Target-encoding feature WindDir9am. Contains 16 unique categories. --> Target-encoding feature WindDir3pm. Contains 16 unique categories. --> Label-encoding feature RainToday. Contains 2 unique categories. # We can analyze the impact of the training set's size on a LightGBM model atom.train_sizing('lgb', train_sizes=np.linspace(0.1, 1, 9), bagging=4) Running pipeline ============================= >> Models in pipeline: LGB Metric: f1 Run 0 (10% of set) ============================>> Size of training set: 11375 Size of test set: 28438 Results for LightGBM: Fitting ----------------------------------------- Score on the train set --> f1: 0.8029 Score on the test set --> f1: 0.6086 Time elapsed: 0.998s Bagging ----------------------------------------- Score --> f1: 0.5945 \u00b1 0.0073 Time elapsed: 2.229s ------------------------------------------------- Total time: 3.242s Final results ========================= >> Duration: 3.244s ------------------------------------------ LightGBM --> f1: 0.594 \u00b1 0.007 ~ Run 1 (21% of set) ============================>> Size of training set: 24172 Size of test set: 28438 Results for LightGBM: Fitting ----------------------------------------- Score on the train set --> f1: 0.7292 Score on the test set --> f1: 0.6273 Time elapsed: 1.244s Bagging ----------------------------------------- Score --> f1: 0.6166 \u00b1 0.0053 Time elapsed: 2.879s ------------------------------------------------- Total time: 4.129s Final results ========================= >> Duration: 4.131s ------------------------------------------ LightGBM --> f1: 0.617 \u00b1 0.005 Run 2 (32% of set) ============================>> Size of training set: 36970 Size of test set: 28438 Results for LightGBM: Fitting ----------------------------------------- Score on the train set --> f1: 0.6955 
Score on the test set --> f1: 0.6325 Time elapsed: 1.533s Bagging ----------------------------------------- Score --> f1: 0.6199 \u00b1 0.0038 Time elapsed: 3.502s ------------------------------------------------- Total time: 5.039s Final results ========================= >> Duration: 5.042s ------------------------------------------ LightGBM --> f1: 0.620 \u00b1 0.004 Run 3 (44% of set) ============================>> Size of training set: 49767 Size of test set: 28438 Results for LightGBM: Fitting ----------------------------------------- Score on the train set --> f1: 0.6832 Score on the test set --> f1: 0.6386 Time elapsed: 1.825s Bagging ----------------------------------------- Score --> f1: 0.6256 \u00b1 0.0036 Time elapsed: 4.148s ------------------------------------------------- Total time: 5.979s Final results ========================= >> Duration: 5.981s ------------------------------------------ LightGBM --> f1: 0.626 \u00b1 0.004 Run 4 (55% of set) ============================>> Size of training set: 62565 Size of test set: 28438 Results for LightGBM: Fitting ----------------------------------------- Score on the train set --> f1: 0.6818 Score on the test set --> f1: 0.6391 Time elapsed: 2.152s Bagging ----------------------------------------- Score --> f1: 0.6271 \u00b1 0.0025 Time elapsed: 4.838s ------------------------------------------------- Total time: 6.996s Final results ========================= >> Duration: 6.998s ------------------------------------------ LightGBM --> f1: 0.627 \u00b1 0.002 Run 5 (66% of set) ============================>> Size of training set: 75362 Size of test set: 28438 Results for LightGBM: Fitting ----------------------------------------- Score on the train set --> f1: 0.6767 Score on the test set --> f1: 0.6399 Time elapsed: 2.418s Bagging ----------------------------------------- Score --> f1: 0.6346 \u00b1 0.0021 Time elapsed: 5.622s ------------------------------------------------- Total time: 8.045s Final results 
========================= >> Duration: 8.047s ------------------------------------------ LightGBM --> f1: 0.635 \u00b1 0.002 Run 6 (77% of set) ============================>> Size of training set: 88160 Size of test set: 28438 Results for LightGBM: Fitting ----------------------------------------- Score on the train set --> f1: 0.6665 Score on the test set --> f1: 0.6384 Time elapsed: 2.810s Bagging ----------------------------------------- Score --> f1: 0.6342 \u00b1 0.0021 Time elapsed: 6.240s ------------------------------------------------- Total time: 9.058s Final results ========================= >> Duration: 9.060s ------------------------------------------ LightGBM --> f1: 0.634 \u00b1 0.002 Run 7 (89% of set) ============================>> Size of training set: 100957 Size of test set: 28438 Results for LightGBM: Fitting ----------------------------------------- Score on the train set --> f1: 0.6651 Score on the test set --> f1: 0.6432 Time elapsed: 3.063s Bagging ----------------------------------------- Score --> f1: 0.6372 \u00b1 0.0025 Time elapsed: 6.888s ------------------------------------------------- Total time: 9.958s Final results ========================= >> Duration: 9.960s ------------------------------------------ LightGBM --> f1: 0.637 \u00b1 0.003 Run 8 (100% of set) ===========================>> Size of training set: 113755 Size of test set: 28438 Results for LightGBM: Fitting ----------------------------------------- Score on the train set --> f1: 0.6650 Score on the test set --> f1: 0.6549 Time elapsed: 3.379s Bagging ----------------------------------------- Score --> f1: 0.6508 \u00b1 0.0026 Time elapsed: 7.621s ------------------------------------------------- Total time: 11.009s Final results ========================= >> Duration: 11.012s ------------------------------------------ LightGBM --> f1: 0.651 \u00b1 0.003 Analyze the results # Note that the results dataframe now is multi-index atom.results .dataframe tbody tr 
th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } name score_train score_test time_fit mean_bagging std_bagging time_bagging time run model 0 LGB LightGBM 0.802859 0.608590 0.998s 0.594472 0.007341 2.229s 3.242s 1 LGB LightGBM 0.729212 0.627277 1.244s 0.616583 0.005321 2.879s 4.129s 2 LGB LightGBM 0.695463 0.632544 1.533s 0.619899 0.003822 3.502s 5.039s 3 LGB LightGBM 0.683228 0.638575 1.825s 0.625589 0.003608 4.148s 5.979s 4 LGB LightGBM 0.681811 0.639062 2.152s 0.627105 0.002460 4.838s 6.996s 5 LGB LightGBM 0.676747 0.639897 2.418s 0.634642 0.002138 5.622s 8.045s 6 LGB LightGBM 0.666471 0.638376 2.810s 0.634245 0.002098 6.240s 9.058s 7 LGB LightGBM 0.665065 0.643197 3.063s 0.637232 0.002537 6.888s 9.958s 8 LGB LightGBM 0.665018 0.654904 3.379s 0.650772 0.002577 7.621s 11.009s # Plot the train sizing's results atom.plot_learning_curve()","title":"Train sizing"},{"location":"examples/train_sizing/train_sizing/#train-sizing","text":"This example shows how to asses a model's performance based on the size of the training set. The data used is a variation on the Australian weather dataset from https://www.kaggle.com/jsphyg/weather-dataset-rattle-package. 
The goal of this dataset is to predict whether or not it will rain tomorrow training a binay classifier on target RainTomorrow.","title":"Train sizing"},{"location":"examples/train_sizing/train_sizing/#load-the-data","text":"# Import packages import numpy as np import pandas as pd from atom import ATOMClassifier # Load the Australian weather dataset X = pd.read_csv('./datasets/weatherAUS.csv') # Let's have a look at a subset of the data X.sample(frac=1).iloc[:5, :8] .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } Location MinTemp MaxTemp Rainfall Evaporation Sunshine WindGustDir WindGustSpeed 3118 BadgerysCreek 11.7 23.2 0.0 NaN NaN SW 28.0 18965 NorahHead 10.2 19.4 0.0 NaN NaN SSE 30.0 11196 CoffsHarbour 9.7 21.2 0.0 NaN NaN NW 26.0 62283 Sale 8.4 21.7 0.0 NaN NaN WSW 41.0 92461 Townsville 11.1 27.1 0.0 7.6 10.7 ENE 37.0","title":"Load the data"},{"location":"examples/train_sizing/train_sizing/#run-the-pipeline","text":"# Initialize ATOM and prepare the data atom = ATOMClassifier(X, verbose=2, random_state=1) atom.impute(strat_num='median', strat_cat='most_frequent', min_frac_rows=0.8) atom.encode() << ================== ATOM ================== >> Algorithm task: binary classification. Applying data cleaning... Dataset stats ================= >> Shape: (142193, 22) Missing values: 292032 Categorical columns: 5 Scaled: False ---------------------------------- Size of training set: 113755 Size of test set: 28438 ---------------------------------- Class balance: No:Yes <==> 3.5:1.0 Instances in RainTomorrow per class: | | total | train_set | test_set | |:-------|---------:|-------------:|------------:| | 0: No | 110316 | 88263 | 22053 | | 1: Yes | 31877 | 25492 | 6385 | Fitting Imputer... Imputing missing values... --> Dropping 15182 rows for containing less than 80% non-missing values. --> Imputing 100 missing values with median in feature MinTemp. 
--> Imputing 57 missing values with median in feature MaxTemp. --> Imputing 640 missing values with median in feature Rainfall. --> Imputing 46535 missing values with median in feature Evaporation. --> Imputing 53034 missing values with median in feature Sunshine. --> Imputing 4381 missing values with most_frequent in feature WindGustDir. --> Imputing 4359 missing values with median in feature WindGustSpeed. --> Imputing 6624 missing values with most_frequent in feature WindDir9am. --> Imputing 612 missing values with most_frequent in feature WindDir3pm. --> Imputing 80 missing values with median in feature WindSpeed9am. --> Imputing 49 missing values with median in feature WindSpeed3pm. --> Imputing 532 missing values with median in feature Humidity9am. --> Imputing 1168 missing values with median in feature Humidity3pm. --> Imputing 1028 missing values with median in feature Pressure9am. --> Imputing 972 missing values with median in feature Pressure3pm. --> Imputing 42172 missing values with median in feature Cloud9am. --> Imputing 44251 missing values with median in feature Cloud3pm. --> Imputing 98 missing values with median in feature Temp9am. --> Imputing 702 missing values with median in feature Temp3pm. --> Imputing 640 missing values with most_frequent in feature RainToday. Fitting Encoder... Encoding categorical columns... --> Target-encoding feature Location. Contains 45 unique categories. --> Target-encoding feature WindGustDir. Contains 16 unique categories. --> Target-encoding feature WindDir9am. Contains 16 unique categories. --> Target-encoding feature WindDir3pm. Contains 16 unique categories. --> Label-encoding feature RainToday. Contains 2 unique categories. 
# We can analyze the impact of the training set's size on a LightGBM model atom.train_sizing('lgb', train_sizes=np.linspace(0.1, 1, 9), bagging=4) Running pipeline ============================= >> Models in pipeline: LGB Metric: f1 Run 0 (10% of set) ============================>> Size of training set: 11375 Size of test set: 28438 Results for LightGBM: Fitting ----------------------------------------- Score on the train set --> f1: 0.8029 Score on the test set --> f1: 0.6086 Time elapsed: 0.998s Bagging ----------------------------------------- Score --> f1: 0.5945 \u00b1 0.0073 Time elapsed: 2.229s ------------------------------------------------- Total time: 3.242s Final results ========================= >> Duration: 3.244s ------------------------------------------ LightGBM --> f1: 0.594 \u00b1 0.007 ~ Run 1 (21% of set) ============================>> Size of training set: 24172 Size of test set: 28438 Results for LightGBM: Fitting ----------------------------------------- Score on the train set --> f1: 0.7292 Score on the test set --> f1: 0.6273 Time elapsed: 1.244s Bagging ----------------------------------------- Score --> f1: 0.6166 \u00b1 0.0053 Time elapsed: 2.879s ------------------------------------------------- Total time: 4.129s Final results ========================= >> Duration: 4.131s ------------------------------------------ LightGBM --> f1: 0.617 \u00b1 0.005 Run 2 (32% of set) ============================>> Size of training set: 36970 Size of test set: 28438 Results for LightGBM: Fitting ----------------------------------------- Score on the train set --> f1: 0.6955 Score on the test set --> f1: 0.6325 Time elapsed: 1.533s Bagging ----------------------------------------- Score --> f1: 0.6199 \u00b1 0.0038 Time elapsed: 3.502s ------------------------------------------------- Total time: 5.039s Final results ========================= >> Duration: 5.042s ------------------------------------------ LightGBM --> f1: 0.620 \u00b1 0.004 Run 3 (44% of 
set) ============================>> Size of training set: 49767 Size of test set: 28438 Results for LightGBM: Fitting ----------------------------------------- Score on the train set --> f1: 0.6832 Score on the test set --> f1: 0.6386 Time elapsed: 1.825s Bagging ----------------------------------------- Score --> f1: 0.6256 \u00b1 0.0036 Time elapsed: 4.148s ------------------------------------------------- Total time: 5.979s Final results ========================= >> Duration: 5.981s ------------------------------------------ LightGBM --> f1: 0.626 \u00b1 0.004 Run 4 (55% of set) ============================>> Size of training set: 62565 Size of test set: 28438 Results for LightGBM: Fitting ----------------------------------------- Score on the train set --> f1: 0.6818 Score on the test set --> f1: 0.6391 Time elapsed: 2.152s Bagging ----------------------------------------- Score --> f1: 0.6271 \u00b1 0.0025 Time elapsed: 4.838s ------------------------------------------------- Total time: 6.996s Final results ========================= >> Duration: 6.998s ------------------------------------------ LightGBM --> f1: 0.627 \u00b1 0.002 Run 5 (66% of set) ============================>> Size of training set: 75362 Size of test set: 28438 Results for LightGBM: Fitting ----------------------------------------- Score on the train set --> f1: 0.6767 Score on the test set --> f1: 0.6399 Time elapsed: 2.418s Bagging ----------------------------------------- Score --> f1: 0.6346 \u00b1 0.0021 Time elapsed: 5.622s ------------------------------------------------- Total time: 8.045s Final results ========================= >> Duration: 8.047s ------------------------------------------ LightGBM --> f1: 0.635 \u00b1 0.002 Run 6 (77% of set) ============================>> Size of training set: 88160 Size of test set: 28438 Results for LightGBM: Fitting ----------------------------------------- Score on the train set --> f1: 0.6665 Score on the test set --> f1: 0.6384 Time 
elapsed: 2.810s Bagging ----------------------------------------- Score --> f1: 0.6342 \u00b1 0.0021 Time elapsed: 6.240s ------------------------------------------------- Total time: 9.058s Final results ========================= >> Duration: 9.060s ------------------------------------------ LightGBM --> f1: 0.634 \u00b1 0.002 Run 7 (89% of set) ============================>> Size of training set: 100957 Size of test set: 28438 Results for LightGBM: Fitting ----------------------------------------- Score on the train set --> f1: 0.6651 Score on the test set --> f1: 0.6432 Time elapsed: 3.063s Bagging ----------------------------------------- Score --> f1: 0.6372 \u00b1 0.0025 Time elapsed: 6.888s ------------------------------------------------- Total time: 9.958s Final results ========================= >> Duration: 9.960s ------------------------------------------ LightGBM --> f1: 0.637 \u00b1 0.003 Run 8 (100% of set) ===========================>> Size of training set: 113755 Size of test set: 28438 Results for LightGBM: Fitting ----------------------------------------- Score on the train set --> f1: 0.6650 Score on the test set --> f1: 0.6549 Time elapsed: 3.379s Bagging ----------------------------------------- Score --> f1: 0.6508 \u00b1 0.0026 Time elapsed: 7.621s ------------------------------------------------- Total time: 11.009s Final results ========================= >> Duration: 11.012s ------------------------------------------ LightGBM --> f1: 0.651 \u00b1 0.003","title":"Run the pipeline"},{"location":"examples/train_sizing/train_sizing/#analyze-the-results","text":"# Note that the results dataframe now is multi-index atom.results .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } name score_train score_test time_fit mean_bagging std_bagging time_bagging time run model 0 LGB LightGBM 0.802859 0.608590 0.998s 0.594472 0.007341 2.229s 3.242s 1 LGB 
LightGBM 0.729212 0.627277 1.244s 0.616583 0.005321 2.879s 4.129s 2 LGB LightGBM 0.695463 0.632544 1.533s 0.619899 0.003822 3.502s 5.039s 3 LGB LightGBM 0.683228 0.638575 1.825s 0.625589 0.003608 4.148s 5.979s 4 LGB LightGBM 0.681811 0.639062 2.152s 0.627105 0.002460 4.838s 6.996s 5 LGB LightGBM 0.676747 0.639897 2.418s 0.634642 0.002138 5.622s 8.045s 6 LGB LightGBM 0.666471 0.638376 2.810s 0.634245 0.002098 6.240s 9.058s 7 LGB LightGBM 0.665065 0.643197 3.063s 0.637232 0.002537 6.888s 9.958s 8 LGB LightGBM 0.665018 0.654904 3.379s 0.650772 0.002577 7.621s 11.009s # Plot the train sizing's results atom.plot_learning_curve()","title":"Analyze the results"}]} \ No newline at end of file diff --git a/docs/sitemap.xml.gz b/docs/sitemap.xml.gz index 13610a78fe1b57468c9bbb02bbf40c4d8f8a1f4f..a0ae2eee02fbb38c8ee95c9c3f768fdf9e5025c3 100644 GIT binary patch delta 15 WcmaFG`ihlJzMF%iQhX!Z6D9yCTraining and train_sizing methods respectively.

-

A couple of things to take into account:

+

Additional information:

  • If an exception is encountered while fitting an estimator, the pipeline will automatically skip the model and jump to the next model and save the exception diff --git a/docs_sources/user_guide.md b/docs_sources/user_guide.md index cb5cbb79a..01e2aadf3 100644 --- a/docs_sources/user_guide.md +++ b/docs_sources/user_guide.md @@ -445,7 +445,7 @@ The direct fashion repeats the aforementioned steps only once, while the other t respectively.
    -A couple of things to take into account: +Additional information: * If an exception is encountered while fitting an estimator, the pipeline will automatically skip the model and jump to the next model and save the exception