Merge pull request #138 from XpressAI/pycaret_regression_xai
📉 Pycaret Regression Components + Examples
MFA-X-AI authored Apr 5, 2022
2 parents 585ed17 + 4b6f159 commit 865911c
Showing 10 changed files with 13,715 additions and 7,159 deletions.
3,247 changes: 1,677 additions & 1,570 deletions examples/AutoMLBasicBinaryClassification.xircuits

3,437 changes: 1,756 additions & 1,681 deletions examples/AutoMLBasicMulticlassClassification.xircuits

2,565 changes: 2,565 additions & 0 deletions examples/AutoMLBasicRegression.xircuits

2,945 changes: 1,529 additions & 1,416 deletions examples/AutoMLClassificationBlendModels.xircuits

3,093 changes: 3,093 additions & 0 deletions examples/AutoMLRegressionStackModels.xircuits

6 changes: 3 additions & 3 deletions package-lock.json

4,692 changes: 2,335 additions & 2,357 deletions tsconfig.tsbuildinfo

184 changes: 52 additions & 132 deletions xai_components/xai_pycaret/classification.py
@@ -2,100 +2,14 @@
from IPython.utils import capture


"""
This component loads sample datasets from the git repository.
The list of available datasets can be checked using get_data('index').
"""
@xai_component
class GetData(Component):
dataset: InArg[str] #Index value of dataset.
save_copy: InArg[bool] #When set to true, it saves a copy in current working directory.
verbose: InArg[bool] #When set to False, head of data is not displayed.

out_dataset : OutArg[any] #Dataset

def __init__(self):

self.done = False
self.dataset = InArg(None)
self.save_copy = InArg(False)
self.verbose = InArg(True)

self.out_dataset = OutArg(None)

def execute(self, ctx) -> None:

from pycaret.datasets import get_data

dataset = self.dataset.value
save_copy = self.save_copy.value
verbose = self.verbose.value

if dataset is None:
dataset = "index"
print("Please choose a dataset...")

load_dataset = get_data(dataset = dataset, save_copy=save_copy, verbose = verbose)
print('Dataset shape: ' + str(load_dataset.shape))

self.out_dataset.value = load_dataset

self.done = True


"""
This component withholds a sample from the original dataset to be used for predictions.
This should not be confused with a train/test split, as this particular split
is performed to simulate a real-life scenario.
"""
@xai_component
class SampleTestData(Component):
in_dataset: InArg[any]
test_fraction: InArg[float] #Fraction of testing dataset size.
seed : InArg[int] #You can use random_state for reproducibility.

train_val_dataset : OutArg[any] #train/val dataset for training and evaluation
test_Dataset: OutArg[any] #test dataset for model prediction


def __init__(self):

self.done = False
self.in_dataset = InArg(None)
self.test_fraction = InArg(0)
self.seed = InArg(None)

self.train_val_dataset = OutArg(None)
self.test_Dataset = OutArg(None)

def execute(self, ctx) -> None:

in_dataset = self.in_dataset.value
test_fraction = self.test_fraction.value
seed = self.seed.value

if seed is None:
print("Set the seed value for reproducibility.")

train_val_dataset = in_dataset.sample(frac=1-test_fraction, random_state=seed)
test_Dataset = in_dataset.drop(train_val_dataset.index)

print('Data for Modeling: ' + str(train_val_dataset.shape))
print('Test Data For Predictions: ' + str(test_Dataset.shape))

self.train_val_dataset.value = train_val_dataset
self.test_Dataset.value = test_Dataset

self.done = True


"""
This component initializes the training environment and creates the transformation pipeline.
The Setup component must be called before executing any other component. It takes two mandatory
parameters: data and target. All other parameters are optional.
"""
@xai_component
class SetupEnvironment(Component):
@xai_component(color="blue")
class SetupClassification(Component):
in_dataset: InArg[any] #Shape (n_samples, n_features), where n_samples is the number of samples and n_features is the number of features
target: InArg[str] #Name of the target column to be passed in as a string. The target variable can be either binary or multiclass.
train_size_fraction : InArg[float] #Proportion of the dataset to be used for training and validation. Should be between 0.0 and 1.0.
@@ -106,6 +20,7 @@ class SetupEnvironment(Component):
multicollinearity_threshold:InArg[float] #Threshold for correlated features. Ignored when remove_multicollinearity is not True.
bin_numeric_features:InArg[any] #To convert numeric features into categorical, it takes a list of strings with the column names to be binned.
group_features:InArg[any] #When the dataset contains features with related characteristics, group_features parameter can be used for feature extraction. It takes a list of strings with column names that are related.
ignore_features:InArg[list] #ignore_features param can be used to ignore features during model training. It takes a list of strings with column names that are to be ignored.
seed : InArg[int] #You can use random_state for reproducibility.
log_experiment:InArg[bool] #logging setup and training
experiment_name:InArg[str] #Name of the experiment for logging.
@@ -124,6 +39,7 @@ def __init__(self):
self.multicollinearity_threshold = InArg(0.9)
self.bin_numeric_features = InArg(None)
self.group_features = InArg(None)
self.ignore_features = InArg(None)
self.seed = InArg(None)
self.log_experiment = InArg(False)
self.experiment_name = InArg('default')
@@ -143,6 +59,7 @@ def execute(self, ctx) -> None:
multicollinearity_threshold = self.multicollinearity_threshold.value
bin_numeric_features = self.bin_numeric_features.value
group_features = self.group_features.value
ignore_features = self.ignore_features.value
seed = self.seed.value
log_experiment = self.log_experiment.value
experiment_name = self.experiment_name.value
@@ -162,6 +79,7 @@ def execute(self, ctx) -> None:
multicollinearity_threshold = multicollinearity_threshold,
bin_numeric_features = bin_numeric_features,
group_features = group_features,
ignore_features = ignore_features,
session_id=seed,
log_experiment = log_experiment,
experiment_name = experiment_name,
@@ -177,9 +95,10 @@ def execute(self, ctx) -> None:
in the model library using cross-validation. The output of this component is
a score grid with average cross-validated scores.
'''
@xai_component
class CompareModels(Component):
@xai_component(color="firebrick")
class CompareModelsClassification(Component):
sort_by:InArg[str] #The sort order of the score grid.
exclude:InArg[list] #To omit certain models from training and evaluation, pass a list containing model IDs in the exclude parameter.
num_top:InArg[int] #Number of top_n models to return.

top_models:OutArg[any]
@@ -188,6 +107,7 @@ def __init__(self):

self.done = False
self.sort_by = InArg('Accuracy')
self.exclude = InArg(None)
self.num_top = InArg(1)

self.top_models = OutArg(None)
@@ -197,10 +117,11 @@ def execute(self, ctx) -> None:
from pycaret.classification import compare_models

sort_by = self.sort_by.value
exclude = self.exclude.value
num_top = self.num_top.value

with capture.capture_output() as captured:
best_model = compare_models(sort=sort_by,n_select = num_top)
best_model = compare_models(sort=sort_by,exclude = exclude,n_select = num_top)
captured.show()
print('Best '+str(num_top)+' Model:',best_model)

@@ -213,8 +134,8 @@ def execute(self, ctx) -> None:
using cross-validation. The output of this component is a score grid with
CV scores by fold.
'''
@xai_component
class CreateModel(Component):
@xai_component(color="orange")
class CreateModelClassification(Component):
model_id:InArg[str] #ID of an estimator available in model library or pass an untrained model object consistent with scikit-learn API
num_fold:InArg[int] #Controls cross-validation. If None, the CV generator in the fold_strategy parameter of the setup function is used.

@@ -246,11 +167,11 @@ def execute(self, ctx) -> None:


'''
This component tunes the hyperparameters of a given estimator. The output of this component is
This component tunes the hyperparameters of a given model. The output of this component is
a score grid with CV scores by fold of the best selected model based on optimize parameter.
'''
@xai_component
class TuneModel(Component):
@xai_component(color="salmon")
class TuneModelClassification(Component):
in_model:InArg[any] #Trained model object
optimize:InArg[str] #Metric name to be evaluated for hyperparameter tuning.
early_stopping_patience:InArg[int] #Maximum number of epochs to run for each sampled configuration.
@@ -313,10 +234,11 @@ def execute(self, ctx) -> None:
This component analyzes the performance of a trained model on the holdout set.
It may require re-training the model in certain cases.
'''
@xai_component
class PlotModel(Component):
@xai_component(color="springgreen")
class PlotModelClassification(Component):
in_model:InArg[any] #Trained model object
plot_type:InArg[str] #plot name
list_available_plots:InArg[bool] # list the available plots

out_model:OutArg[any]

@@ -325,20 +247,34 @@ def __init__(self):
self.done = False
self.in_model = InArg(None)
self.plot_type = InArg('auc')
self.list_available_plots=InArg(False)

self.out_model= OutArg(None)

def execute(self, ctx) -> None:

from pycaret.classification import plot_model


plot = {'auc' : 'Area Under the Curve','threshold' : 'Discrimination Threshold','pr' : 'Precision Recall Curve',
'confusion_matrix' : 'Confusion Matrix','error' : 'Class Prediction Error','class_report' : 'Classification Report',
'boundary' : 'Decision Boundary','rfe' : 'Recursive Feature Selection','learning' : 'Learning Curve',
'manifold' : 'Manifold Learning','calibration' : 'Calibration Curve','vc' : 'Validation Curve',
'dimension' : 'Dimension Learning','feature' : 'Feature Importance','feature_all' : 'Feature Importance (All)',
'parameter' : 'Model Hyperparameter','lift' : 'Lift Curve','gain' : 'Gain Chart','tree' : 'Decision Tree','ks' : 'KS Statistic Plot'}

in_model = self.in_model.value
plot_type = self.plot_type.value
list_available_plots = self.list_available_plots.value

with capture.capture_output() as captured:
plot_model = plot_model(in_model, plot = plot_type)
captured.show()

if list_available_plots is True:
print('List of available plots (plot Type - Plot Name):')
for key, value in plot.items():
print(key, ' - ', value)

self.out_model.value = in_model

self.done = True
@@ -347,8 +283,8 @@ def execute(self, ctx) -> None:
'''
This component trains a given estimator on the entire dataset including the holdout set.
'''
@xai_component
class FinalizeModel(Component):
@xai_component(color='crimson')
class FinalizeModelClassification(Component):
in_model:InArg[any] #Trained model object

out_finalize_model:OutArg[any] ##Trained model object
@@ -379,8 +315,8 @@ def execute(self, ctx) -> None:
This component predicts Label and Score (probability of predicted class) using a trained model.
When data is None, it predicts label and score on the holdout set.
'''
@xai_component
class PredictModel(Component):
@xai_component(color='darkviolet')
class PredictModelClassification(Component):
in_model:InArg[any] #Trained model object
predict_dataset:InArg[any] #Shape (n_samples, n_features). All features used during training must be available in the unseen dataset.

@@ -414,8 +350,8 @@ def execute(self, ctx) -> None:
This component saves the transformation pipeline and trained model object into the
current working directory as a pickle file for later use.
'''
@xai_component
class SaveModel(Component):
@xai_component(color='red')
class SaveModelClassification(Component):
in_model:InArg[any] #Trained model object
save_path:InArg[str] #Name and saving path of the model.
model_only:InArg[bool] #When set to True, only trained model object is saved instead of the entire pipeline.
@@ -443,8 +379,8 @@ def execute(self, ctx) -> None:
'''
This component loads a previously saved pipeline.
'''
@xai_component
class LoadModel(Component):
@xai_component(color='red')
class LoadModelClassification(Component):
model_path:InArg[str] #Name and path of the saved model

model:OutArg[any] #Trained model object
@@ -472,8 +408,8 @@ def execute(self, ctx) -> None:
'''
This component ensembles a given estimator. The output of this function is a score grid with CV scores by fold.
'''
@xai_component
class EnsembleModel(Component):
@xai_component(color='gold')
class EnsembleModelClassification(Component):
in_model:InArg[any] #Trained model object
method:InArg[str] #Method for ensembling base estimator. It can be ‘Bagging’ or ‘Boosting’.
choose_better:InArg[bool] #When set to True, the returned object is always better performing. The metric used for comparison is defined by the optimize parameter.
@@ -515,8 +451,8 @@ def execute(self, ctx) -> None:
'''
This component trains a Soft Voting / Majority Rule classifier for select models passed in the top_model list.
'''
@xai_component
class BlendModels(Component):
@xai_component(color='greenyellow')
class BlendModelsClassification(Component):
top_models:InArg[any] #List of trained model objects from CompareModel component
model_1:InArg[any] # first model to blend
model_2:InArg[any] # second model to blend
@@ -571,8 +507,8 @@ def execute(self, ctx) -> None:
This component trains a meta model over select estimators passed in the estimator_list parameter.
The output of this function is a score grid with CV scores by fold.
'''
@xai_component
class StackModels(Component):
@xai_component(color='lawngreen')
class StackModelsClassification(Component):
top_models:InArg[any] #List of trained model objects from CompareModel component
model_1:InArg[any] # first model to stack
model_2:InArg[any] # second model to stack
@@ -630,8 +566,8 @@ def execute(self, ctx) -> None:
This component calibrates the probability of a given estimator using isotonic or logistic regression.
The output of this function is a score grid with CV scores by fold.
'''
@xai_component
class CalibrateModel(Component):
@xai_component(color='steelblue')
class CalibrateModelClassification(Component):
in_model:InArg[any] #Trained model object
method:InArg[str] #The method to use for calibration. Can be ‘sigmoid’ which corresponds to Platt’s method or ‘isotonic’ which is a non-parametric approach.
calibrate_fold:InArg[int] #Controls internal cross-validation. Can be an integer or a scikit-learn CV generator.
@@ -664,29 +600,13 @@ def execute(self, ctx) -> None:

self.done = True

'''
This component logs all the trained models to MLflow; the UI can be accessed at localhost:5000.
'''
@xai_component
class Logging(Component):

def __init__(self):

self.done = False

def execute(self, ctx) -> None:
import subprocess
print("You can access the logs at localhost:5000")
subprocess.run("mlflow ui")

self.done = True

'''
This component returns the best model out of all trained models in the current session, based on the optimize parameter.
Metrics evaluated can be accessed using the get_metrics function.
'''
@xai_component
class AutoML(Component):
class AutoMLClassification(Component):
optimize:InArg[str] #Metric to use for model selection. It also accepts custom metrics added using the add_metric function.

best_model:OutArg[any] # best Trained Model object
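For context, the renamed components above (SetupClassification, CompareModelsClassification, PlotModelClassification, FinalizeModelClassification, PredictModelClassification, SaveModelClassification) are thin wrappers around the pycaret.classification calls already visible in the hunks. A minimal sketch of that underlying flow follows; the dataset name ('juice'), target column ('Purchase'), and the chosen parameter values are illustrative assumptions, not taken from this commit.

# Hypothetical end-to-end flow using the pycaret calls these components wrap.
# Dataset ('juice'), target ('Purchase'), and parameter values are assumptions.
from pycaret.datasets import get_data
from pycaret.classification import (
    setup, compare_models, plot_model, finalize_model, predict_model, save_model
)

df = get_data('juice')                                  # cf. GetData
train_val = df.sample(frac=0.9, random_state=123)       # cf. SampleTestData
unseen = df.drop(train_val.index)

setup(data=train_val, target='Purchase',                # cf. SetupClassification
      train_size=0.7, session_id=123)

best = compare_models(sort='Accuracy', n_select=1)      # cf. CompareModelsClassification
plot_model(best, plot='auc')                            # cf. PlotModelClassification
final = finalize_model(best)                            # cf. FinalizeModelClassification
preds = predict_model(final, data=unseen)               # cf. PredictModelClassification
save_model(final, 'final_classification_model',         # cf. SaveModelClassification
           model_only=False)

In the Xircuits graph, each of these calls runs inside the corresponding component's execute method, as the hunks above show.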
