Merge pull request #138 from XpressAI/pycaret_regression_xai
📉 Pycaret Regression Components + Examples
MFA-X-AI authored Apr 5, 2022
2 parents 585ed17 + 4b6f159 commit 865911c
Showing 10 changed files with 13,715 additions and 7,159 deletions.
3,247 changes: 1,677 additions & 1,570 deletions examples/AutoMLBasicBinaryClassification.xircuits

3,437 changes: 1,756 additions & 1,681 deletions examples/AutoMLBasicMulticlassClassification.xircuits

2,565 changes: 2,565 additions & 0 deletions examples/AutoMLBasicRegression.xircuits

2,945 changes: 1,529 additions & 1,416 deletions examples/AutoMLClassificationBlendModels.xircuits

3,093 changes: 3,093 additions & 0 deletions examples/AutoMLRegressionStackModels.xircuits

6 changes: 3 additions & 3 deletions package-lock.json

4,692 changes: 2,335 additions & 2,357 deletions tsconfig.tsbuildinfo

184 changes: 52 additions & 132 deletions xai_components/xai_pycaret/classification.py
@@ -2,100 +2,14 @@
from IPython.utils import capture


"""
This component loads sample datasets from the git repository.
The list of available datasets can be checked using get_data('index').
"""
@xai_component
class GetData(Component):
dataset: InArg[str] #Index value of dataset.
save_copy: InArg[bool] #When set to true, it saves a copy in current working directory.
verbose: InArg[bool] #When set to False, head of data is not displayed.

out_dataset : OutArg[any] #Dataset

def __init__(self):

self.done = False
self.dataset = InArg(None)
self.save_copy = InArg(False)
self.verbose = InArg(True)

self.out_dataset = OutArg(None)

def execute(self, ctx) -> None:

from pycaret.datasets import get_data

dataset = self.dataset.value
save_copy = self.save_copy.value
verbose = self.verbose.value

if dataset is None:
dataset = "index"
print("Please choose a dataset...")

load_dataset = get_data(dataset = dataset, save_copy=save_copy, verbose = verbose)
print('Dataset shape: ' + str(load_dataset.shape))

self.out_dataset.value = load_dataset

self.done = True


"""
This component withholds a sample from the original dataset to be used for predictions.
This should not be confused with a train/test split, as this particular split
is performed to simulate a real-life scenario.
"""
@xai_component
class SampleTestData(Component):
in_dataset: InArg[any]
test_fraction: InArg[float] #Fraction of testing dataset size.
seed : InArg[int] #You can use random_state for reproducibility.

train_val_dataset : OutArg[any] #train/val dataset for training and evaluation
test_Dataset: OutArg[any] #test dataset for model prediction


def __init__(self):

self.done = False
self.in_dataset = InArg(None)
self.test_fraction = InArg(0)
self.seed = InArg(None)

self.train_val_dataset = OutArg(None)
self.test_Dataset = OutArg(None)

def execute(self, ctx) -> None:

in_dataset = self.in_dataset.value
test_fraction = self.test_fraction.value
seed = self.seed.value

if seed is None:
print("Set the seed value for reproducibility.")

train_val_dataset = in_dataset.sample(frac=1-test_fraction, random_state=seed)
test_Dataset = in_dataset.drop(train_val_dataset.index)

print('Data for Modeling: ' + str(train_val_dataset.shape))
print('Test Data For Predictions: ' + str(test_Dataset.shape))

self.train_val_dataset.value = train_val_dataset
self.test_Dataset.value = test_Dataset

self.done = True


"""
This component initializes the training environment and creates the transformation pipeline.
The Setup component must be called before executing any other component. It takes two mandatory
parameters: data and target. All other parameters are optional.
"""
@xai_component
class SetupEnvironment(Component):
@xai_component(color="blue")
class SetupClassification(Component):
in_dataset: InArg[any] #Shape (n_samples, n_features), where n_samples is the number of samples and n_features is the number of features
target: InArg[str] #Name of the target column to be passed in as a string. The target variable can be either binary or multiclass.
train_size_fraction : InArg[float] #Proportion of the dataset to be used for training and validation. Should be between 0.0 and 1.0.
@@ -106,6 +20,7 @@ class SetupEnvironment(Component):
multicollinearity_threshold:InArg[float] #Threshold for correlated features. Ignored when remove_multicollinearity is not True.
bin_numeric_features:InArg[any] #To convert numeric features into categorical, it takes a list of strings with the column names to be binned.
group_features:InArg[any] #When the dataset contains features with related characteristics, group_features parameter can be used for feature extraction. It takes a list of strings with column names that are related.
ignore_features:InArg[list] #ignore_features param can be used to ignore features during model training. It takes a list of strings with column names that are to be ignored.
seed : InArg[int] #You can use random_state for reproducibility.
log_experiment:InArg[bool] #logging setup and training
experiment_name:InArg[str] #Name of the experiment for logging.
@@ -124,6 +39,7 @@ def __init__(self):
self.multicollinearity_threshold = InArg(0.9)
self.bin_numeric_features = InArg(None)
self.group_features = InArg(None)
self.ignore_features = InArg(None)
self.seed = InArg(None)
self.log_experiment = InArg(False)
self.experiment_name = InArg('default')
@@ -143,6 +59,7 @@ def execute(self, ctx) -> None:
multicollinearity_threshold = self.multicollinearity_threshold.value
bin_numeric_features = self.bin_numeric_features.value
group_features = self.group_features.value
ignore_features = self.ignore_features.value
seed = self.seed.value
log_experiment = self.log_experiment.value
experiment_name = self.experiment_name.value
@@ -162,6 +79,7 @@ def execute(self, ctx) -> None:
multicollinearity_threshold = multicollinearity_threshold,
bin_numeric_features = bin_numeric_features,
group_features = group_features,
ignore_features = ignore_features,
session_id=seed,
log_experiment = log_experiment,
experiment_name = experiment_name,
@@ -177,9 +95,10 @@ def execute(self, ctx) -> None:
in the model library using cross-validation. The output of this component is
a score grid with average cross-validated scores.
'''
@xai_component
class CompareModels(Component):
@xai_component(color="firebrick")
class CompareModelsClassification(Component):
sort_by:InArg[str] #The sort order of the score grid.
exclude:InArg[list] #To omit certain models from training and evaluation, pass a list containing model IDs in the exclude parameter.
num_top:InArg[int] #Number of top_n models to return.

top_models:OutArg[any]
@@ -188,6 +107,7 @@ def __init__(self):

self.done = False
self.sort_by = InArg('Accuracy')
self.exclude = InArg(None)
self.num_top = InArg(1)

self.top_models = OutArg(None)
@@ -197,10 +117,11 @@ def execute(self, ctx) -> None:
from pycaret.classification import compare_models

sort_by = self.sort_by.value
exclude = self.exclude.value
num_top = self.num_top.value

with capture.capture_output() as captured:
best_model = compare_models(sort=sort_by,n_select = num_top)
best_model = compare_models(sort=sort_by,exclude = exclude,n_select = num_top)
captured.show()
print('Best '+str(num_top)+' Model:',best_model)

@@ -213,8 +134,8 @@ def execute(self, ctx) -> None:
using cross-validation. The output of this component is a score grid with
CV scores by fold.
'''
@xai_component
class CreateModel(Component):
@xai_component(color="orange")
class CreateModelClassification(Component):
model_id:InArg[str] #ID of an estimator available in model library or pass an untrained model object consistent with scikit-learn API
num_fold:InArg[int] #Controls cross-validation. If None, the CV generator in the fold_strategy parameter of the setup function is used.

@@ -246,11 +167,11 @@ def execute(self, ctx) -> None:


'''
This component tunes the hyperparameters of a given estimator. The output of this component is
This component tunes the hyperparameters of a given model. The output of this component is
a score grid with CV scores by fold of the best selected model based on optimize parameter.
'''
@xai_component
class TuneModel(Component):
@xai_component(color="salmon")
class TuneModelClassification(Component):
in_model:InArg[any] #Trained model object
optimize:InArg[str] #Metric name to be evaluated for hyperparameter tuning.
early_stopping_patience:InArg[int] #Maximum number of epochs to run for each sampled configuration.
@@ -313,10 +234,11 @@ def execute(self, ctx) -> None:
This component analyzes the performance of a trained model on the holdout set.
It may require re-training the model in certain cases.
'''
@xai_component
class PlotModel(Component):
@xai_component(color="springgreen")
class PlotModelClassification(Component):
in_model:InArg[any] #Trained model object
plot_type:InArg[str] #plot name
list_available_plots:InArg[bool] # list the available plots

out_model:OutArg[any]

@@ -325,20 +247,34 @@ def __init__(self):
self.done = False
self.in_model = InArg(None)
self.plot_type = InArg('auc')
self.list_available_plots=InArg(False)

self.out_model= OutArg(None)

def execute(self, ctx) -> None:

from pycaret.classification import plot_model


plot = {'auc' : 'Area Under the Curve','threshold' : 'Discrimination Threshold','pr' : 'Precision Recall Curve',
'confusion_matrix' : 'Confusion Matrix','error' : 'Class Prediction Error','class_report' : 'Classification Report',
'boundary' : 'Decision Boundary','rfe' : 'Recursive Feature Selection','learning' : 'Learning Curve',
'manifold' : 'Manifold Learning','calibration' : 'Calibration Curve','vc' : 'Validation Curve',
'dimension' : 'Dimension Learning','feature' : 'Feature Importance','feature_all' : 'Feature Importance (All)',
'parameter' : 'Model Hyperparameter','lift' : 'Lift Curve','gain' : 'Gain Chart','tree' : 'Decision Tree','ks' : 'KS Statistic Plot'}

in_model = self.in_model.value
plot_type = self.plot_type.value
list_available_plots = self.list_available_plots.value

with capture.capture_output() as captured:
plot_model = plot_model(in_model, plot = plot_type)
captured.show()

if list_available_plots is True:
print('List of available plots (plot Type - Plot Name):')
for key, value in plot.items():
print(key, ' - ', value)

self.out_model.value = in_model

self.done = True
@@ -347,8 +283,8 @@ def execute(self, ctx) -> None:
'''
This component trains a given estimator on the entire dataset including the holdout set.
'''
@xai_component
class FinalizeModel(Component):
@xai_component(color='crimson')
class FinalizeModelClassification(Component):
in_model:InArg[any] #Trained model object

out_finalize_model:OutArg[any] ##Trained model object
@@ -379,8 +315,8 @@ def execute(self, ctx) -> None:
This component predicts Label and Score (probability of predicted class) using a trained model.
When data is None, it predicts label and score on the holdout set.
'''
@xai_component
class PredictModel(Component):
@xai_component(color='darkviolet')
class PredictModelClassification(Component):
in_model:InArg[any] #Trained model object
predict_dataset:InArg[any] #Shape (n_samples, n_features). All features used during training must be available in the unseen dataset.

@@ -414,8 +350,8 @@ def execute(self, ctx) -> None:
This component saves the transformation pipeline and trained model object into the
current working directory as a pickle file for later use.
'''
@xai_component
class SaveModel(Component):
@xai_component(color='red')
class SaveModelClassification(Component):
in_model:InArg[any] #Trained model object
save_path:InArg[str] #Name and saving path of the model.
model_only:InArg[bool] #When set to True, only trained model object is saved instead of the entire pipeline.
@@ -443,8 +379,8 @@ def execute(self, ctx) -> None:
'''
This component loads a previously saved pipeline.
'''
@xai_component
class LoadModel(Component):
@xai_component(color='red')
class LoadModelClassification(Component):
model_path:InArg[str] #Name and path of the saved model

model:OutArg[any] #Trained model object
@@ -472,8 +408,8 @@ def execute(self, ctx) -> None:
'''
This component ensembles a given estimator. The output of this function is a score grid with CV scores by fold.
'''
@xai_component
class EnsembleModel(Component):
@xai_component(color='gold')
class EnsembleModelClassification(Component):
in_model:InArg[any] #Trained model object
method:InArg[str] #Method for ensembling base estimator. It can be ‘Bagging’ or ‘Boosting’.
choose_better:InArg[bool] #When set to True, the returned object is always better performing. The metric used for comparison is defined by the optimize parameter.
@@ -515,8 +451,8 @@ def execute(self, ctx) -> None:
'''
This component trains a Soft Voting / Majority Rule classifier for select models passed in the top_model list.
'''
@xai_component
class BlendModels(Component):
@xai_component(color='greenyellow')
class BlendModelsClassification(Component):
top_models:InArg[any] #List of trained model objects from CompareModel component
model_1:InArg[any] # first model to blend
model_2:InArg[any] # second model to blend
@@ -571,8 +507,8 @@ def execute(self, ctx) -> None:
This component trains a meta model over select estimators passed in the estimator_list parameter.
The output of this function is a score grid with CV scores by fold.
'''
@xai_component
class StackModels(Component):
@xai_component(color='lawngreen')
class StackModelsClassification(Component):
top_models:InArg[any] #List of trained model objects from CompareModel component
model_1:InArg[any] # first model to stack
model_2:InArg[any] # second model to stack
@@ -630,8 +566,8 @@ def execute(self, ctx) -> None:
This component calibrates the probability of a given estimator using isotonic or logistic regression.
The output of this function is a score grid with CV scores by fold.
'''
@xai_component
class CalibrateModel(Component):
@xai_component(color='steelblue')
class CalibrateModelClassification(Component):
in_model:InArg[any] #Trained model object
method:InArg[str] #The method to use for calibration. Can be ‘sigmoid’ which corresponds to Platt’s method or ‘isotonic’ which is a non-parametric approach.
calibrate_fold:InArg[int] #Controls internal cross-validation. Can be an integer or a scikit-learn CV generator.
@@ -664,29 +600,13 @@ def execute(self, ctx) -> None:

self.done = True

'''
This component logs all the trained models to MLflow; the UI can be accessed at localhost:5000.
'''
@xai_component
class Logging(Component):

def __init__(self):

self.done = False

def execute(self, ctx) -> None:
import subprocess
print("You can access the logs at localhost:5000")
subprocess.run("mlflow ui")

self.done = True

'''
This component returns the best model out of all trained models in the current session, based on the optimize parameter.
Metrics evaluated can be accessed using the get_metrics function.
'''
@xai_component
class AutoML(Component):
class AutoMLClassification(Component):
optimize:InArg[str] #Metric to use for model selection. It also accepts custom metrics added using the add_metric function.

best_model:OutArg[any] # best Trained Model object
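For context, the renamed components above (SetupClassification, CompareModelsClassification, PlotModelClassification, FinalizeModelClassification, PredictModelClassification, SaveModelClassification) are thin wrappers around the pycaret.classification calls already visible in the hunks. A minimal sketch of that underlying flow follows; the dataset name ('juice'), target column ('Purchase'), and the chosen parameter values are illustrative assumptions, not taken from this commit.

# Hypothetical end-to-end flow using the pycaret calls these components wrap.
# Dataset ('juice'), target ('Purchase'), and parameter values are assumptions.
from pycaret.datasets import get_data
from pycaret.classification import (
    setup, compare_models, plot_model, finalize_model, predict_model, save_model
)

df = get_data('juice')                                  # cf. GetData
train_val = df.sample(frac=0.9, random_state=123)       # cf. SampleTestData
unseen = df.drop(train_val.index)

setup(data=train_val, target='Purchase',                # cf. SetupClassification
      train_size=0.7, session_id=123)

best = compare_models(sort='Accuracy', n_select=1)      # cf. CompareModelsClassification
plot_model(best, plot='auc')                            # cf. PlotModelClassification
final = finalize_model(best)                            # cf. FinalizeModelClassification
preds = predict_model(final, data=unseen)               # cf. PredictModelClassification
save_model(final, 'final_classification_model',         # cf. SaveModelClassification
           model_only=False)

In the Xircuits graph, each of these calls runs inside the corresponding component's execute method, as the hunks above show.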
