This repository has been archived by the owner on Nov 16, 2023. It is now read-only.
Enable EnsembleClassifier and EnsembleRegressor (#207)
* Enable EnsembleClassifier
* nit
* Enable EnsembleRegressor
* Add output combiners
* Add sub model selectors
* Update examples
* Add documentation for components
* Add diversity measure
* Improve examples
* Add tests
* Fix test_estimator_checks
1 parent 0458160, commit e257cf3. Showing 82 changed files with 4,853 additions and 21 deletions.
src/python/docs/docstrings/ClassifierBestPerformanceSelector.txt (38 additions)
"""

**Description**
    Combines only the models with the best performance.

:param metric_name: the metric type to be used to find the weights for
    each model. Can be ``"AccuracyMicro"``, ``"AccuracyMacro"``,
    ``"LogLoss"``, or ``"LogLossReduction"``.

.. seealso::
    :py:class:`EnsembleClassifier
    <nimbusml.ensemble.EnsembleClassifier>`

    * Submodel selectors:
      :py:class:`ClassifierAllSelector
      <nimbusml.ensemble.sub_model_selector.ClassifierAllSelector>`,
      :py:class:`ClassifierBestDiverseSelector
      <nimbusml.ensemble.sub_model_selector.ClassifierBestDiverseSelector>`

    * Output combiners:
      :py:class:`ClassifierAverage
      <nimbusml.ensemble.output_combiner.ClassifierAverage>`,
      :py:class:`ClassifierMedian
      <nimbusml.ensemble.output_combiner.ClassifierMedian>`,
      :py:class:`ClassifierStacking
      <nimbusml.ensemble.output_combiner.ClassifierStacking>`,
      :py:class:`ClassifierVoting
      <nimbusml.ensemble.output_combiner.ClassifierVoting>`

.. index:: models, ensemble, classification

Example:
    .. literalinclude:: /../nimbusml/examples/EnsembleClassifier.py
        :language: python
"""
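The selection rule this component describes, keeping only the best-scoring sub-models according to a chosen metric, can be sketched in plain Python. This is an illustrative sketch, not the nimbusml/ML.NET implementation; the function name, the `keep_fraction` parameter, and the toy scores are all assumptions:

```python
def select_best_models(models, metric_scores, keep_fraction=0.5,
                       higher_is_better=True):
    """Keep only the best-scoring fraction of the trained sub-models.

    models        -- list of trained model objects
    metric_scores -- one validation score per model (e.g. micro-accuracy)
    """
    # Rank model indices by score, best first.
    ranked = sorted(zip(metric_scores, range(len(models))),
                    reverse=higher_is_better)
    n_keep = max(1, int(len(models) * keep_fraction))
    kept_indices = sorted(idx for _, idx in ranked[:n_keep])
    return [models[i] for i in kept_indices]

# Toy usage: four "models" (just labels) with accuracy scores.
models = ["m0", "m1", "m2", "m3"]
scores = [0.61, 0.88, 0.54, 0.79]
print(select_best_models(models, scores))  # → ['m1', 'm3']
```

For a loss-style metric such as ``"LogLoss"`` (lower is better), the sketch would pass ``higher_is_better=False``.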
(new file, 61 additions)
"""

**Description**
    Computes the weighted average of the outputs of the trained models.

:param weightage_name: the metric type to be used to find the weights
    for each model. Can be ``"AccuracyMicroAvg"`` or
    ``"AccuracyMacroAvg"``.

:param normalize: Specifies the type of automatic normalization used:

    * ``"Auto"``: if normalization is needed, it is performed
      automatically. This is the default choice.
    * ``"No"``: no normalization is performed.
    * ``"Yes"``: normalization is performed.
    * ``"Warn"``: if normalization is needed, a warning message is
      displayed, but normalization is not performed.

    Normalization rescales disparate data ranges to a standard scale.
    Feature scaling ensures the distances between data points are
    proportional and enables various optimization methods such as
    gradient descent to converge much faster. If normalization is
    performed, a ``MinMax`` normalizer is used. It normalizes values in
    an interval [a, b] where ``-1 <= a <= 0`` and ``0 <= b <= 1`` and
    ``b - a = 1``. This normalizer preserves sparsity by mapping zero
    to zero.

.. seealso::
    :py:class:`EnsembleClassifier
    <nimbusml.ensemble.EnsembleClassifier>`

    * Submodel selectors:
      :py:class:`ClassifierAllSelector
      <nimbusml.ensemble.sub_model_selector.ClassifierAllSelector>`,
      :py:class:`ClassifierBestDiverseSelector
      <nimbusml.ensemble.sub_model_selector.ClassifierBestDiverseSelector>`,
      :py:class:`ClassifierBestPerformanceSelector
      <nimbusml.ensemble.sub_model_selector.ClassifierBestPerformanceSelector>`

    * Output combiners:
      :py:class:`ClassifierAverage
      <nimbusml.ensemble.output_combiner.ClassifierAverage>`,
      :py:class:`ClassifierMedian
      <nimbusml.ensemble.output_combiner.ClassifierMedian>`,
      :py:class:`ClassifierStacking
      <nimbusml.ensemble.output_combiner.ClassifierStacking>`,
      :py:class:`ClassifierVoting
      <nimbusml.ensemble.output_combiner.ClassifierVoting>`

.. index:: models, ensemble, classification

Example:
    .. literalinclude:: /../nimbusml/examples/EnsembleClassifier.py
        :language: python
"""
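The combination rule described above, a per-class average of model outputs weighted by a validation metric, can be sketched as follows. This is an illustrative stdlib-only sketch under assumed names, not the actual ML.NET combiner:

```python
def weighted_average_probs(per_model_probs, weights):
    """Combine per-class probability vectors from several models into one
    prediction via a weighted average. The weights would typically come
    from a validation metric such as micro-averaged accuracy."""
    total = sum(weights)
    n_classes = len(per_model_probs[0])
    combined = [0.0] * n_classes
    for probs, w in zip(per_model_probs, weights):
        for k, p in enumerate(probs):
            # Each model contributes proportionally to its weight.
            combined[k] += (w / total) * p
    return combined

# Two models scoring three classes; the second model is weighted higher,
# so the combined prediction leans toward its favored class 0.
probs = [[0.2, 0.5, 0.3],
         [0.6, 0.2, 0.2]]
print(weighted_average_probs(probs, weights=[1.0, 3.0]))
```

Because the weights are normalized to sum to one, the combined vector remains a valid probability distribution.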
(new file, 144 additions)
"""

**Description**
    Train a multi-class ensemble model.

.. remarks::
    An ensemble is a set of models, each trained on a sample of the
    training set. Training an ensemble instead of a single model can
    boost the accuracy of a given algorithm.

    The quality of an ensemble depends on two factors: accuracy and
    diversity. An ensemble is analogous to teamwork: if every team
    member is diverse and competent, then the team can perform very
    well. Here a team member is a base learner and the team is the
    ensemble. In the case of classification ensembles, the base learner
    is a ``LogisticRegressionClassifier``.

:param sampling_type: Specifies how the training samples are created:

    * ``BootstrapSelector``: takes a bootstrap sample of the training
      set (sampling with replacement). This is the default method.
    * ``RandomPartitionSelector``: randomly partitions the training set
      into subsets.
    * ``AllSelector``: every model is trained using the whole training
      set.

    Each of these subset selectors has two options for selecting
    features:

    * ``AllFeatureSelector``: selects all the features. This is the
      default method.
    * ``RandomFeatureSelector``: selects a random subset of the
      features for each model.

:param num_models: indicates the number of models to train, i.e. the
    number of subsets of the training set to sample. The default value
    is 50. If batches are used, this indicates the number of models per
    batch.

:param sub_model_selector_type: Determines the efficient set of models
    the ``output_combiner`` uses, and removes the least significant
    models. This is used to improve the accuracy and reduce the model
    size. This is also called pruning.

    * ``ClassifierAllSelector``: does not perform any pruning and
      selects all models in the ensemble to combine to create the
      output. This is the default submodel selector.
    * ``ClassifierBestDiverseSelector``: combines models whose
      predictions are as diverse as possible. Currently, only
      disagreement diversity is supported.
    * ``ClassifierBestPerformanceSelector``: combines only the models
      with the best performance according to some metric. The metric
      can be ``"AccuracyMicro"``, ``"AccuracyMacro"``, ``"LogLoss"``,
      or ``"LogLossReduction"``.

:param output_combiner: indicates how to combine the predictions of the
    different models into a single prediction. There are five available
    output combiners for classification:

    * ``ClassifierAverage``: computes the average of the scores
      produced by the trained models.
    * ``ClassifierMedian``: computes the median of the scores produced
      by the trained models.
    * ``ClassifierStacking``: computes the output by training a model
      on a training set where each instance is a vector containing the
      outputs of the different models on a training instance, and the
      instance's label.
    * ``ClassifierVoting``: computes the fraction of positive
      predictions for each class from all the trained models, and
      outputs the class with the largest number.
    * ``ClassifierWeightedAverage``: computes the weighted average of
      the outputs of the trained models, weighted by the specified
      metric. The metric can be ``"AccuracyMicroAvg"`` or
      ``"AccuracyMacroAvg"``.

:param normalize: Specifies the type of automatic normalization used:

    * ``"Auto"``: if normalization is needed, it is performed
      automatically. This is the default choice.
    * ``"No"``: no normalization is performed.
    * ``"Yes"``: normalization is performed.
    * ``"Warn"``: if normalization is needed, a warning message is
      displayed, but normalization is not performed.

    Normalization rescales disparate data ranges to a standard scale.
    Feature scaling ensures the distances between data points are
    proportional and enables various optimization methods such as
    gradient descent to converge much faster. If normalization is
    performed, a ``MinMax`` normalizer is used. It normalizes values in
    an interval [a, b] where ``-1 <= a <= 0`` and ``0 <= b <= 1`` and
    ``b - a = 1``. This normalizer preserves sparsity by mapping zero
    to zero.

:param batch_size: train the models iteratively on subsets of the
    training set of this size. When using this option, it is assumed
    that the training set is randomized enough so that every batch is a
    random sample of instances. The default value is -1, indicating
    using the whole training set. If the value is changed to an integer
    greater than 0, the number of trained models is the number of
    batches (the size of the training set divided by the batch size),
    times ``num_models``.

.. seealso::
    * Subset selectors:
      :py:class:`AllInstanceSelector
      <nimbusml.ensemble.subset_selector.AllInstanceSelector>`,
      :py:class:`BootstrapSelector
      <nimbusml.ensemble.subset_selector.BootstrapSelector>`,
      :py:class:`RandomPartitionSelector
      <nimbusml.ensemble.subset_selector.RandomPartitionSelector>`

    * Feature selectors:
      :py:class:`AllFeatureSelector
      <nimbusml.ensemble.feature_selector.AllFeatureSelector>`,
      :py:class:`RandomFeatureSelector
      <nimbusml.ensemble.feature_selector.RandomFeatureSelector>`

    * Submodel selectors:
      :py:class:`ClassifierAllSelector
      <nimbusml.ensemble.sub_model_selector.ClassifierAllSelector>`,
      :py:class:`ClassifierBestDiverseSelector
      <nimbusml.ensemble.sub_model_selector.ClassifierBestDiverseSelector>`,
      :py:class:`ClassifierBestPerformanceSelector
      <nimbusml.ensemble.sub_model_selector.ClassifierBestPerformanceSelector>`

    * Output combiners:
      :py:class:`ClassifierAverage
      <nimbusml.ensemble.output_combiner.ClassifierAverage>`,
      :py:class:`ClassifierMedian
      <nimbusml.ensemble.output_combiner.ClassifierMedian>`,
      :py:class:`ClassifierStacking
      <nimbusml.ensemble.output_combiner.ClassifierStacking>`,
      :py:class:`ClassifierVoting
      <nimbusml.ensemble.output_combiner.ClassifierVoting>`,
      :py:class:`ClassifierWeightedAverage
      <nimbusml.ensemble.output_combiner.ClassifierWeightedAverage>`

.. index:: models, ensemble, classification

Example:
    .. literalinclude:: /../nimbusml/examples/EnsembleClassifier.py
        :language: python
"""
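Of the output combiners listed above, voting is the easiest to illustrate: each trained model casts one hard prediction, and the class predicted most often wins. A minimal stdlib-only sketch, with hypothetical names rather than the nimbusml implementation:

```python
from collections import Counter

def vote(per_model_predictions):
    """Majority vote over hard class predictions: count how often each
    class was predicted across the trained models and return the class
    with the most votes (ties broken by first-seen order)."""
    counts = Counter(per_model_predictions)
    return counts.most_common(1)[0][0]

# Five models predicting one of three classes for a single instance.
print(vote(["cat", "dog", "cat", "bird", "cat"]))  # → cat
```

The real combiner works on the fraction of predictions per class rather than raw counts, but since every model casts exactly one vote per instance, the winning class is the same.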
(new file, 134 additions)
"""

**Description**
    Train a regression ensemble model.

.. remarks::
    An ensemble is a set of models, each trained on a sample of the
    training set. Training an ensemble instead of a single model can
    boost the accuracy of a given algorithm.

    The quality of an ensemble depends on two factors: accuracy and
    diversity. An ensemble is analogous to teamwork: if every team
    member is diverse and competent, then the team can perform very
    well. Here a team member is a base learner and the team is the
    ensemble. In the case of regression ensembles, the base learner is
    an ``OnlineGradientDescentRegressor``.

:param sampling_type: Specifies how the training samples are created:

    * ``BootstrapSelector``: takes a bootstrap sample of the training
      set (sampling with replacement). This is the default method.
    * ``RandomPartitionSelector``: randomly partitions the training set
      into subsets.
    * ``AllSelector``: every model is trained using the whole training
      set.

    Each of these subset selectors has two options for selecting
    features:

    * ``AllFeatureSelector``: selects all the features. This is the
      default method.
    * ``RandomFeatureSelector``: selects a random subset of the
      features for each model.

:param num_models: indicates the number of models to train, i.e. the
    number of subsets of the training set to sample. The default value
    is 50. If batches are used, this indicates the number of models per
    batch.

:param sub_model_selector_type: Determines the efficient set of models
    the ``output_combiner`` uses, and removes the least significant
    models. This is used to improve the accuracy and reduce the model
    size. This is also called pruning.

    * ``RegressorAllSelector``: does not perform any pruning and
      selects all models in the ensemble to combine to create the
      output. This is the default submodel selector.
    * ``RegressorBestDiverseSelector``: combines models whose
      predictions are as diverse as possible. Currently, only
      disagreement diversity is supported.
    * ``RegressorBestPerformanceSelector``: combines only the models
      with the best performance according to the specified metric. The
      metric can be ``"L1"``, ``"L2"``, ``"Rms"``, ``"Loss"``, or
      ``"RSquared"``.

:param output_combiner: indicates how to combine the predictions of the
    different models into a single prediction. There are three
    available output combiners for regression:

    * ``RegressorAverage``: computes the average of the scores produced
      by the trained models.
    * ``RegressorMedian``: computes the median of the scores produced
      by the trained models.
    * ``RegressorStacking``: computes the output by training a model on
      a training set where each instance is a vector containing the
      outputs of the different models on a training instance, and the
      instance's label.

:param normalize: Specifies the type of automatic normalization used:

    * ``"Auto"``: if normalization is needed, it is performed
      automatically. This is the default choice.
    * ``"No"``: no normalization is performed.
    * ``"Yes"``: normalization is performed.
    * ``"Warn"``: if normalization is needed, a warning message is
      displayed, but normalization is not performed.

    Normalization rescales disparate data ranges to a standard scale.
    Feature scaling ensures the distances between data points are
    proportional and enables various optimization methods such as
    gradient descent to converge much faster. If normalization is
    performed, a ``MinMax`` normalizer is used. It normalizes values in
    an interval [a, b] where ``-1 <= a <= 0`` and ``0 <= b <= 1`` and
    ``b - a = 1``. This normalizer preserves sparsity by mapping zero
    to zero.

:param batch_size: train the models iteratively on subsets of the
    training set of this size. When using this option, it is assumed
    that the training set is randomized enough so that every batch is a
    random sample of instances. The default value is -1, indicating
    using the whole training set. If the value is changed to an integer
    greater than 0, the number of trained models is the number of
    batches (the size of the training set divided by the batch size),
    times ``num_models``.

.. seealso::
    * Subset selectors:
      :py:class:`AllInstanceSelector
      <nimbusml.ensemble.subset_selector.AllInstanceSelector>`,
      :py:class:`BootstrapSelector
      <nimbusml.ensemble.subset_selector.BootstrapSelector>`,
      :py:class:`RandomPartitionSelector
      <nimbusml.ensemble.subset_selector.RandomPartitionSelector>`

    * Feature selectors:
      :py:class:`AllFeatureSelector
      <nimbusml.ensemble.feature_selector.AllFeatureSelector>`,
      :py:class:`RandomFeatureSelector
      <nimbusml.ensemble.feature_selector.RandomFeatureSelector>`

    * Submodel selectors:
      :py:class:`RegressorAllSelector
      <nimbusml.ensemble.sub_model_selector.RegressorAllSelector>`,
      :py:class:`RegressorBestDiverseSelector
      <nimbusml.ensemble.sub_model_selector.RegressorBestDiverseSelector>`,
      :py:class:`RegressorBestPerformanceSelector
      <nimbusml.ensemble.sub_model_selector.RegressorBestPerformanceSelector>`

    * Output combiners:
      :py:class:`RegressorAverage
      <nimbusml.ensemble.output_combiner.RegressorAverage>`,
      :py:class:`RegressorMedian
      <nimbusml.ensemble.output_combiner.RegressorMedian>`,
      :py:class:`RegressorStacking
      <nimbusml.ensemble.output_combiner.RegressorStacking>`

.. index:: models, ensemble, regression

Example:
    .. literalinclude:: /../nimbusml/examples/EnsembleRegressor.py
        :language: python
"""
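The average and median combiners described above are simple to state precisely. The sketch below (illustrative stdlib-only code, not the ML.NET implementation) also shows why the median is the more robust choice when a few sub-models go badly wrong:

```python
import statistics

def median_combiner(per_model_outputs):
    """RegressorMedian-style combination: the final prediction is the
    median of the individual model predictions, which resists a few
    badly-off sub-models."""
    return statistics.median(per_model_outputs)

def average_combiner(per_model_outputs):
    """RegressorAverage-style combination: the plain mean of the
    individual model predictions."""
    return statistics.fmean(per_model_outputs)

# One outlier model barely moves the median but drags the mean upward.
outputs = [10.1, 9.9, 10.0, 25.0]
print(median_combiner(outputs))   # near 10.05
print(average_combiner(outputs))  # near 13.75
```

This robustness difference is why a median combiner can pair well with a high-diversity submodel selector, where individual predictions are deliberately spread out.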