Feature: weighting for imbalanced classes
mfeurer committed May 2, 2015
1 parent a942472 commit 6de26d7
Showing 26 changed files with 341 additions and 63 deletions.
4 changes: 2 additions & 2 deletions ParamSklearn/base.py
@@ -88,8 +88,8 @@ def fit(self, X, Y, fit_params=None, init_params=None):

# separate the init parameters for the single methods
init_params_per_method = defaultdict(dict)
if init_params is not None:
for init_param, value in init_params:
if init_params is not None and len(init_params) != 0:
for init_param, value in init_params.items():
method, param = init_param.split(":")
init_params_per_method[method][param] = value

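This key-routing change is what the new balancing component relies on: it emits entries like 'classifier:class_weight' and trusts base.py to deliver them to the right pipeline step. A standalone sketch of the routing (the example keys are illustrative, not from the commit):

from collections import defaultdict

# Example keys of the form "method:param"; values are illustrative.
init_params = {'classifier:class_weight': 'auto',
               'preprocessor:class_weight': 'auto'}

init_params_per_method = defaultdict(dict)
if init_params is not None and len(init_params) != 0:
    for init_param, value in init_params.items():
        method, param = init_param.split(':')
        init_params_per_method[method][param] = value

# Each pipeline step now receives only its own keyword arguments:
# {'classifier': {'class_weight': 'auto'},
#  'preprocessor': {'class_weight': 'auto'}}
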
16 changes: 14 additions & 2 deletions ParamSklearn/classification.py
@@ -11,6 +11,7 @@
from ParamSklearn import components as components
from ParamSklearn.base import ParamSklearnBaseEstimator
from ParamSklearn.util import SPARSE
from ParamSklearn.components.preprocessing.balancing import Balancing
import ParamSklearn.create_searchspace_util


@@ -61,9 +62,19 @@ class ParamSklearnClassifier(ClassifierMixin, ParamSklearnBaseEstimator):
"""

def fit(self, X, Y, fit_params=None, init_params=None):
self.num_targets = 1 if len(Y.shape) == 1 else Y.shape[1]

# Weighting samples has to be done here, not in the components
if self.configuration['balancing:strategy'].value == 'weighting':
balancing = Balancing(strategy='weighting')
init_params, fit_params = balancing.get_weights(
Y, self.configuration['classifier'].value,
self.configuration['preprocessor'].value,
init_params, fit_params)

super(ParamSklearnClassifier, self).fit(X, Y, fit_params=fit_params,
init_params=init_params)
self.num_targets = 1 if len(Y.shape) == 1 else Y.shape[1]

return self

def predict_proba(self, X, batch_size=None):
@@ -415,4 +426,5 @@ def _get_estimator_components():

@staticmethod
def _get_pipeline():
return ["imputation", "rescaling", "__preprocessor__", "__estimator__"]
return ["imputation", "rescaling", "balancing", "__preprocessor__",
"__estimator__"]
5 changes: 2 additions & 3 deletions ParamSklearn/components/classification/adaboost.py
@@ -25,7 +25,7 @@ def __init__(self, n_estimators, learning_rate, algorithm='SAMME.R',

self.estimator = None

def fit(self, X, Y):
def fit(self, X, Y, sample_weight=None):
base_estimator = sklearn.tree.DecisionTreeClassifier(max_depth=self.max_depth)

self.estimator = sklearn.ensemble.AdaBoostClassifier(
@@ -34,9 +34,8 @@ def fit(self, X, Y):
learning_rate=self.learning_rate,
algorithm=self.algorithm,
random_state=self.random_state

)
self.estimator.fit(X, Y)
self.estimator.fit(X, Y, sample_weight=sample_weight)
return self

def predict(self, X):
4 changes: 2 additions & 2 deletions ParamSklearn/components/classification/decision_tree.py
@@ -35,15 +35,15 @@ def __init__(self, criterion, max_features, max_depth,
self.random_state = random_state
self.estimator = None

def fit(self, X, y):
def fit(self, X, y, sample_weight=None):
self.estimator = DecisionTreeClassifier(
criterion=self.criterion,
max_depth=self.max_depth,
min_samples_split=self.min_samples_split,
min_samples_leaf=self.min_samples_leaf,
max_leaf_nodes=self.max_leaf_nodes,
random_state=self.random_state)
self.estimator.fit(X, y)
self.estimator.fit(X, y, sample_weight=sample_weight)
return self

def predict(self, X):
4 changes: 2 additions & 2 deletions ParamSklearn/components/classification/extra_trees.py
@@ -59,7 +59,7 @@ def __init__(self, n_estimators, criterion, min_samples_leaf,
self.verbose = int(verbose)
self.estimator = None

def fit(self, X, Y):
def fit(self, X, Y, sample_weight=None):
num_features = X.shape[1]
max_features = int(float(self.max_features) * (np.log(num_features) + 1))
# Use at most half of the features
@@ -78,7 +78,7 @@ def fit(self, X, Y):
while len(self.estimator.estimators_) < self.n_estimators:
tmp = self.estimator # TODO copy ?
tmp.n_estimators += self.estimator_increment
tmp.fit(X, Y)
tmp.fit(X, Y, sample_weight=sample_weight)
self.estimator = tmp
return self

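The loop above grows the ensemble in estimator_increment steps and now passes sample_weight to every incremental fit. Roughly the same pattern can be written with scikit-learn's stock warm_start forests (a sketch; the commit itself relies on ParamSklearn's own forest code and older APIs):

import numpy as np
from sklearn.ensemble import ExtraTreesClassifier

rng = np.random.RandomState(0)
X = rng.randn(200, 10)
y = rng.randint(0, 2, 200)
sample_weight = np.where(y == 0, 0.5, 1.5)  # toy per-sample weights

est = ExtraTreesClassifier(n_estimators=10, warm_start=True, random_state=0)
est.fit(X, y, sample_weight=sample_weight)
while len(est.estimators_) < 100:
    est.n_estimators += 10                      # enlarge the ensemble...
    est.fit(X, y, sample_weight=sample_weight)  # ...fitting only the new trees
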
@@ -4,7 +4,6 @@
from HPOlibConfigSpace.hyperparameters import UniformFloatHyperparameter, \
CategoricalHyperparameter, UnParametrizedHyperparameter, \
UniformIntegerHyperparameter
from HPOlibConfigSpace.conditions import EqualsCondition

from ParamSklearn.components.classification_base import \
ParamSklearnClassificationAlgorithm
4 changes: 2 additions & 2 deletions ParamSklearn/components/classification/random_forest.py
@@ -28,7 +28,7 @@ def __init__(self, n_estimators, criterion, max_features,
self.n_jobs = n_jobs
self.estimator = None

def fit(self, X, Y):
def fit(self, X, Y, sample_weight=None):
self.n_estimators = int(self.n_estimators)

if self.max_depth == "None":
@@ -67,7 +67,7 @@ def fit(self, X, Y):
while len(self.estimator.estimators_) < self.n_estimators:
tmp = self.estimator # TODO I think we need to copy here!
tmp.n_estimators += self.estimator_increment
tmp.fit(X, Y)
tmp.fit(X, Y, sample_weight=sample_weight)
self.estimator = tmp
return self

7 changes: 5 additions & 2 deletions ParamSklearn/components/classification/ridge.py
@@ -13,17 +13,20 @@


class Ridge(ParamSklearnClassificationAlgorithm):
def __init__(self, alpha, fit_intercept, tol, random_state=None):
def __init__(self, alpha, fit_intercept, tol, class_weight=None,
random_state=None):
self.alpha = float(alpha)
self.fit_intercept = bool(fit_intercept)
self.tol = float(tol)
self.class_weight = class_weight
self.random_state = random_state
self.estimator = None

def fit(self, X, Y):
self.estimator = RidgeClassifier(alpha=self.alpha,
fit_intercept=self.fit_intercept,
tol=self.tol)
tol=self.tol,
class_weight=self.class_weight)
self.estimator.fit(X, Y)
return self

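Ridge takes the other route: no sample_weight in this code path, but a class_weight argument, for which balancing.py (below) builds an explicit per-class dict. A toy sketch of that construction, assuming RidgeClassifier's dict-style class_weight support:

import numpy as np
from sklearn.linear_model import RidgeClassifier

Y = np.array([0, 0, 0, 0, 0, 0, 1, 1])            # 6:2 class imbalance
unique, counts = np.unique(Y, return_counts=True)
cw = 1. / counts
cw = cw / np.mean(cw)                             # mean class weight == 1
class_weights = dict(zip(unique, cw))             # {0: 0.5, 1: 1.5}

clf = RidgeClassifier(alpha=1.0, class_weight=class_weights)
clf.fit(np.arange(8, dtype=float).reshape(-1, 1), Y)
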
7 changes: 1 addition & 6 deletions ParamSklearn/components/classification/sgd.py
@@ -13,7 +13,7 @@

class SGD(ParamSklearnClassificationAlgorithm):
def __init__(self, loss, penalty, alpha, fit_intercept, n_iter,
learning_rate, class_weight, l1_ratio=0.15, epsilon=0.1,
learning_rate, class_weight=None, l1_ratio=0.15, epsilon=0.1,
eta0=0.01, power_t=0.5, random_state=None):
self.loss = loss
self.penalty = penalty
@@ -111,10 +111,6 @@ def get_hyperparameter_search_space(dataset_properties=None):
["optimal", "invscaling", "constant"], default="optimal")
eta0 = UniformFloatHyperparameter("eta0", 10**-7, 0.1, default=0.01)
power_t = UniformFloatHyperparameter("power_t", 1e-5, 1, default=0.5)
# This does not allow for other resampling methods!
class_weight = CategoricalHyperparameter("class_weight",
["None", "auto"],
default="None")
cs = ConfigurationSpace()
cs.add_hyperparameter(loss)
cs.add_hyperparameter(penalty)
@@ -126,7 +122,6 @@ def get_hyperparameter_search_space(dataset_properties=None):
cs.add_hyperparameter(learning_rate)
cs.add_hyperparameter(eta0)
cs.add_hyperparameter(power_t)
cs.add_hyperparameter(class_weight)

# TODO add passive/aggressive here, although not properly documented?
elasticnet = EqualsCondition(l1_ratio, penalty, "elasticnet")
113 changes: 113 additions & 0 deletions ParamSklearn/components/preprocessing/balancing.py
@@ -0,0 +1,113 @@
import numpy as np

from HPOlibConfigSpace.configuration_space import ConfigurationSpace
from HPOlibConfigSpace.hyperparameters import CategoricalHyperparameter

from ParamSklearn.components.preprocessor_base import \
ParamSklearnPreprocessingAlgorithm
from ParamSklearn.util import DENSE, SPARSE, INPUT


class Balancing(ParamSklearnPreprocessingAlgorithm):
def __init__(self, strategy, random_state=None):
self.strategy = strategy

def fit(self, X, y=None):
raise NotImplementedError()

def transform(self, X):
raise NotImplementedError()

def get_weights(self, Y, classifier, preprocessor, init_params, fit_params):
if init_params is None:
init_params = {}

if fit_params is None:
fit_params = {}

# Classifiers which require sample weights:
# We can have adaboost in here, because in the fit method,
# the sample weights are normalized:
# https://github.com/scikit-learn/scikit-learn/blob/0.15.X/sklearn/ensemble/weight_boosting.py#L121
clf_ = ['adaboost', 'decision_tree', 'extra_trees', 'random_forest',
'gradient_boosting']
pre_ = ['extra_trees_preproc_for_classification']
if classifier in clf_ or preprocessor in pre_:
if len(Y.shape) > 1:
offsets = [2 ** i for i in range(Y.shape[1])]
Y_ = np.sum(Y * offsets, axis=1)
else:
Y_ = Y

unique, counts = np.unique(Y_, return_counts=True)
cw = 1. / counts
cw = cw / np.mean(cw)

sample_weights = np.ones(Y_.shape)

for i, ue in enumerate(unique):
mask = Y_ == ue
sample_weights[mask] *= cw[i]

if classifier in clf_:
fit_params['%s:sample_weight' % classifier] = sample_weights
if preprocessor in pre_:
fit_params['%s:sample_weight' % preprocessor] = sample_weights

# Classifiers which can adjust sample weights themselves via the
# argument `class_weight`
clf_ = ['liblinear_svc', 'libsvm_svc', 'sgd']
pre_ = ['liblinear_svc_preprocessor']
if classifier in clf_:
init_params['%s:class_weight' % classifier] = 'auto'
if preprocessor in pre_:
init_params['%s:class_weight' % preprocessor] = 'auto'

clf_ = ['ridge']
if classifier in clf_:
class_weights = {}

unique, counts = np.unique(Y, return_counts=True)
cw = 1. / counts
cw = cw / np.mean(cw)

for i, ue in enumerate(unique):
class_weights[ue] = cw[i]

if classifier in clf_:
init_params['%s:class_weight' % classifier] = class_weights

return init_params, fit_params

@staticmethod
def get_properties():
return {'shortname': 'Balancing',
'name': 'Balancing Imbalanced Class Distributions',
'handles_missing_values': True,
'handles_nominal_values': True,
'handles_numerical_features': True,
'prefers_data_scaled': False,
'prefers_data_normalized': False,
'handles_regression': False,
'handles_classification': True,
'handles_multiclass': True,
'handles_multilabel': True,
'is_deterministic': True,
'handles_sparse': True,
'handles_dense': True,
'input': (DENSE, SPARSE),
'output': INPUT,
'preferred_dtype': None}

@staticmethod
def get_hyperparameter_search_space(dataset_properties=None):
# TODO add replace by zero!
strategy = CategoricalHyperparameter(
"strategy", ["none", "weighting"], default="none")
cs = ConfigurationSpace()
cs.add_hyperparameter(strategy)
return cs

def __str__(self):
name = self.get_properties()['name']
return "ParamSklearn %s" % name
@@ -7,7 +7,7 @@

from ParamSklearn.components.preprocessor_base import \
ParamSklearnPreprocessingAlgorithm
from ParamSklearn.util import DENSE, PREDICTIONS
from ParamSklearn.util import DENSE, INPUT

# get our own forests to replace the sklearn ones
from ParamSklearn.implementations import forest
@@ -60,7 +60,7 @@ def __init__(self, n_estimators, criterion, min_samples_leaf,
self.verbose = int(verbose)
self.preprocessor = None

def fit(self, X, Y):
def fit(self, X, Y, sample_weight=None):
num_features = X.shape[1]
max_features = int(
float(self.max_features) * (np.log(num_features) + 1))
@@ -80,7 +80,7 @@ def fit(self, X, Y):
while len(self.preprocessor.estimators_) < self.n_estimators:
tmp = self.preprocessor # TODO copy ?
tmp.n_estimators += self.estimator_increment
tmp.fit(X, Y)
tmp.fit(X, Y, sample_weight=sample_weight)
self.preprocessor = tmp
return self

@@ -106,7 +106,7 @@ def get_properties():
'is_deterministic': True,
'handles_sparse': False,
'input': (DENSE, ),
'output': PREDICTIONS,
'output': INPUT,
# TODO find out what is best used here!
# But rather fortran or C-contiguous?
'preferred_dtype': np.float32}
9 changes: 5 additions & 4 deletions ParamSklearn/components/preprocessing/imputation.py
@@ -1,4 +1,5 @@
import ParamSklearn.implementations.Imputation
#import ParamSklearn.implementations.Imputation
import sklearn.preprocessing

from HPOlibConfigSpace.configuration_space import ConfigurationSpace
from HPOlibConfigSpace.hyperparameters import CategoricalHyperparameter
@@ -13,9 +14,9 @@ def __init__(self, strategy, random_state=None):
self.strategy = strategy

def fit(self, X, y=None):
self.preprocessor = ParamSklearn.implementations.Imputation.Imputer(
strategy=self.strategy, copy=False, dtype=X.dtype)
self.preprocessor.fit(X)
self.preprocessor = sklearn.preprocessing.Imputer(
strategy=self.strategy, copy=False) #, dtype=X.dtype)
self.preprocessor = self.preprocessor.fit(X)
return self

def transform(self, X):
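
The handwritten Imputer is replaced by the stock scikit-learn one, dropping the dtype argument. A minimal behaviour sketch, assuming a scikit-learn of this era (sklearn.preprocessing.Imputer was removed in 0.22):

import numpy as np
import sklearn.preprocessing

X = np.array([[1., np.nan],
              [3., 4.],
              [np.nan, 6.]])
imputer = sklearn.preprocessing.Imputer(strategy='mean', copy=False)
print(imputer.fit_transform(X))
# [[ 1.  5.]
#  [ 3.  4.]
#  [ 2.  6.]]   (column means fill the missing entries)
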
@@ -8,8 +8,7 @@

from ParamSklearn.components.preprocessor_base import \
ParamSklearnPreprocessingAlgorithm
from ParamSklearn.implementations.util import softmax
from ParamSklearn.util import SPARSE, DENSE, PREDICTIONS
from ParamSklearn.util import SPARSE, DENSE, INPUT


class LibLinear_Preprocessor(ParamSklearnPreprocessingAlgorithm):
@@ -73,7 +72,7 @@ def get_properties():
# this here suggests so http://scikit-learn.org/stable/modules/svm.html#tips-on-practical-use
'handles_sparse': True,
'input': (SPARSE, DENSE),
'output': PREDICTIONS,
'output': INPUT,
# TODO find out what is best used here!
'preferred_dtype': None}

@@ -39,7 +39,7 @@ def fit(self, X, Y=None):
n_jobs=self.n_jobs,
random_state=self.random_state
)
self.preprocessor.fit(X)
self.preprocessor.fit(X, Y)
return self

def transform(self, X):
11 changes: 10 additions & 1 deletion ParamSklearn/components/preprocessing/select_rates.py
@@ -35,7 +35,16 @@ def fit(self, X, y):
def transform(self, X):
if self.preprocessor is None:
raise NotImplementedError()
Xt = self.preprocessor.transform(X)
try:
Xt = self.preprocessor.transform(X)
except ValueError as e:
if "zero-size array to reduction operation maximum which has no " \
"identity" in e.message:
raise ValueError(
"%s removed all features." % self.__class__.__name__)
else:
raise e

if Xt.shape[1] == 0:
raise ValueError(
"%s removed all features." % self.__class__.__name__)
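
The added guard converts numpy's opaque "zero-size array to reduction operation maximum" failure into a readable error when the univariate test rejects every column. A hedged reproduction of that situation with scikit-learn's generic selector (illustrative threshold; not the commit's exact wrapper):

import numpy as np
from sklearn.feature_selection import GenericUnivariateSelect, f_classif

rng = np.random.RandomState(0)
X = rng.randn(100, 5)                 # pure-noise features
y = rng.randint(0, 2, 100)

# An extreme false-positive-rate threshold keeps (almost surely) no feature.
selector = GenericUnivariateSelect(score_func=f_classif,
                                   mode='fpr', param=1e-12).fit(X, y)
Xt = selector.transform(X)            # shape (100, 0)
if Xt.shape[1] == 0:
    raise ValueError("SelectRates removed all features.")
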
(Diff truncated: the remaining changed files are not shown.)