Feature: weighting for imbalanced classes
mfeurer committed May 2, 2015
1 parent a942472 commit 6de26d7
Showing 26 changed files with 341 additions and 63 deletions.
4 changes: 2 additions & 2 deletions ParamSklearn/base.py
@@ -88,8 +88,8 @@ def fit(self, X, Y, fit_params=None, init_params=None):

# separate the init parameters for the single methods
init_params_per_method = defaultdict(dict)
if init_params is not None:
for init_param, value in init_params:
if init_params is not None and len(init_params) != 0:
for init_param, value in init_params.items():
method, param = init_param.split(":")
init_params_per_method[method][param] = value

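This key-routing change is what the new balancing component relies on: it emits entries like 'classifier:class_weight' and trusts base.py to deliver them to the right pipeline step. A standalone sketch of the routing (the example keys are illustrative, not from the commit):

from collections import defaultdict

# Example keys of the form "method:param"; values are illustrative.
init_params = {'classifier:class_weight': 'auto',
               'preprocessor:class_weight': 'auto'}

init_params_per_method = defaultdict(dict)
if init_params is not None and len(init_params) != 0:
    for init_param, value in init_params.items():
        method, param = init_param.split(':')
        init_params_per_method[method][param] = value

# Each pipeline step now receives only its own keyword arguments:
# {'classifier': {'class_weight': 'auto'},
#  'preprocessor': {'class_weight': 'auto'}}
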
16 changes: 14 additions & 2 deletions ParamSklearn/classification.py
@@ -11,6 +11,7 @@
from ParamSklearn import components as components
from ParamSklearn.base import ParamSklearnBaseEstimator
from ParamSklearn.util import SPARSE
from ParamSklearn.components.preprocessing.balancing import Balancing
import ParamSklearn.create_searchspace_util


@@ -61,9 +62,19 @@ class ParamSklearnClassifier(ClassifierMixin, ParamSklearnBaseEstimator):
"""

def fit(self, X, Y, fit_params=None, init_params=None):
self.num_targets = 1 if len(Y.shape) == 1 else Y.shape[1]

# Weighting samples has to be done here, not in the components
if self.configuration['balancing:strategy'].value == 'weighting':
balancing = Balancing(strategy='weighting')
init_params, fit_params = balancing.get_weights(
Y, self.configuration['classifier'].value,
self.configuration['preprocessor'].value,
init_params, fit_params)

super(ParamSklearnClassifier, self).fit(X, Y, fit_params=fit_params,
init_params=init_params)
self.num_targets = 1 if len(Y.shape) == 1 else Y.shape[1]

return self

def predict_proba(self, X, batch_size=None):
@@ -415,4 +426,5 @@ def _get_estimator_components():

@staticmethod
def _get_pipeline():
return ["imputation", "rescaling", "__preprocessor__", "__estimator__"]
return ["imputation", "rescaling", "balancing", "__preprocessor__",
"__estimator__"]
5 changes: 2 additions & 3 deletions ParamSklearn/components/classification/adaboost.py
@@ -25,7 +25,7 @@ def __init__(self, n_estimators, learning_rate, algorithm='SAMME.R',

self.estimator = None

def fit(self, X, Y):
def fit(self, X, Y, sample_weight=None):
base_estimator = sklearn.tree.DecisionTreeClassifier(max_depth=self.max_depth)

self.estimator = sklearn.ensemble.AdaBoostClassifier(
@@ -34,9 +34,8 @@ def fit(self, X, Y):
learning_rate=self.learning_rate,
algorithm=self.algorithm,
random_state=self.random_state

)
self.estimator.fit(X, Y)
self.estimator.fit(X, Y, sample_weight=sample_weight)
return self

def predict(self, X):
4 changes: 2 additions & 2 deletions ParamSklearn/components/classification/decision_tree.py
@@ -35,15 +35,15 @@ def __init__(self, criterion, max_features, max_depth,
self.random_state = random_state
self.estimator = None

def fit(self, X, y):
def fit(self, X, y, sample_weight=None):
self.estimator = DecisionTreeClassifier(
criterion=self.criterion,
max_depth=self.max_depth,
min_samples_split=self.min_samples_split,
min_samples_leaf=self.min_samples_leaf,
max_leaf_nodes=self.max_leaf_nodes,
random_state=self.random_state)
self.estimator.fit(X, y)
self.estimator.fit(X, y, sample_weight=sample_weight)
return self

def predict(self, X):
4 changes: 2 additions & 2 deletions ParamSklearn/components/classification/extra_trees.py
@@ -59,7 +59,7 @@ def __init__(self, n_estimators, criterion, min_samples_leaf,
self.verbose = int(verbose)
self.estimator = None

def fit(self, X, Y):
def fit(self, X, Y, sample_weight=None):
num_features = X.shape[1]
max_features = int(float(self.max_features) * (np.log(num_features) + 1))
# Use at most half of the features
@@ -78,7 +78,7 @@ def fit(self, X, Y):
while len(self.estimator.estimators_) < self.n_estimators:
tmp = self.estimator # TODO copy ?
tmp.n_estimators += self.estimator_increment
tmp.fit(X, Y)
tmp.fit(X, Y, sample_weight=sample_weight)
self.estimator = tmp
return self

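The loop above grows the ensemble in estimator_increment steps and now passes sample_weight to every incremental fit. Roughly the same pattern can be written with scikit-learn's stock warm_start forests (a sketch; the commit itself relies on ParamSklearn's own forest code and older APIs):

import numpy as np
from sklearn.ensemble import ExtraTreesClassifier

rng = np.random.RandomState(0)
X = rng.randn(200, 10)
y = rng.randint(0, 2, 200)
sample_weight = np.where(y == 0, 0.5, 1.5)  # toy per-sample weights

est = ExtraTreesClassifier(n_estimators=10, warm_start=True, random_state=0)
est.fit(X, y, sample_weight=sample_weight)
while len(est.estimators_) < 100:
    est.n_estimators += 10                      # enlarge the ensemble...
    est.fit(X, y, sample_weight=sample_weight)  # ...fitting only the new trees
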
@@ -4,7 +4,6 @@
from HPOlibConfigSpace.hyperparameters import UniformFloatHyperparameter, \
CategoricalHyperparameter, UnParametrizedHyperparameter, \
UniformIntegerHyperparameter
from HPOlibConfigSpace.conditions import EqualsCondition

from ParamSklearn.components.classification_base import \
ParamSklearnClassificationAlgorithm
4 changes: 2 additions & 2 deletions ParamSklearn/components/classification/random_forest.py
@@ -28,7 +28,7 @@ def __init__(self, n_estimators, criterion, max_features,
self.n_jobs = n_jobs
self.estimator = None

def fit(self, X, Y):
def fit(self, X, Y, sample_weight=None):
self.n_estimators = int(self.n_estimators)

if self.max_depth == "None":
@@ -67,7 +67,7 @@ def fit(self, X, Y):
while len(self.estimator.estimators_) < self.n_estimators:
tmp = self.estimator # TODO I think we need to copy here!
tmp.n_estimators += self.estimator_increment
tmp.fit(X, Y)
tmp.fit(X, Y, sample_weight=sample_weight)
self.estimator = tmp
return self

7 changes: 5 additions & 2 deletions ParamSklearn/components/classification/ridge.py
@@ -13,17 +13,20 @@


class Ridge(ParamSklearnClassificationAlgorithm):
def __init__(self, alpha, fit_intercept, tol, random_state=None):
def __init__(self, alpha, fit_intercept, tol, class_weight=None,
random_state=None):
self.alpha = float(alpha)
self.fit_intercept = bool(fit_intercept)
self.tol = float(tol)
self.class_weight = class_weight
self.random_state = random_state
self.estimator = None

def fit(self, X, Y):
self.estimator = RidgeClassifier(alpha=self.alpha,
fit_intercept=self.fit_intercept,
tol=self.tol)
tol=self.tol,
class_weight=self.class_weight)
self.estimator.fit(X, Y)
return self

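Ridge takes the other route: no sample_weight in this code path, but a class_weight argument, for which balancing.py (below) builds an explicit per-class dict. A toy sketch of that construction, assuming RidgeClassifier's dict-style class_weight support:

import numpy as np
from sklearn.linear_model import RidgeClassifier

Y = np.array([0, 0, 0, 0, 0, 0, 1, 1])            # 6:2 class imbalance
unique, counts = np.unique(Y, return_counts=True)
cw = 1. / counts
cw = cw / np.mean(cw)                             # mean class weight == 1
class_weights = dict(zip(unique, cw))             # {0: 0.5, 1: 1.5}

clf = RidgeClassifier(alpha=1.0, class_weight=class_weights)
clf.fit(np.arange(8, dtype=float).reshape(-1, 1), Y)
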
7 changes: 1 addition & 6 deletions ParamSklearn/components/classification/sgd.py
@@ -13,7 +13,7 @@

class SGD(ParamSklearnClassificationAlgorithm):
def __init__(self, loss, penalty, alpha, fit_intercept, n_iter,
learning_rate, class_weight, l1_ratio=0.15, epsilon=0.1,
learning_rate, class_weight=None, l1_ratio=0.15, epsilon=0.1,
eta0=0.01, power_t=0.5, random_state=None):
self.loss = loss
self.penalty = penalty
@@ -111,10 +111,6 @@ def get_hyperparameter_search_space(dataset_properties=None):
["optimal", "invscaling", "constant"], default="optimal")
eta0 = UniformFloatHyperparameter("eta0", 10**-7, 0.1, default=0.01)
power_t = UniformFloatHyperparameter("power_t", 1e-5, 1, default=0.5)
# This does not allow for other resampling methods!
class_weight = CategoricalHyperparameter("class_weight",
["None", "auto"],
default="None")
cs = ConfigurationSpace()
cs.add_hyperparameter(loss)
cs.add_hyperparameter(penalty)
@@ -126,7 +122,6 @@ def get_hyperparameter_search_space(dataset_properties=None):
cs.add_hyperparameter(learning_rate)
cs.add_hyperparameter(eta0)
cs.add_hyperparameter(power_t)
cs.add_hyperparameter(class_weight)

# TODO add passive/aggressive here, although not properly documented?
elasticnet = EqualsCondition(l1_ratio, penalty, "elasticnet")
113 changes: 113 additions & 0 deletions ParamSklearn/components/preprocessing/balancing.py
@@ -0,0 +1,113 @@
import numpy as np

from HPOlibConfigSpace.configuration_space import ConfigurationSpace
from HPOlibConfigSpace.hyperparameters import CategoricalHyperparameter

from ParamSklearn.components.preprocessor_base import \
ParamSklearnPreprocessingAlgorithm
from ParamSklearn.util import DENSE, SPARSE, INPUT


class Balancing(ParamSklearnPreprocessingAlgorithm):
def __init__(self, strategy, random_state=None):
self.strategy = strategy

def fit(self, X, y=None):
raise NotImplementedError()

def transform(self, X):
raise NotImplementedError()

def get_weights(self, Y, classifier, preprocessor, init_params, fit_params):
if init_params is None:
init_params = {}

if fit_params is None:
fit_params = {}

# Classifiers which require sample weights:
# We can have adaboost in here, because in the fit method,
# the sample weights are normalized:
# https://github.com/scikit-learn/scikit-learn/blob/0.15.X/sklearn/ensemble/weight_boosting.py#L121
clf_ = ['adaboost', 'decision_tree', 'extra_trees', 'random_forest',
'gradient_boosting']
pre_ = ['extra_trees_preproc_for_classification']
if classifier in clf_ or preprocessor in pre_:
if len(Y.shape) > 1:
offsets = [2 ** i for i in range(Y.shape[1])]
Y_ = np.sum(Y * offsets, axis=1)
else:
Y_ = Y

unique, counts = np.unique(Y_, return_counts=True)
cw = 1. / counts
cw = cw / np.mean(cw)

sample_weights = np.ones(Y_.shape)

for i, ue in enumerate(unique):
mask = Y_ == ue
sample_weights[mask] *= cw[i]

if classifier in clf_:
fit_params['%s:sample_weight' % classifier] = sample_weights
if preprocessor in pre_:
fit_params['%s:sample_weight' % preprocessor] = sample_weights

# Classifiers which can adjust sample weights themselves via the
# argument `class_weight`
clf_ = ['liblinear_svc', 'libsvm_svc', 'sgd']
pre_ = ['liblinear_svc_preprocessor']
if classifier in clf_:
init_params['%s:class_weight' % classifier] = 'auto'
if preprocessor in pre_:
init_params['%s:class_weight' % preprocessor] = 'auto'

clf_ = ['ridge']
if classifier in clf_:
class_weights = {}

unique, counts = np.unique(Y, return_counts=True)
cw = 1. / counts
cw = cw / np.mean(cw)

for i, ue in enumerate(unique):
class_weights[ue] = cw[i]

if classifier in clf_:
init_params['%s:class_weight' % classifier] = class_weights

return init_params, fit_params

@staticmethod
def get_properties():
return {'shortname': 'Balancing',
'name': 'Balancing Imbalanced Class Distributions',
'handles_missing_values': True,
'handles_nominal_values': True,
'handles_numerical_features': True,
'prefers_data_scaled': False,
'prefers_data_normalized': False,
'handles_regression': False,
'handles_classification': True,
'handles_multiclass': True,
'handles_multilabel': True,
'is_deterministic': True,
'handles_sparse': True,
'handles_dense': True,
'input': (DENSE, SPARSE),
'output': INPUT,
'preferred_dtype': None}

@staticmethod
def get_hyperparameter_search_space(dataset_properties=None):
# TODO add replace by zero!
strategy = CategoricalHyperparameter(
"strategy", ["none", "weighting"], default="none")
cs = ConfigurationSpace()
cs.add_hyperparameter(strategy)
return cs

def __str__(self):
name = self.get_properties()['name']
return "ParamSklearn %s" % name
@@ -7,7 +7,7 @@

from ParamSklearn.components.preprocessor_base import \
ParamSklearnPreprocessingAlgorithm
from ParamSklearn.util import DENSE, PREDICTIONS
from ParamSklearn.util import DENSE, INPUT

# get our own forests to replace the sklearn ones
from ParamSklearn.implementations import forest
@@ -60,7 +60,7 @@ def __init__(self, n_estimators, criterion, min_samples_leaf,
self.verbose = int(verbose)
self.preprocessor = None

def fit(self, X, Y):
def fit(self, X, Y, sample_weight=None):
num_features = X.shape[1]
max_features = int(
float(self.max_features) * (np.log(num_features) + 1))
@@ -80,7 +80,7 @@ def fit(self, X, Y):
while len(self.preprocessor.estimators_) < self.n_estimators:
tmp = self.preprocessor # TODO copy ?
tmp.n_estimators += self.estimator_increment
tmp.fit(X, Y)
tmp.fit(X, Y, sample_weight=sample_weight)
self.preprocessor = tmp
return self

@@ -106,7 +106,7 @@ def get_properties():
'is_deterministic': True,
'handles_sparse': False,
'input': (DENSE, ),
'output': PREDICTIONS,
'output': INPUT,
# TODO find out what is best used here!
# But rather fortran or C-contiguous?
'preferred_dtype': np.float32}
9 changes: 5 additions & 4 deletions ParamSklearn/components/preprocessing/imputation.py
@@ -1,4 +1,5 @@
import ParamSklearn.implementations.Imputation
#import ParamSklearn.implementations.Imputation
import sklearn.preprocessing

from HPOlibConfigSpace.configuration_space import ConfigurationSpace
from HPOlibConfigSpace.hyperparameters import CategoricalHyperparameter
@@ -13,9 +14,9 @@ def __init__(self, strategy, random_state=None):
self.strategy = strategy

def fit(self, X, y=None):
self.preprocessor = ParamSklearn.implementations.Imputation.Imputer(
strategy=self.strategy, copy=False, dtype=X.dtype)
self.preprocessor.fit(X)
self.preprocessor = sklearn.preprocessing.Imputer(
strategy=self.strategy, copy=False) #, dtype=X.dtype)
self.preprocessor = self.preprocessor.fit(X)
return self

def transform(self, X):
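
The handwritten Imputer is replaced by the stock scikit-learn one, dropping the dtype argument. A minimal behaviour sketch, assuming a scikit-learn of this era (sklearn.preprocessing.Imputer was removed in 0.22):

import numpy as np
import sklearn.preprocessing

X = np.array([[1., np.nan],
              [3., 4.],
              [np.nan, 6.]])
imputer = sklearn.preprocessing.Imputer(strategy='mean', copy=False)
print(imputer.fit_transform(X))
# [[ 1.  5.]
#  [ 3.  4.]
#  [ 2.  6.]]   (column means fill the missing entries)
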
@@ -8,8 +8,7 @@

from ParamSklearn.components.preprocessor_base import \
ParamSklearnPreprocessingAlgorithm
from ParamSklearn.implementations.util import softmax
from ParamSklearn.util import SPARSE, DENSE, PREDICTIONS
from ParamSklearn.util import SPARSE, DENSE, INPUT


class LibLinear_Preprocessor(ParamSklearnPreprocessingAlgorithm):
@@ -73,7 +72,7 @@ def get_properties():
# this here suggests so http://scikit-learn.org/stable/modules/svm.html#tips-on-practical-use
'handles_sparse': True,
'input': (SPARSE, DENSE),
'output': PREDICTIONS,
'output': INPUT,
# TODO find out what is best used here!
'preferred_dtype': None}

@@ -39,7 +39,7 @@ def fit(self, X, Y=None):
n_jobs=self.n_jobs,
random_state=self.random_state
)
self.preprocessor.fit(X)
self.preprocessor.fit(X, Y)
return self

def transform(self, X):
11 changes: 10 additions & 1 deletion ParamSklearn/components/preprocessing/select_rates.py
@@ -35,7 +35,16 @@ def fit(self, X, y):
def transform(self, X):
if self.preprocessor is None:
raise NotImplementedError()
Xt = self.preprocessor.transform(X)
try:
Xt = self.preprocessor.transform(X)
except ValueError as e:
if "zero-size array to reduction operation maximum which has no " \
"identity" in e.message:
raise ValueError(
"%s removed all features." % self.__class__.__name__)
else:
raise e

if Xt.shape[1] == 0:
raise ValueError(
"%s removed all features." % self.__class__.__name__)
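
The added guard converts numpy's opaque "zero-size array to reduction operation maximum" failure into a readable error when the univariate test rejects every column. A hedged reproduction of that situation with scikit-learn's generic selector (illustrative threshold; not the commit's exact wrapper):

import numpy as np
from sklearn.feature_selection import GenericUnivariateSelect, f_classif

rng = np.random.RandomState(0)
X = rng.randn(100, 5)                 # pure-noise features
y = rng.randint(0, 2, 100)

# An extreme false-positive-rate threshold keeps (almost surely) no feature.
selector = GenericUnivariateSelect(score_func=f_classif,
                                   mode='fpr', param=1e-12).fit(X, y)
Xt = selector.transform(X)            # shape (100, 0)
if Xt.shape[1] == 0:
    raise ValueError("SelectRates removed all features.")
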
(Diff truncated: the remaining changed files are not shown.)