[WIP] ENH: Class Sensitive Scaling #416

Open · wants to merge 6 commits into master
8 changes: 8 additions & 0 deletions imblearn/scaling/__init__.py
@@ -0,0 +1,8 @@
"""
The :mod:`imblearn.scaling` module provides a set of methods to
perform class specific scaling.
"""

from .css import CSS

__all__ = ['CSS']
18 changes: 18 additions & 0 deletions imblearn/scaling/base.py
@@ -0,0 +1,18 @@
"""
Base class for the scaling method.
"""
# Authors: Bernhard Schlegel <bernhard.schlegel@mytum.de>
# License: MIT


from ..base import BaseSampler


class BaseScaler(BaseSampler):
"""Base class for over-sampling algorithms.

Warning: This class should not be used directly. Use the derived classes
instead.
"""

_sampling_type = 'scaling'
243 changes: 243 additions & 0 deletions imblearn/scaling/css.py
@@ -0,0 +1,243 @@
"""Class to perform sample scaling using class specific scaling (CSS)."""
# Authors: Bernhard Schlegel <bernhard.schlegel@mytum.de>
# License: MIT


from __future__ import division, print_function
from collections import Counter
import random
import numpy as np
from sklearn.utils import check_random_state, safe_indexing

from .base import BaseScaler

CSS_MODE = ('linear', 'constant')
CSS_SAMPLING_STRATEGY = ('minority', 'majority', 'both')


class CSS(BaseScaler):
"""Class to perform sample scaling using class specific scaling (CSS).

Parameters
----------
mode : str (default = 'linear')
Defines the scaling mode. Currently, two modes are implemented: `'constant'`
and `'linear'`.

In `'constant'` mode, all samples of the class selected by `sampling_strategy`
are scaled toward their class specific center by the same amount `c`. The
following formula is applied to calculate the new feature (`X`) values:
`X[y==0] * (1-c) + col_means * c`
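
For example (illustrative numbers): with `c = 0.25`, a feature value of `2.0`
whose class specific center (column mean) is `0.0` is moved to
`2.0 * 0.75 + 0.0 * 0.25 = 1.5`.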

In `'linear'` mode, all samples are scaled toward their class specific
center depending on their distance to it and on `c`. Samples that are
one standard deviation away from the class center are scaled by `c`.
The following formula is applied to calculate the new feature (`X`) values:
`norm = distances * c + (1-c)`
`X[y==0] * (1-c) / norm + col_means * (distances * c) / norm`


sampling_strategy : str (default = 'minority')
Defines which class to scale. Possible values are 'minority', 'majority',
and 'both'. Note that all samples are scaled toward their corresponding
class center.

c : float (default = 0.25)
Defines the amount of scaling. Must lie in the interval (0, 1].

minority_class_value : int (default = None)
Class label indicating the minority class. By default (`None`) the minority
class will be determined automatically. Use any integer number (e.g. `0`,
`1` or `-1`) to force the minority class.

random_state : int, RandomState instance or None, optional (default=None)
If int, random_state is the seed used by the random number generator;
If RandomState instance, random_state is the random number generator;
If None, the random number generator is the RandomState instance used
by np.random.

Attributes
----------
mode_ : str
CSS mode ('constant' or 'linear')

sampling_strategy_ : str or int
Name of the sampling_strategy class ('majority', 'minority', 'both')

minority_class_value_ : int
Class label indicating the minority class.

c_ : float
The scaling factor `c`.

shuffle : bool
If True, the returned samples will be shuffled.

Examples
--------

>>> import numpy as np
>>> from sklearn.utils import shuffle
>>> from imblearn.scaling import CSS
>>> rng = np.random.RandomState(42)
>>> n_samples_1 = 50
>>> n_samples_2 = 5
>>> X_syn = np.r_[1.5 * rng.randn(n_samples_1, 2), 0.5 * rng.randn(n_samples_2, 2) + [2, 2]]
>>> y_syn = np.array([0] * (n_samples_1) + [1] * (n_samples_2))
>>> X_syn, y_syn = shuffle(X_syn, y_syn)
>>> css = CSS(mode="linear", sampling_strategy="both", c=0.1, shuffle=True)
>>> X_train_res, y_train_res = css.fit_sample(X_syn, y_syn)

References
----------
.. [1] B. Schlegel, and B. Sick. "Dealing with class imbalance the scalable way:
Evaluation of various techniques based on classification grade and computational
complexity." 2017 IEEE International Conference on Data Mining Workshops, 2017.
"""

def __init__(self,
sampling_strategy='minority',
mode='linear',
c=0.25,
minority_class_value=None,
shuffle=True):
super(CSS, self).__init__()
self.sampling_strategy = sampling_strategy
self.mode = mode
self.c = c
self.minority_class_value = minority_class_value
self.shuffle = shuffle

def fit(self, X, y):
"""Find the classes statistics before to perform sampling.

Parameters
----------
X : ndarray, shape (n_samples, n_features)
Matrix containing the data which have to be scaled.

y : ndarray, shape (n_samples, )
Corresponding label for each sample in X.

Returns
-------
self : object,
Return self.

"""
super(CSS, self).fit(X, y)

if self.mode not in CSS_MODE:
raise ValueError('Unknown kind for CSS mode.'
' Choices are {}. Got {} instead.'.format(
CSS_MODE, self.mode))

if self.sampling_strategy not in CSS_SAMPLING_STRATEGY:
raise ValueError('Unknown kind for CSS sampling_strategy.'
' Choices are {}. Got {} instead.'.format(
CSS_SAMPLING_STRATEGY, self.sampling_strategy))

if self.c < 0 or self.c > 1:
raise ValueError('Received scaling factor c={}, which'
' is outside the allowed range '
'(0, 1].'.format(self.c))
if self.c == 0:
raise ValueError('Received scaling factor c={}, which is'
' equivalent to no CSS at all.'.format(self.c))

if (self.minority_class_value is not None and
not isinstance(self.minority_class_value, int)):
raise ValueError('Unallowed minority class value \'{}\'.'
' Valid values include None to automatically'
' infer the minority class or any integer'
' corresponding to the value of the label'
' in y.'.format(self.minority_class_value))

return self

def _shuffleTwo(self, a, b):
"""Shuffle two arrays in unison and return the permutation used."""
indexes = np.arange(len(a))
random.shuffle(indexes)
a2, b2 = a[indexes], b[indexes]

return a2, b2, indexes

def _sample(self, X, y):
"""scales the dataset.

Parameters
----------
X : ndarray, shape (n_samples, n_features)
Matrix containing the data which have to be scaled.

y : ndarray, shape (n_samples, )
Corresponding label for each sample in X.

Returns
-------
X_scaled : ndarray, shape (n_samples, n_features)
The array containing the scaled data.

y_scaled : ndarray, shape (n_samples)
The corresponding label of `X_scaled`

"""

minority_class = self.minority_class_value
if minority_class is None:
# infer minority class value
counts = Counter(y)
least_common = counts.most_common()[:-1-1:-1]
A maintainer left a review comment on this line:
This is difficult to read.

mcv = min(counts, key=counts.get)

You should check the other samplers, but I think we call `counts` `target_stats` there.
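
As a quick illustration of that suggestion (toy labels, not part of the diff):

>>> from collections import Counter
>>> counts = Counter([0, 0, 0, 1, 0, 1])
>>> min(counts, key=counts.get)  # label with the fewest occurrences
1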

minority_class = least_common[0][0]

# get indices for later, safe indexing
majority_class_indices = (y != minority_class)
minority_class_indices = (y == minority_class)

# in the following _majority is majority, _minority is minority
if self.sampling_strategy == "majority" or self.sampling_strategy == "both":
# mean_majority_class is the mean of all features (=columns)
mean_majority_class = np.mean(safe_indexing(X, majority_class_indices), axis=0)
if self.mode == "linear":
distances_majority = abs(np.subtract(safe_indexing(X, majority_class_indices), mean_majority_class))
if self.sampling_strategy == "minority" or self.sampling_strategy == "both":
mean_minority_class = np.mean(safe_indexing(X, minority_class_indices), axis=0)
if self.mode == "linear":
distances_minority = abs(np.subtract(safe_indexing(X, minority_class_indices), mean_minority_class))

if self.sampling_strategy == "majority" or self.sampling_strategy == "both":
if self.mode == "constant":
X_scaled_majority = safe_indexing(X, majority_class_indices) * (1 - self.c) + mean_majority_class * self.c
elif self.mode == "linear":
scale_factors_mean = (distances_majority * self.c)
scale_factors_values = (1 - self.c * distances_majority)

X_scaled_majority = safe_indexing(X, majority_class_indices) * scale_factors_values + mean_majority_class * scale_factors_mean
if self.sampling_strategy == "minority" or self.sampling_strategy == "both":
if self.mode == "constant":
X_scaled_minority = safe_indexing(X, minority_class_indices) * (1 - self.c) + mean_minority_class * self.c
elif self.mode == "linear":
scale_factors_mean = (distances_minority * self.c)
scale_factors_values = (1 - self.c * distances_minority)
X_scaled_minority = safe_indexing(X, minority_class_indices) * scale_factors_values + mean_minority_class * scale_factors_mean

# merge scaled and non scaled stuff
if self.sampling_strategy == "majority":
X_scaled = np.concatenate([X_scaled_majority, safe_indexing(X, minority_class_indices)], axis=0)
elif self.sampling_strategy == "minority":
X_scaled = np.concatenate([safe_indexing(X, majority_class_indices), X_scaled_minority], axis=0)
else:  # "both"
X_scaled = np.concatenate([X_scaled_majority, X_scaled_minority], axis=0)

# make sure that y is in the same order as X
y_assembled = np.concatenate([y[majority_class_indices], y[minority_class_indices]], axis=0)

# shuffle
X_scaled_shuffled, y_res_shuffled, indices = self._shuffleTwo(X_scaled, y_assembled)

if self.shuffle:
return X_scaled_shuffled, y_res_shuffled
else:
return X_scaled, y_assembled
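
For reference (not part of the diff), a minimal sketch of what the `'linear'`
branch computes on toy values; `X_min`, `center`, `dist` and `c` are
illustrative names mirroring the expressions in `_sample` above:

import numpy as np

# Toy minority-class samples and their per-feature class center.
X_min = np.array([[0.0, 4.0],
                  [2.0, 0.0]])
center = X_min.mean(axis=0)      # class center per feature: [1.0, 2.0]
dist = np.abs(X_min - center)    # distance of each value to its class center

c = 0.25
# Same expressions as scale_factors_values / scale_factors_mean in the code:
X_scaled = X_min * (1 - c * dist) + center * (c * dist)

print(X_scaled)
# [[0.25 3.  ]
#  [1.75 1.  ]]   values farther from the center are pulled in more strongly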