"""Class to perform over-sampling using ADASYN."""
# Authors: Guillaume Lemaitre <g.lemaitre58@gmail.com>
# Christos Aridas
# License: MIT
import numbers
import warnings
import numpy as np
from scipy import sparse
from sklearn.utils import _safe_indexing, check_random_state
from ..utils import Substitution, check_neighbors_object
from ..utils._docstring import _n_jobs_docstring, _random_state_docstring
from ..utils._param_validation import HasMethods, Interval
from .base import BaseOverSampler
@Substitution(
sampling_strategy=BaseOverSampler._sampling_strategy_docstring,
n_jobs=_n_jobs_docstring,
random_state=_random_state_docstring,
)
class ADASYN(BaseOverSampler):
"""Oversample using Adaptive Synthetic (ADASYN) algorithm.
This method is similar to SMOTE but it generates different number of
samples depending on an estimate of the local distribution of the class
to be oversampled.
Read more in the :ref:`User Guide <smote_adasyn>`.

    Parameters
    ----------
    {sampling_strategy}

    {random_state}

    n_neighbors : int or estimator object, default=5
        The nearest neighbors used to define the neighborhood of samples to use
        to generate the synthetic samples. You can pass:

        - an `int` corresponding to the number of neighbors to use. A
          :class:`~sklearn.neighbors.NearestNeighbors` instance will be fitted
          in this case.
        - an instance of a compatible nearest neighbors algorithm that should
          implement both methods `kneighbors` and `kneighbors_graph`. For
          instance, it could correspond to a
          :class:`~sklearn.neighbors.NearestNeighbors` but could be extended to
          any compatible class (see the sketch below).
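
        As a sketch, one can pass a pre-configured
        :class:`~sklearn.neighbors.NearestNeighbors` instance; any estimator
        exposing `kneighbors` and `kneighbors_graph` would work the same way:

        >>> from sklearn.neighbors import NearestNeighbors
        >>> from imblearn.over_sampling import ADASYN
        >>> ada = ADASYN(n_neighbors=NearestNeighbors(n_neighbors=6))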

    {n_jobs}

        .. deprecated:: 0.10
           `n_jobs` has been deprecated in 0.10 and will be removed in 0.12.
           It was previously used to set `n_jobs` of the nearest neighbors
           algorithm. From now on, you can pass an estimator where `n_jobs` is
           already set instead.

    Attributes
    ----------
    sampling_strategy_ : dict
        Dictionary containing the information to sample the dataset. The keys
        correspond to the class labels from which to sample and the values
        are the number of samples to sample.

    nn_ : estimator object
        Validated k-nearest neighbours estimator linked to the parameter
        `n_neighbors`.

    n_features_in_ : int
        Number of features in the input dataset.

        .. versionadded:: 0.9

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during `fit`. Defined only when `X` has feature
        names that are all strings.

        .. versionadded:: 0.10

    See Also
    --------
    SMOTE : Over-sample using SMOTE.

    SMOTENC : Over-sample using SMOTE for continuous and categorical features.

    SMOTEN : Over-sample using the SMOTE variant specifically for categorical
        features only.

    SVMSMOTE : Over-sample using SVM-SMOTE variant.

    BorderlineSMOTE : Over-sample using Borderline-SMOTE variant.

    Notes
    -----
    The implementation is based on [1]_.

    Supports multi-class resampling. A one-vs.-rest scheme is used.
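
    Following [1]_, the number of synthetic samples generated for each
    minority sample is proportional to the fraction of its `k` nearest
    neighbors that belong to another class, ``r_i = delta_i / k``; these
    ratios are normalized to sum to one and multiplied by the total number of
    samples to generate, so harder-to-learn regions receive more synthetic
    samples.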

    References
    ----------
    .. [1] He, Haibo, Yang Bai, Edwardo A. Garcia, and Shutao Li. "ADASYN:
       Adaptive synthetic sampling approach for imbalanced learning," In IEEE
       International Joint Conference on Neural Networks (IEEE World Congress
       on Computational Intelligence), pp. 1322-1328, 2008.

    Examples
    --------
    >>> from collections import Counter
    >>> from sklearn.datasets import make_classification
    >>> from imblearn.over_sampling import ADASYN
    >>> X, y = make_classification(n_classes=2, class_sep=2,
    ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0,
    ... n_features=20, n_clusters_per_class=1, n_samples=1000,
    ... random_state=10)
    >>> print('Original dataset shape %s' % Counter(y))
    Original dataset shape Counter({{1: 900, 0: 100}})
    >>> ada = ADASYN(random_state=42)
    >>> X_res, y_res = ada.fit_resample(X, y)
    >>> print('Resampled dataset shape %s' % Counter(y_res))
    Resampled dataset shape Counter({{0: 904, 1: 900}})
    """

    _parameter_constraints: dict = {
        **BaseOverSampler._parameter_constraints,
        "n_neighbors": [
            Interval(numbers.Integral, 1, None, closed="left"),
            HasMethods(["kneighbors", "kneighbors_graph"]),
        ],
        "n_jobs": [numbers.Integral, None],
    }

    def __init__(
        self,
        *,
        sampling_strategy="auto",
        random_state=None,
        n_neighbors=5,
        n_jobs=None,
    ):
        super().__init__(sampling_strategy=sampling_strategy)
        self.random_state = random_state
        self.n_neighbors = n_neighbors
        self.n_jobs = n_jobs

    def _validate_estimator(self):
        """Create the necessary objects for ADASYN."""
        self.nn_ = check_neighbors_object(
            "n_neighbors", self.n_neighbors, additional_neighbor=1
        )
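        # `additional_neighbor=1` requests one extra neighbor because
        # `kneighbors` on the training set returns each sample as its own
        # first neighbor; the extra slot keeps `n_neighbors` true neighbors
        # after that column is dropped in `_fit_resample`.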

    def _fit_resample(self, X, y):
        # FIXME: to be removed in 0.12
        if self.n_jobs is not None:
            warnings.warn(
                (
                    "The parameter `n_jobs` has been deprecated in 0.10 and will"
                    " be removed in 0.12. You can pass a nearest neighbors"
                    " estimator where `n_jobs` is already set instead."
                ),
                FutureWarning,
            )

        self._validate_estimator()
        random_state = check_random_state(self.random_state)

        X_resampled = [X.copy()]
        y_resampled = [y.copy()]

        for class_sample, n_samples in self.sampling_strategy_.items():
            if n_samples == 0:
                continue
            target_class_indices = np.flatnonzero(y == class_sample)
            X_class = _safe_indexing(X, target_class_indices)

            self.nn_.fit(X)
            nns = self.nn_.kneighbors(X_class, return_distance=False)[:, 1:]
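            # The estimator is fitted on the full dataset, so the first
            # returned neighbor of each minority sample is the sample itself;
            # the `[:, 1:]` slice above drops it, keeping the true neighbors
            # used to estimate the local class distribution.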
            # The ratio is computed in a one-vs-rest manner. Using the
            # majority class in a multi-class setting would lead to slightly
            # different results at the cost of introducing a new parameter.
            n_neighbors = self.nn_.n_neighbors - 1
            ratio_nn = np.sum(y[nns] != class_sample, axis=1) / n_neighbors
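            # `ratio_nn` now holds r_i = delta_i / k from [1]: the fraction of
            # each minority sample's k neighbors that belong to another class.
            # Samples in harder-to-learn regions get larger ratios.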
            if not np.sum(ratio_nn):
                raise RuntimeError(
                    "No neighbours belong to the majority class. This case"
                    " would induce a division by zero and produce NaN values."
                    " ADASYN is not suited for this specific dataset."
                    " Use SMOTE instead."
                )
            ratio_nn /= np.sum(ratio_nn)
            n_samples_generate = np.rint(ratio_nn * n_samples).astype(int)
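            # `n_samples_generate` now holds g_i = r_hat_i * G from [1]: each
            # minority sample's share of the G synthetic samples requested for
            # this class, rounded to the nearest integer.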
            # Rounding can change the total number of samples to generate, so
            # recompute it from the per-sample counts.
            n_samples = np.sum(n_samples_generate)
            if not n_samples:
                raise ValueError(
                    "No samples will be generated with the provided ratio settings."
                )

            # The nearest neighbors need to be fitted only on the current class
            # to find the same-class neighbors used to generate new samples.
            self.nn_.fit(X_class)
            nns = self.nn_.kneighbors(X_class, return_distance=False)[:, 1:]

            enumerated_class_indices = np.arange(len(target_class_indices))
            rows = np.repeat(enumerated_class_indices, n_samples_generate)
            cols = random_state.choice(n_neighbors, size=n_samples)
            diffs = X_class[nns[rows, cols]] - X_class[rows]
            steps = random_state.uniform(size=(n_samples, 1))
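            # SMOTE-style interpolation: each synthetic sample lies on the
            # segment between a minority sample and one of its same-class
            # neighbors, x_new = x_i + step * (x_neighbor - x_i), with step
            # drawn uniformly from [0, 1).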
            if sparse.issparse(X):
                sparse_func = type(X).__name__
                steps = getattr(sparse, sparse_func)(steps)
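                # Wrapping the dense step sizes in the same sparse container
                # as X lets the element-wise `multiply` below operate against
                # the sparse `diffs` and return a sparse result.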
                X_new = X_class[rows] + steps.multiply(diffs)
            else:
                X_new = X_class[rows] + steps * diffs
            X_new = X_new.astype(X.dtype)
            y_new = np.full(n_samples, fill_value=class_sample, dtype=y.dtype)
            X_resampled.append(X_new)
            y_resampled.append(y_new)

        if sparse.issparse(X):
            X_resampled = sparse.vstack(X_resampled, format=X.format)
        else:
            X_resampled = np.vstack(X_resampled)
        y_resampled = np.hstack(y_resampled)

        return X_resampled, y_resampled

    def _more_tags(self):
        return {
            "X_types": ["2darray"],
        }