From b3621e77fe33a10e54ebf2b0d1d186fa62a35c75 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 10 Jul 2023 00:04:30 +0200 Subject: [PATCH 1/2] API deprecate estimator_ in favor of estimators_ --- .../_condensed_nearest_neighbour.py | 18 ++++++++++------ .../_one_sided_selection.py | 11 ++++++++-- .../tests/test_condensed_nearest_neighbour.py | 21 +++++++++++++++++++ 3 files changed, 42 insertions(+), 8 deletions(-) diff --git a/imblearn/under_sampling/_prototype_selection/_condensed_nearest_neighbour.py b/imblearn/under_sampling/_prototype_selection/_condensed_nearest_neighbour.py index c80e40fdc..12a348415 100644 --- a/imblearn/under_sampling/_prototype_selection/_condensed_nearest_neighbour.py +++ b/imblearn/under_sampling/_prototype_selection/_condensed_nearest_neighbour.py @@ -59,6 +59,11 @@ class CondensedNearestNeighbour(BaseCleaningSampler): estimator_ : estimator object The validated K-nearest neighbor estimator created from `n_neighbors` parameter. + .. deprecated:: 0.11 + + estimators_ : list of estimator objects of shape (n_resampled_classes - 1,) + Contains the K-nearest neighbor estimator used for per of classes. + sample_indices_ : ndarray of shape (n_new_samples,) Indices of the samples selected. @@ -87,8 +92,8 @@ class CondensedNearestNeighbour(BaseCleaningSampler): ----- The method is based on [1]_. - Supports multi-class resampling. A one-vs.-rest scheme is used when - sampling a class as proposed in [1]_. + Supports multi-class resampling: a strategy one (minority) vs. each other + classes is applied. 
References ---------- @@ -158,6 +163,7 @@ def _fit_resample(self, X, y): class_minority = min(target_stats, key=target_stats.get) idx_under = np.empty((0,), dtype=int) + self.estimators_ = [] for target_class in np.unique(y): if target_class in self.sampling_strategy_.keys(): # Randomly get one sample from the majority class @@ -184,7 +190,7 @@ def _fit_resample(self, X, y): S_y = _safe_indexing(y, S_indices) # fit knn on C - self.estimator_.fit(C_x, C_y) + self.estimators_.append(clone(self.estimator_).fit(C_x, C_y)) good_classif_label = idx_maj_sample.copy() # Check each sample in S if we keep it or drop it @@ -196,7 +202,7 @@ def _fit_resample(self, X, y): # Classify on S if not issparse(x_sam): x_sam = x_sam.reshape(1, -1) - pred_y = self.estimator_.predict(x_sam) + pred_y = self.estimators_[-1].predict(x_sam) # If the prediction do not agree with the true label # append it in C_x @@ -210,12 +216,12 @@ def _fit_resample(self, X, y): C_y = _safe_indexing(y, C_indices) # fit a knn on C - self.estimator_.fit(C_x, C_y) + self.estimators_[-1].fit(C_x, C_y) # This experimental to speed up the search # Classify all the element in S and avoid to test the # well classified elements - pred_S_y = self.estimator_.predict(S_x) + pred_S_y = self.estimators_[-1].predict(S_x) good_classif_label = np.unique( np.append(idx_maj_sample, np.flatnonzero(pred_S_y == S_y)) ) diff --git a/imblearn/under_sampling/_prototype_selection/_one_sided_selection.py b/imblearn/under_sampling/_prototype_selection/_one_sided_selection.py index 42e9a6edd..1a7b639c3 100644 --- a/imblearn/under_sampling/_prototype_selection/_one_sided_selection.py +++ b/imblearn/under_sampling/_prototype_selection/_one_sided_selection.py @@ -58,6 +58,12 @@ class OneSidedSelection(BaseCleaningSampler): estimator_ : estimator object Validated K-nearest neighbors estimator created from parameter `n_neighbors`. + .. 
deprecated:: 0.11 + Should be remove + + estimators_ : list of estimator objects of shape (n_resampled_classes - 1,) + Contains the K-nearest neighbor estimator used for per of classes. + sample_indices_ : ndarray of shape (n_new_samples,) Indices of the samples selected. @@ -155,6 +161,7 @@ def _fit_resample(self, X, y): idx_under = np.empty((0,), dtype=int) + self.estimators_ = [] for target_class in np.unique(y): if target_class in self.sampling_strategy_.keys(): # select a sample from the current class @@ -177,8 +184,8 @@ def _fit_resample(self, X, y): idx_maj_extracted = np.delete(idx_maj, sel_idx_maj, axis=0) S_x = _safe_indexing(X, idx_maj_extracted) S_y = _safe_indexing(y, idx_maj_extracted) - self.estimator_.fit(C_x, C_y) - pred_S_y = self.estimator_.predict(S_x) + self.estimators_.append(clone(self.estimator_).fit(C_x, C_y)) + pred_S_y = self.estimators_[-1].predict(S_x) S_misclassified_indices = np.flatnonzero(pred_S_y != S_y) idx_tmp = idx_maj_extracted[S_misclassified_indices] diff --git a/imblearn/under_sampling/_prototype_selection/tests/test_condensed_nearest_neighbour.py b/imblearn/under_sampling/_prototype_selection/tests/test_condensed_nearest_neighbour.py index 5b41b8777..3d0dcdf7d 100644 --- a/imblearn/under_sampling/_prototype_selection/tests/test_condensed_nearest_neighbour.py +++ b/imblearn/under_sampling/_prototype_selection/tests/test_condensed_nearest_neighbour.py @@ -5,6 +5,7 @@ import numpy as np import pytest +from sklearn.datasets import make_classification from sklearn.neighbors import KNeighborsClassifier from sklearn.utils._testing import assert_array_equal @@ -95,3 +96,23 @@ def test_cnn_fit_resample_with_object(n_neighbors): X_resampled, y_resampled = cnn.fit_resample(X, Y) assert_array_equal(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt) + + +def test_condensed_nearest_neighbour_multiclass(): + X, y = make_classification( + n_samples=1_000, + n_classes=4, + weights=[0.1, 0.2, 0.2, 0.5], + n_clusters_per_class=1, + 
random_state=0, + ) + cnn = CondensedNearestNeighbour(random_state=RND_SEED) + cnn.fit_resample(X, y) + + assert len(cnn.estimators_) == len(cnn.sampling_strategy_) + other_classes = [] + for est in cnn.estimators_: + assert est.classes_[0] == 0 # minority class + assert est.classes_[1] in {1, 2, 3} # other classes + other_classes.append(est.classes_[1]) + assert len(set(other_classes)) == len(other_classes) From bc26060c6dc5ce4ccfe459d9062daa187b3adcab Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Mon, 10 Jul 2023 00:23:29 +0200 Subject: [PATCH 2/2] iter --- doc/whats_new/v0.12.rst | 10 +++++- .../_condensed_nearest_neighbour.py | 30 +++++++++++++---- .../_one_sided_selection.py | 31 ++++++++++++++---- .../tests/test_condensed_nearest_neighbour.py | 11 +++++++ .../tests/test_one_sided_selection.py | 32 +++++++++++++++++++ 5 files changed, 100 insertions(+), 14 deletions(-) diff --git a/doc/whats_new/v0.12.rst b/doc/whats_new/v0.12.rst index 8b6864e78..1063ffb05 100644 --- a/doc/whats_new/v0.12.rst +++ b/doc/whats_new/v0.12.rst @@ -3,6 +3,14 @@ Version 0.12.0 (Under development) ================================== - Changelog --------- + +Deprecations +............ + +- Deprecate `estimator_` attribute in favor of `estimators_` for the classes + :class:`~imblearn.under_sampling.CondensedNearestNeighbour` and + :class:`~imblearn.under_sampling.OneSidedSelection`. `estimator_` will be removed + in 0.14. + :pr:`xxx` by :user:`Guillaume Lemaitre <glemaitre>`. 
diff --git a/imblearn/under_sampling/_prototype_selection/_condensed_nearest_neighbour.py b/imblearn/under_sampling/_prototype_selection/_condensed_nearest_neighbour.py index 12a348415..3b812e620 100644 --- a/imblearn/under_sampling/_prototype_selection/_condensed_nearest_neighbour.py +++ b/imblearn/under_sampling/_prototype_selection/_condensed_nearest_neighbour.py @@ -6,6 +6,7 @@ # License: MIT import numbers +import warnings from collections import Counter import numpy as np @@ -59,11 +60,16 @@ class CondensedNearestNeighbour(BaseCleaningSampler): estimator_ : estimator object The validated K-nearest neighbor estimator created from `n_neighbors` parameter. - .. deprecated:: 0.11 + .. deprecated:: 0.12 + `estimator_` is deprecated in 0.12 and will be removed in 0.14. Use + `estimators_` instead, which contains the list of all K-nearest + neighbors estimators used for each pair of classes. estimators_ : list of estimator objects of shape (n_resampled_classes - 1,) Contains the K-nearest neighbor estimator used for per of classes. + .. versionadded:: 0.12 + sample_indices_ : ndarray of shape (n_new_samples,) Indices of the samples selected. 
@@ -147,16 +153,18 @@ def __init__( def _validate_estimator(self): """Private function to create the NN estimator""" if self.n_neighbors is None: - self.estimator_ = KNeighborsClassifier(n_neighbors=1, n_jobs=self.n_jobs) + estimator = KNeighborsClassifier(n_neighbors=1, n_jobs=self.n_jobs) elif isinstance(self.n_neighbors, numbers.Integral): - self.estimator_ = KNeighborsClassifier( + estimator = KNeighborsClassifier( n_neighbors=self.n_neighbors, n_jobs=self.n_jobs ) elif isinstance(self.n_neighbors, KNeighborsClassifier): - self.estimator_ = clone(self.n_neighbors) + estimator = clone(self.n_neighbors) + + return estimator def _fit_resample(self, X, y): - self._validate_estimator() + estimator = self._validate_estimator() random_state = check_random_state(self.random_state) target_stats = Counter(y) @@ -190,7 +198,7 @@ def _fit_resample(self, X, y): S_y = _safe_indexing(y, S_indices) # fit knn on C - self.estimators_.append(clone(self.estimator_).fit(C_x, C_y)) + self.estimators_.append(clone(estimator).fit(C_x, C_y)) good_classif_label = idx_maj_sample.copy() # Check each sample in S if we keep it or drop it @@ -236,5 +244,15 @@ def _fit_resample(self, X, y): return _safe_indexing(X, idx_under), _safe_indexing(y, idx_under) + @property + def estimator_(self): + """Last fitted k-NN estimator.""" + warnings.warn( + "`estimator_` attribute has been deprecated in 0.12 and will be " + "removed in 0.14. 
Use `estimators_` instead.", + FutureWarning, + ) + return self.estimators_[-1] + def _more_tags(self): return {"sample_indices": True} diff --git a/imblearn/under_sampling/_prototype_selection/_one_sided_selection.py b/imblearn/under_sampling/_prototype_selection/_one_sided_selection.py index 1a7b639c3..e0e5b4111 100644 --- a/imblearn/under_sampling/_prototype_selection/_one_sided_selection.py +++ b/imblearn/under_sampling/_prototype_selection/_one_sided_selection.py @@ -5,6 +5,7 @@ # License: MIT import numbers +import warnings from collections import Counter import numpy as np @@ -58,12 +59,16 @@ class OneSidedSelection(BaseCleaningSampler): estimator_ : estimator object Validated K-nearest neighbors estimator created from parameter `n_neighbors`. - .. deprecated:: 0.11 - Should be remove + .. deprecated:: 0.12 + `estimator_` is deprecated in 0.12 and will be removed in 0.14. Use + `estimators_` instead, which contains the list of all K-nearest + neighbors estimators used for each pair of classes. estimators_ : list of estimator objects of shape (n_resampled_classes - 1,) Contains the K-nearest neighbor estimator used for per of classes. + .. versionadded:: 0.12 + sample_indices_ : ndarray of shape (n_new_samples,) Indices of the samples selected. 
@@ -144,16 +149,18 @@ def __init__( def _validate_estimator(self): """Private function to create the NN estimator""" if self.n_neighbors is None: - self.estimator_ = KNeighborsClassifier(n_neighbors=1, n_jobs=self.n_jobs) + estimator = KNeighborsClassifier(n_neighbors=1, n_jobs=self.n_jobs) elif isinstance(self.n_neighbors, int): - self.estimator_ = KNeighborsClassifier( + estimator = KNeighborsClassifier( n_neighbors=self.n_neighbors, n_jobs=self.n_jobs ) elif isinstance(self.n_neighbors, KNeighborsClassifier): - self.estimator_ = clone(self.n_neighbors) + estimator = clone(self.n_neighbors) + + return estimator def _fit_resample(self, X, y): - self._validate_estimator() + estimator = self._validate_estimator() random_state = check_random_state(self.random_state) target_stats = Counter(y) @@ -184,7 +191,7 @@ def _fit_resample(self, X, y): idx_maj_extracted = np.delete(idx_maj, sel_idx_maj, axis=0) S_x = _safe_indexing(X, idx_maj_extracted) S_y = _safe_indexing(y, idx_maj_extracted) - self.estimators_.append(clone(self.estimator_).fit(C_x, C_y)) + self.estimators_.append(clone(estimator).fit(C_x, C_y)) pred_S_y = self.estimators_[-1].predict(S_x) S_misclassified_indices = np.flatnonzero(pred_S_y != S_y) @@ -206,5 +213,15 @@ def _fit_resample(self, X, y): return X_cleaned, y_cleaned + @property + def estimator_(self): + """Last fitted k-NN estimator.""" + warnings.warn( + "`estimator_` attribute has been deprecated in 0.12 and will be " + "removed in 0.14. 
Use `estimators_` instead.", + FutureWarning, + ) + return self.estimators_[-1] + def _more_tags(self): return {"sample_indices": True} diff --git a/imblearn/under_sampling/_prototype_selection/tests/test_condensed_nearest_neighbour.py b/imblearn/under_sampling/_prototype_selection/tests/test_condensed_nearest_neighbour.py index 3d0dcdf7d..5cc8f4162 100644 --- a/imblearn/under_sampling/_prototype_selection/tests/test_condensed_nearest_neighbour.py +++ b/imblearn/under_sampling/_prototype_selection/tests/test_condensed_nearest_neighbour.py @@ -99,6 +99,7 @@ def test_cnn_fit_resample_with_object(n_neighbors): def test_condensed_nearest_neighbour_multiclass(): + """Check the validity of the fitted attributes `estimators_`.""" X, y = make_classification( n_samples=1_000, n_classes=4, @@ -116,3 +117,13 @@ def test_condensed_nearest_neighbour_multiclass(): assert est.classes_[1] in {1, 2, 3} # other classes other_classes.append(est.classes_[1]) assert len(set(other_classes)) == len(other_classes) + + +# TODO: remove in 0.14 +def test_condensed_nearest_neighbors_deprecation(): + """Check that we raise a FutureWarning when accessing the parameter `estimator_`.""" + cnn = CondensedNearestNeighbour(random_state=RND_SEED) + cnn.fit_resample(X, Y) + warn_msg = "`estimator_` attribute has been deprecated" + with pytest.warns(FutureWarning, match=warn_msg): + cnn.estimator_ diff --git a/imblearn/under_sampling/_prototype_selection/tests/test_one_sided_selection.py b/imblearn/under_sampling/_prototype_selection/tests/test_one_sided_selection.py index 7d3adde0f..3fb5458c4 100644 --- a/imblearn/under_sampling/_prototype_selection/tests/test_one_sided_selection.py +++ b/imblearn/under_sampling/_prototype_selection/tests/test_one_sided_selection.py @@ -5,6 +5,7 @@ import numpy as np import pytest +from sklearn.datasets import make_classification from sklearn.neighbors import KNeighborsClassifier from sklearn.utils._testing import assert_array_equal @@ -95,3 +96,34 @@ def 
test_oss_with_object(n_neighbors): X_resampled, y_resampled = oss.fit_resample(X, Y) assert_array_equal(X_resampled, X_gt) assert_array_equal(y_resampled, y_gt) + + +def test_one_sided_selection_multiclass(): + """Check the validity of the fitted attributes `estimators_`.""" + X, y = make_classification( + n_samples=1_000, + n_classes=4, + weights=[0.1, 0.2, 0.2, 0.5], + n_clusters_per_class=1, + random_state=0, + ) + oss = OneSidedSelection(random_state=RND_SEED) + oss.fit_resample(X, y) + + assert len(oss.estimators_) == len(oss.sampling_strategy_) + other_classes = [] + for est in oss.estimators_: + assert est.classes_[0] == 0 # minority class + assert est.classes_[1] in {1, 2, 3} # other classes + other_classes.append(est.classes_[1]) + assert len(set(other_classes)) == len(other_classes) + + +# TODO: remove in 0.14 +def test_one_sided_selection_deprecation(): + """Check that we raise a FutureWarning when accessing the parameter `estimator_`.""" + oss = OneSidedSelection(random_state=RND_SEED) + oss.fit_resample(X, Y) + warn_msg = "`estimator_` attribute has been deprecated" + with pytest.warns(FutureWarning, match=warn_msg): + oss.estimator_