From d17b6b5f032fbde78047450450a4f7f32d3239ab Mon Sep 17 00:00:00 2001 From: sft-managed Date: Thu, 2 Sep 2021 15:24:40 +0000 Subject: [PATCH 01/50] add duck-type check for KNeighbors-likeness --- imblearn/utils/_validation.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/imblearn/utils/_validation.py b/imblearn/utils/_validation.py index 7eb4099ea..53c2f73f3 100644 --- a/imblearn/utils/_validation.py +++ b/imblearn/utils/_validation.py @@ -95,6 +95,8 @@ def check_neighbors_object(nn_name, nn_object, additional_neighbor=0): return NearestNeighbors(n_neighbors=nn_object + additional_neighbor) elif isinstance(nn_object, KNeighborsMixin): return clone(nn_object) + elif hasattr(nn_object, 'kneighbors') and hasattr(nn_object, 'kneighbors_graph'): + return clone(nn_object) else: raise_isinstance_error(nn_name, [int, KNeighborsMixin], nn_object) From 379ea7e5466ee73d04da5b353d09770a880a1f31 Mon Sep 17 00:00:00 2001 From: sft-managed Date: Thu, 2 Sep 2021 15:32:43 +0000 Subject: [PATCH 02/50] removal ofKNeighborsMixin type check --- imblearn/utils/_validation.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/imblearn/utils/_validation.py b/imblearn/utils/_validation.py index 53c2f73f3..b20491dd1 100644 --- a/imblearn/utils/_validation.py +++ b/imblearn/utils/_validation.py @@ -93,8 +93,6 @@ def check_neighbors_object(nn_name, nn_object, additional_neighbor=0): """ if isinstance(nn_object, Integral): return NearestNeighbors(n_neighbors=nn_object + additional_neighbor) - elif isinstance(nn_object, KNeighborsMixin): - return clone(nn_object) elif hasattr(nn_object, 'kneighbors') and hasattr(nn_object, 'kneighbors_graph'): return clone(nn_object) else: From 8790628ed3f496f70b5a69dec26558be96f8e766 Mon Sep 17 00:00:00 2001 From: sft-managed Date: Thu, 9 Sep 2021 18:55:44 +0000 Subject: [PATCH 03/50] Added _is_neighbors_object() private validation function --- imblearn/utils/_validation.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git 
a/imblearn/utils/_validation.py b/imblearn/utils/_validation.py index b20491dd1..c207afdc8 100644 --- a/imblearn/utils/_validation.py +++ b/imblearn/utils/_validation.py @@ -66,6 +66,12 @@ def _transfrom_one(self, array, props): ret = array return ret +def _is_neighbors_object(kneighbors_estimator): + neighbors_attributes = [ + "kneighbors", + "kneighbors_graph" + ] + return all(hasattr(kneighbors_estimator, attr) for attr in neighbors_attributes) def check_neighbors_object(nn_name, nn_object, additional_neighbor=0): """Check the objects is consistent to be a NN. @@ -93,7 +99,7 @@ def check_neighbors_object(nn_name, nn_object, additional_neighbor=0): """ if isinstance(nn_object, Integral): return NearestNeighbors(n_neighbors=nn_object + additional_neighbor) - elif hasattr(nn_object, 'kneighbors') and hasattr(nn_object, 'kneighbors_graph'): + elif _is_neighbors_object(nn_object): return clone(nn_object) else: raise_isinstance_error(nn_name, [int, KNeighborsMixin], nn_object) From e997e238ec2cd77365514c05f56d1211786896e7 Mon Sep 17 00:00:00 2001 From: sft-managed Date: Thu, 9 Sep 2021 18:57:53 +0000 Subject: [PATCH 04/50] Addded pep8lank lines --- imblearn/utils/_validation.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/imblearn/utils/_validation.py b/imblearn/utils/_validation.py index c207afdc8..217bdc052 100644 --- a/imblearn/utils/_validation.py +++ b/imblearn/utils/_validation.py @@ -66,6 +66,7 @@ def _transfrom_one(self, array, props): ret = array return ret + def _is_neighbors_object(kneighbors_estimator): neighbors_attributes = [ "kneighbors", @@ -73,6 +74,7 @@ def _is_neighbors_object(kneighbors_estimator): ] return all(hasattr(kneighbors_estimator, attr) for attr in neighbors_attributes) + def check_neighbors_object(nn_name, nn_object, additional_neighbor=0): """Check the objects is consistent to be a NN. 
From 94b072591aa23cf59db9dbbaae853e7091a13053 Mon Sep 17 00:00:00 2001 From: sft-managed Date: Fri, 10 Sep 2021 16:36:44 +0000 Subject: [PATCH 05/50] change isinstance check for SVM estimator to simply clone the estimator - similar to _validate_estimator() in KMeansSMOTE --- imblearn/over_sampling/_smote/filter.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/imblearn/over_sampling/_smote/filter.py b/imblearn/over_sampling/_smote/filter.py index 7a37b2c17..7d83f72e8 100644 --- a/imblearn/over_sampling/_smote/filter.py +++ b/imblearn/over_sampling/_smote/filter.py @@ -345,10 +345,8 @@ def _validate_estimator(self): if self.svm_estimator is None: self.svm_estimator_ = SVC(gamma="scale", random_state=self.random_state) - elif isinstance(self.svm_estimator, SVC): - self.svm_estimator_ = clone(self.svm_estimator) else: - raise_isinstance_error("svm_estimator", [SVC], self.svm_estimator) + self.svm_estimator_ = clone(self.svm_estimator) def _fit_resample(self, X, y): self._validate_estimator() From 9fbf360726fc8675d4438946a875baf7a5324502 Mon Sep 17 00:00:00 2001 From: sft-managed Date: Mon, 13 Sep 2021 17:38:59 +0000 Subject: [PATCH 06/50] remove explicit class-check for KMeans estimator --- .../_prototype_generation/_cluster_centroids.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/imblearn/under_sampling/_prototype_generation/_cluster_centroids.py b/imblearn/under_sampling/_prototype_generation/_cluster_centroids.py index 99a8e470f..e39dae387 100644 --- a/imblearn/under_sampling/_prototype_generation/_cluster_centroids.py +++ b/imblearn/under_sampling/_prototype_generation/_cluster_centroids.py @@ -123,13 +123,8 @@ def _validate_estimator(self): ) if self.estimator is None: self.estimator_ = KMeans(random_state=self.random_state) - elif isinstance(self.estimator, KMeans): - self.estimator_ = clone(self.estimator) else: - raise ValueError( - f"`estimator` has to be a KMeans clustering." 
- f" Got {type(self.estimator)} instead." - ) + self.estimator_ = clone(self.estimator) def _generate_sample(self, X, y, centroids, target_class): if self.voting_ == "hard": From f73687988dc65e771132ca1519337421db7737ff Mon Sep 17 00:00:00 2001 From: sft-managed Date: Mon, 13 Sep 2021 17:41:08 +0000 Subject: [PATCH 07/50] remove explicit class check for KNeighborsClassifier --- .../_prototype_selection/_one_sided_selection.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/imblearn/under_sampling/_prototype_selection/_one_sided_selection.py b/imblearn/under_sampling/_prototype_selection/_one_sided_selection.py index 305abec0b..dadbf001b 100644 --- a/imblearn/under_sampling/_prototype_selection/_one_sided_selection.py +++ b/imblearn/under_sampling/_prototype_selection/_one_sided_selection.py @@ -115,14 +115,9 @@ def _validate_estimator(self): self.estimator_ = KNeighborsClassifier( n_neighbors=self.n_neighbors, n_jobs=self.n_jobs ) - elif isinstance(self.n_neighbors, KNeighborsClassifier): - self.estimator_ = clone(self.n_neighbors) else: - raise ValueError( - f"`n_neighbors` has to be a int or an object" - f" inherited from KNeighborsClassifier." - f" Got {type(self.n_neighbors)} instead." 
- ) + self.estimator_ = clone(self.n_neighbors) + def _fit_resample(self, X, y): self._validate_estimator() From fcb118ead6a8bc9a51d14d2e759c418486ac08d5 Mon Sep 17 00:00:00 2001 From: sft-managed Date: Mon, 13 Sep 2021 19:14:16 +0000 Subject: [PATCH 08/50] remove explicit class check for KNeighborsClassifier in CondensedNearestNeighbour --- .../_prototype_selection/_condensed_nearest_neighbour.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/imblearn/under_sampling/_prototype_selection/_condensed_nearest_neighbour.py b/imblearn/under_sampling/_prototype_selection/_condensed_nearest_neighbour.py index 738110cae..5d6391abd 100644 --- a/imblearn/under_sampling/_prototype_selection/_condensed_nearest_neighbour.py +++ b/imblearn/under_sampling/_prototype_selection/_condensed_nearest_neighbour.py @@ -119,14 +119,8 @@ def _validate_estimator(self): self.estimator_ = KNeighborsClassifier( n_neighbors=self.n_neighbors, n_jobs=self.n_jobs ) - elif isinstance(self.n_neighbors, KNeighborsClassifier): - self.estimator_ = clone(self.n_neighbors) else: - raise ValueError( - f"`n_neighbors` has to be a int or an object" - f" inhereited from KNeighborsClassifier." - f" Got {type(self.n_neighbors)} instead." 
- ) + self.estimator_ = clone(self.n_neighbors) def _fit_resample(self, X, y): self._validate_estimator() From a4e959cce853b178e36201bafafa709914e16e5f Mon Sep 17 00:00:00 2001 From: sft-managed Date: Mon, 13 Sep 2021 19:16:56 +0000 Subject: [PATCH 09/50] remove explicit class check for ClassifierMixin in InstanceHardnessThreshold --- .../_prototype_selection/_instance_hardness_threshold.py | 1 - 1 file changed, 1 deletion(-) diff --git a/imblearn/under_sampling/_prototype_selection/_instance_hardness_threshold.py b/imblearn/under_sampling/_prototype_selection/_instance_hardness_threshold.py index 9b82215ec..22a0df30c 100644 --- a/imblearn/under_sampling/_prototype_selection/_instance_hardness_threshold.py +++ b/imblearn/under_sampling/_prototype_selection/_instance_hardness_threshold.py @@ -118,7 +118,6 @@ def _validate_estimator(self, random_state): if ( self.estimator is not None - and isinstance(self.estimator, ClassifierMixin) and hasattr(self.estimator, "predict_proba") ): self.estimator_ = clone(self.estimator) From 65ae4fd965aff274033422b026f8c7c98ae661e3 Mon Sep 17 00:00:00 2001 From: sft-managed Date: Mon, 13 Sep 2021 19:33:39 +0000 Subject: [PATCH 10/50] PEP 8 issue fix --- imblearn/over_sampling/_smote/filter.py | 1 - .../_prototype_selection/_instance_hardness_threshold.py | 2 +- .../under_sampling/_prototype_selection/_one_sided_selection.py | 1 - 3 files changed, 1 insertion(+), 3 deletions(-) diff --git a/imblearn/over_sampling/_smote/filter.py b/imblearn/over_sampling/_smote/filter.py index 7d83f72e8..3f4d41c93 100644 --- a/imblearn/over_sampling/_smote/filter.py +++ b/imblearn/over_sampling/_smote/filter.py @@ -15,7 +15,6 @@ from sklearn.utils import _safe_indexing from ..base import BaseOverSampler -from ...exceptions import raise_isinstance_error from ...utils import check_neighbors_object from ...utils import Substitution from ...utils._docstring import _n_jobs_docstring diff --git 
a/imblearn/under_sampling/_prototype_selection/_instance_hardness_threshold.py b/imblearn/under_sampling/_prototype_selection/_instance_hardness_threshold.py index 22a0df30c..80fc84fbe 100644 --- a/imblearn/under_sampling/_prototype_selection/_instance_hardness_threshold.py +++ b/imblearn/under_sampling/_prototype_selection/_instance_hardness_threshold.py @@ -10,7 +10,7 @@ import numpy as np -from sklearn.base import ClassifierMixin, clone +from sklearn.base import clone from sklearn.ensemble import RandomForestClassifier from sklearn.ensemble._base import _set_random_states from sklearn.model_selection import StratifiedKFold diff --git a/imblearn/under_sampling/_prototype_selection/_one_sided_selection.py b/imblearn/under_sampling/_prototype_selection/_one_sided_selection.py index dadbf001b..174ac9b7c 100644 --- a/imblearn/under_sampling/_prototype_selection/_one_sided_selection.py +++ b/imblearn/under_sampling/_prototype_selection/_one_sided_selection.py @@ -118,7 +118,6 @@ def _validate_estimator(self): else: self.estimator_ = clone(self.n_neighbors) - def _fit_resample(self, X, y): self._validate_estimator() From 5b76d49b395039406fbb2391ff0c557324d84df9 Mon Sep 17 00:00:00 2001 From: sft-managed Date: Mon, 13 Sep 2021 19:36:01 +0000 Subject: [PATCH 11/50] PEP 8 issue fix - line break before operator --- .../_prototype_selection/_instance_hardness_threshold.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/imblearn/under_sampling/_prototype_selection/_instance_hardness_threshold.py b/imblearn/under_sampling/_prototype_selection/_instance_hardness_threshold.py index 80fc84fbe..8da46c2ea 100644 --- a/imblearn/under_sampling/_prototype_selection/_instance_hardness_threshold.py +++ b/imblearn/under_sampling/_prototype_selection/_instance_hardness_threshold.py @@ -117,8 +117,8 @@ def _validate_estimator(self, random_state): """Private function to create the classifier""" if ( - self.estimator is not None - and hasattr(self.estimator, 
"predict_proba") + self.estimator is not None and + hasattr(self.estimator, "predict_proba") ): self.estimator_ = clone(self.estimator) _set_random_states(self.estimator_, random_state) From 8284b70b866d5b6ab4dcaec2750a359b7fab89a8 Mon Sep 17 00:00:00 2001 From: sft-managed Date: Mon, 13 Sep 2021 19:37:24 +0000 Subject: [PATCH 12/50] PEP 8 issue fix - no more line break before operator --- .../_prototype_selection/_instance_hardness_threshold.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/imblearn/under_sampling/_prototype_selection/_instance_hardness_threshold.py b/imblearn/under_sampling/_prototype_selection/_instance_hardness_threshold.py index 8da46c2ea..2e6a58852 100644 --- a/imblearn/under_sampling/_prototype_selection/_instance_hardness_threshold.py +++ b/imblearn/under_sampling/_prototype_selection/_instance_hardness_threshold.py @@ -116,10 +116,7 @@ def __init__( def _validate_estimator(self, random_state): """Private function to create the classifier""" - if ( - self.estimator is not None and - hasattr(self.estimator, "predict_proba") - ): + if self.estimator is not None and hasattr(self.estimator, "predict_proba"): self.estimator_ = clone(self.estimator) _set_random_states(self.estimator_, random_state) From e97ae36c41f08a33a5dfbe1fe2df64d7f2f9e84b Mon Sep 17 00:00:00 2001 From: sft-managed Date: Wed, 15 Sep 2021 17:10:30 +0000 Subject: [PATCH 13/50] Undo changes to _instance_hardness_threshold --- .../_prototype_selection/_instance_hardness_threshold.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/imblearn/under_sampling/_prototype_selection/_instance_hardness_threshold.py b/imblearn/under_sampling/_prototype_selection/_instance_hardness_threshold.py index 2e6a58852..9b82215ec 100644 --- a/imblearn/under_sampling/_prototype_selection/_instance_hardness_threshold.py +++ b/imblearn/under_sampling/_prototype_selection/_instance_hardness_threshold.py @@ -10,7 +10,7 @@ import numpy as np -from 
sklearn.base import clone +from sklearn.base import ClassifierMixin, clone from sklearn.ensemble import RandomForestClassifier from sklearn.ensemble._base import _set_random_states from sklearn.model_selection import StratifiedKFold @@ -116,7 +116,11 @@ def __init__( def _validate_estimator(self, random_state): """Private function to create the classifier""" - if self.estimator is not None and hasattr(self.estimator, "predict_proba"): + if ( + self.estimator is not None + and isinstance(self.estimator, ClassifierMixin) + and hasattr(self.estimator, "predict_proba") + ): self.estimator_ = clone(self.estimator) _set_random_states(self.estimator_, random_state) From 495ec2781f774e2017db0d9c01474c467c213fc9 Mon Sep 17 00:00:00 2001 From: sft-managed Date: Thu, 16 Sep 2021 18:41:22 +0000 Subject: [PATCH 14/50] revert OneSidedSelection changes --- .../_prototype_selection/_one_sided_selection.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/imblearn/under_sampling/_prototype_selection/_one_sided_selection.py b/imblearn/under_sampling/_prototype_selection/_one_sided_selection.py index 174ac9b7c..305abec0b 100644 --- a/imblearn/under_sampling/_prototype_selection/_one_sided_selection.py +++ b/imblearn/under_sampling/_prototype_selection/_one_sided_selection.py @@ -115,8 +115,14 @@ def _validate_estimator(self): self.estimator_ = KNeighborsClassifier( n_neighbors=self.n_neighbors, n_jobs=self.n_jobs ) - else: + elif isinstance(self.n_neighbors, KNeighborsClassifier): self.estimator_ = clone(self.n_neighbors) + else: + raise ValueError( + f"`n_neighbors` has to be a int or an object" + f" inherited from KNeighborsClassifier." + f" Got {type(self.n_neighbors)} instead." 
+ ) def _fit_resample(self, X, y): self._validate_estimator() From 10456f5a2cf776a0247318d05ec5d18c9e29cb36 Mon Sep 17 00:00:00 2001 From: sft-managed Date: Fri, 24 Sep 2021 19:06:51 +0000 Subject: [PATCH 15/50] Undo changes to CondensedNearestNeighbour --- .../_prototype_selection/_condensed_nearest_neighbour.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/imblearn/under_sampling/_prototype_selection/_condensed_nearest_neighbour.py b/imblearn/under_sampling/_prototype_selection/_condensed_nearest_neighbour.py index 5d6391abd..738110cae 100644 --- a/imblearn/under_sampling/_prototype_selection/_condensed_nearest_neighbour.py +++ b/imblearn/under_sampling/_prototype_selection/_condensed_nearest_neighbour.py @@ -119,8 +119,14 @@ def _validate_estimator(self): self.estimator_ = KNeighborsClassifier( n_neighbors=self.n_neighbors, n_jobs=self.n_jobs ) - else: + elif isinstance(self.n_neighbors, KNeighborsClassifier): self.estimator_ = clone(self.n_neighbors) + else: + raise ValueError( + f"`n_neighbors` has to be a int or an object" + f" inhereited from KNeighborsClassifier." + f" Got {type(self.n_neighbors)} instead." 
+ ) def _fit_resample(self, X, y): self._validate_estimator() From 93200e1980ea9f0cb273cc2bd6d2b0402ef22f22 Mon Sep 17 00:00:00 2001 From: sft-managed Date: Wed, 29 Sep 2021 19:28:29 +0000 Subject: [PATCH 16/50] example NearestNeighbors test --- .../over_sampling/_smote/tests/test_smote.py | 40 +++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/imblearn/over_sampling/_smote/tests/test_smote.py b/imblearn/over_sampling/_smote/tests/test_smote.py index 3f9ed0b40..97cc0cc85 100644 --- a/imblearn/over_sampling/_smote/tests/test_smote.py +++ b/imblearn/over_sampling/_smote/tests/test_smote.py @@ -164,3 +164,43 @@ def test_smote_m_neighbors(smote): _ = smote.fit_resample(X, Y) assert smote.nn_k_.n_neighbors == 6 assert smote.nn_m_.n_neighbors == 11 + + +def test_sample_cuml_with_nn(): + cuml = pytest.importorskip("cuml") + nn_k = cuml.neighbors.NearestNeighbors(n_neighbors=6) + smote = SMOTE(random_state=RND_SEED, k_neighbors=nn_k) + X_resampled, y_resampled = smote.fit_resample(X, Y) + X_gt = np.array( + [ + [0.11622591, -0.0317206], + [0.77481731, 0.60935141], + [1.25192108, -0.22367336], + [0.53366841, -0.30312976], + [1.52091956, -0.49283504], + [-0.28162401, -2.10400981], + [0.83680821, 1.72827342], + [0.3084254, 0.33299982], + [0.70472253, -0.73309052], + [0.28893132, -0.38761769], + [1.15514042, 0.0129463], + [0.88407872, 0.35454207], + [1.31301027, -0.92648734], + [-1.11515198, -0.93689695], + [-0.18410027, -0.45194484], + [0.9281014, 0.53085498], + [-0.14374509, 0.27370049], + [-0.41635887, -0.38299653], + [0.08711622, 0.93259929], + [1.70580611, -0.11219234], + [0.29307743, -0.14670439], + [0.84976473, -0.15570176], + [0.61319159, -0.11571668], + [0.66052536, -0.28246517], + ] + ) + y_gt = np.array( + [0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0] + ) + assert_allclose(X_resampled, X_gt, rtol=R_TOL) + assert_array_equal(y_resampled, y_gt) \ No newline at end of file From f10405783c7463fe52981d9651072a48a784aa12 Mon Sep 
17 00:00:00 2001 From: sft-managed Date: Wed, 29 Sep 2021 19:55:00 +0000 Subject: [PATCH 17/50] Use sklearn.base.clone to validate NN object and throw error --- imblearn/utils/_validation.py | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/imblearn/utils/_validation.py b/imblearn/utils/_validation.py index 217bdc052..d30a27b60 100644 --- a/imblearn/utils/_validation.py +++ b/imblearn/utils/_validation.py @@ -67,14 +67,6 @@ def _transfrom_one(self, array, props): return ret -def _is_neighbors_object(kneighbors_estimator): - neighbors_attributes = [ - "kneighbors", - "kneighbors_graph" - ] - return all(hasattr(kneighbors_estimator, attr) for attr in neighbors_attributes) - - def check_neighbors_object(nn_name, nn_object, additional_neighbor=0): """Check the objects is consistent to be a NN. @@ -101,10 +93,8 @@ def check_neighbors_object(nn_name, nn_object, additional_neighbor=0): """ if isinstance(nn_object, Integral): return NearestNeighbors(n_neighbors=nn_object + additional_neighbor) - elif _is_neighbors_object(nn_object): - return clone(nn_object) else: - raise_isinstance_error(nn_name, [int, KNeighborsMixin], nn_object) + return clone(nn_object) def _count_class_sample(y): From b82e4d95279b77f310026b24d8429772b05a531a Mon Sep 17 00:00:00 2001 From: sft-managed Date: Wed, 29 Sep 2021 20:18:01 +0000 Subject: [PATCH 18/50] undo last commit, and raise nn_object TypeError --- imblearn/utils/_validation.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/imblearn/utils/_validation.py b/imblearn/utils/_validation.py index d30a27b60..3818327ff 100644 --- a/imblearn/utils/_validation.py +++ b/imblearn/utils/_validation.py @@ -67,6 +67,14 @@ def _transfrom_one(self, array, props): return ret +def _is_neighbors_object(kneighbors_estimator): + neighbors_attributes = [ + "kneighbors", + "kneighbors_graph" + ] + return all(hasattr(kneighbors_estimator, attr) for attr in neighbors_attributes) + + def 
check_neighbors_object(nn_name, nn_object, additional_neighbor=0): """Check the objects is consistent to be a NN. @@ -93,8 +101,10 @@ def check_neighbors_object(nn_name, nn_object, additional_neighbor=0): """ if isinstance(nn_object, Integral): return NearestNeighbors(n_neighbors=nn_object + additional_neighbor) - else: + elif _is_neighbors_object(nn_object): return clone(nn_object) + else: + raise TypeError("nn_object must be NearestNeighbors object or int") def _count_class_sample(y): From 70b677844aeed62cc921cc4d932a9dbcaa628e71 Mon Sep 17 00:00:00 2001 From: sft-managed Date: Wed, 29 Sep 2021 21:08:34 +0000 Subject: [PATCH 19/50] remove unused imports --- imblearn/utils/_validation.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/imblearn/utils/_validation.py b/imblearn/utils/_validation.py index 3818327ff..a842152c8 100644 --- a/imblearn/utils/_validation.py +++ b/imblearn/utils/_validation.py @@ -12,13 +12,10 @@ import numpy as np from sklearn.base import clone -from sklearn.neighbors._base import KNeighborsMixin from sklearn.neighbors import NearestNeighbors from sklearn.utils import column_or_1d from sklearn.utils.multiclass import type_of_target -from ..exceptions import raise_isinstance_error - SAMPLING_KIND = ( "over-sampling", "under-sampling", From c67c775920cbfa9d3c064fef07fbeacd658e6e5f Mon Sep 17 00:00:00 2001 From: sft-managed Date: Mon, 4 Oct 2021 21:22:30 +0000 Subject: [PATCH 20/50] Add test for cuml ADASYN --- imblearn/over_sampling/tests/test_adasyn.py | 45 ++++++++++++++++++++- 1 file changed, 44 insertions(+), 1 deletion(-) diff --git a/imblearn/over_sampling/tests/test_adasyn.py b/imblearn/over_sampling/tests/test_adasyn.py index 819682e2e..3a50daf34 100644 --- a/imblearn/over_sampling/tests/test_adasyn.py +++ b/imblearn/over_sampling/tests/test_adasyn.py @@ -128,10 +128,53 @@ def test_ada_fit_resample_nn_obj(): "adasyn_params, err_msg", [ ({"sampling_strategy": {0: 9, 1: 12}}, "No samples will be generated.",), - ({"n_neighbors": 
"rnd"}, "has to be one of"), + ({"n_neighbors": "rnd"}, "NearestNeighbors object or int"), ], ) def test_adasyn_error(adasyn_params, err_msg): adasyn = ADASYN(**adasyn_params) with pytest.raises(ValueError, match=err_msg): adasyn.fit_resample(X, Y) + + +def test_ada_fit_resample_cuml_nn_obj(): + cuml = pytest.importorskip("cuml") + nn = cuml.neighbors.NearestNeighbors(n_neighbors=2) + ada = ADASYN(random_state=RND_SEED, n_neighbors=nn) + X_resampled, y_resampled = ada.fit_resample(X, Y) + X_gt = np.array( + [ + [ 0.11622591, -0.0317206], + [ 0.77481731, 0.60935141], + [ 1.25192108, -0.22367336], + [ 0.53366841, -0.30312976], + [ 1.52091956, -0.49283504], + [-0.28162401, -2.10400981], + [ 0.83680821, 1.72827342], + [ 0.3084254 , 0.33299982], + [ 0.70472253, -0.73309052], + [ 0.28893132, -0.38761769], + [ 1.15514042, 0.0129463], + [ 0.88407872, 0.35454207], + [ 1.31301027, -0.92648734], + [-1.11515198, -0.93689695], + [-0.18410027, -0.45194484], + [ 0.9281014 , 0.53085498], + [-0.14374509, 0.27370049], + [-0.41635887, -0.38299653], + [ 0.08711622, 0.93259929], + [ 1.70580611, -0.11219234], + [ 0.34532399, -0.18067361], + [ 1.44430593, -0.41617493], + [ 0.28204936, -0.13953426], + [ 1.08450984, 0.03948221], + [-0.19072677, -0.2341768] + ] + ) + + y_gt = np.array( + [0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0] + ) + + assert_allclose(X_resampled, X_gt, rtol=R_TOL) + assert_array_equal(y_resampled, y_gt) \ No newline at end of file From 010f4d5ae16f45869a81855847b8e071bfe7295c Mon Sep 17 00:00:00 2001 From: sft-managed Date: Mon, 4 Oct 2021 21:26:41 +0000 Subject: [PATCH 21/50] Updated check_neighbors_object docstring and error type --- imblearn/utils/_validation.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/imblearn/utils/_validation.py b/imblearn/utils/_validation.py index a842152c8..0ce46f470 100644 --- a/imblearn/utils/_validation.py +++ b/imblearn/utils/_validation.py @@ -75,10 +75,10 @@ def 
_is_neighbors_object(kneighbors_estimator): def check_neighbors_object(nn_name, nn_object, additional_neighbor=0): """Check the objects is consistent to be a NN. - Several methods in imblearn relies on NN. Until version 0.4, these - objects can be passed at initialisation as an integer or a - KNeighborsMixin. After only KNeighborsMixin will be accepted. This - utility allows for type checking and raise if the type is wrong. + Several methods in imblearn relies on NN. These objects can + be passed at initialisation as an integer or as an object + that has KNeighborsMixin-like attributes. This utility will + create or clone said object, ensuring it is KNeighbors-like. Parameters ---------- @@ -101,7 +101,7 @@ def check_neighbors_object(nn_name, nn_object, additional_neighbor=0): elif _is_neighbors_object(nn_object): return clone(nn_object) else: - raise TypeError("nn_object must be NearestNeighbors object or int") + raise ValueError("nn_object must be NearestNeighbors object or int") def _count_class_sample(y): From 178d0f05bb78061b54b15c45addfb89e9875722d Mon Sep 17 00:00:00 2001 From: sft-managed Date: Tue, 5 Oct 2021 14:37:25 +0000 Subject: [PATCH 22/50] Updated tests --- .../over_sampling/_smote/tests/test_smote.py | 42 +++++++++---------- .../tests/test_cluster_centroids.py | 1 - .../tests/test_edited_nearest_neighbours.py | 2 +- .../tests/test_nearmiss.py | 4 +- .../tests/test_neighbourhood_cleaning_rule.py | 2 +- imblearn/utils/tests/test_validation.py | 2 +- 6 files changed, 26 insertions(+), 27 deletions(-) diff --git a/imblearn/over_sampling/_smote/tests/test_smote.py b/imblearn/over_sampling/_smote/tests/test_smote.py index 97cc0cc85..abce2c5ca 100644 --- a/imblearn/over_sampling/_smote/tests/test_smote.py +++ b/imblearn/over_sampling/_smote/tests/test_smote.py @@ -168,35 +168,35 @@ def test_smote_m_neighbors(smote): def test_sample_cuml_with_nn(): cuml = pytest.importorskip("cuml") - nn_k = cuml.neighbors.NearestNeighbors(n_neighbors=6) + nn_k = 
cuml.neighbors.NearestNeighbors(n_neighbors=2) smote = SMOTE(random_state=RND_SEED, k_neighbors=nn_k) X_resampled, y_resampled = smote.fit_resample(X, Y) X_gt = np.array( [ - [0.11622591, -0.0317206], - [0.77481731, 0.60935141], - [1.25192108, -0.22367336], - [0.53366841, -0.30312976], - [1.52091956, -0.49283504], + [ 0.11622591, -0.0317206], + [ 0.77481731, 0.60935141], + [ 1.25192108, -0.22367336], + [ 0.53366841, -0.30312976], + [ 1.52091956, -0.49283504], [-0.28162401, -2.10400981], - [0.83680821, 1.72827342], - [0.3084254, 0.33299982], - [0.70472253, -0.73309052], - [0.28893132, -0.38761769], - [1.15514042, 0.0129463], - [0.88407872, 0.35454207], - [1.31301027, -0.92648734], + [ 0.83680821, 1.72827342], + [ 0.3084254 , 0.33299982], + [ 0.70472253, -0.73309052], + [ 0.28893132, -0.38761769], + [ 1.15514042, 0.0129463], + [ 0.88407872, 0.35454207], + [ 1.31301027, -0.92648734], [-1.11515198, -0.93689695], [-0.18410027, -0.45194484], - [0.9281014, 0.53085498], - [-0.14374509, 0.27370049], + [ 0.9281014 , 0.53085498], + [-0.14374509, 0.27370049], [-0.41635887, -0.38299653], - [0.08711622, 0.93259929], - [1.70580611, -0.11219234], - [0.29307743, -0.14670439], - [0.84976473, -0.15570176], - [0.61319159, -0.11571668], - [0.66052536, -0.28246517], + [ 0.08711622, 0.93259929], + [ 1.70580611, -0.11219234], + [ 1.10580062, 0.00601499], + [ 1.60506454, -0.31959815], + [ 1.40109204, -0.74276846], + [ 0.38584956, -0.20702218] ] ) y_gt = np.array( diff --git a/imblearn/under_sampling/_prototype_generation/tests/test_cluster_centroids.py b/imblearn/under_sampling/_prototype_generation/tests/test_cluster_centroids.py index 8148e2fdb..afcab938e 100644 --- a/imblearn/under_sampling/_prototype_generation/tests/test_cluster_centroids.py +++ b/imblearn/under_sampling/_prototype_generation/tests/test_cluster_centroids.py @@ -99,7 +99,6 @@ def test_fit_hard_voting(): @pytest.mark.parametrize( "cluster_centroids_params, err_msg", [ - ({"estimator": "rnd"}, "has to be a KMeans 
clustering"), ({"voting": "unknown"}, "needs to be one of"), ], ) diff --git a/imblearn/under_sampling/_prototype_selection/tests/test_edited_nearest_neighbours.py b/imblearn/under_sampling/_prototype_selection/tests/test_edited_nearest_neighbours.py index 44999ddb5..1169d17b8 100644 --- a/imblearn/under_sampling/_prototype_selection/tests/test_edited_nearest_neighbours.py +++ b/imblearn/under_sampling/_prototype_selection/tests/test_edited_nearest_neighbours.py @@ -126,7 +126,7 @@ def test_enn_fit_resample_with_nn_object(): def test_enn_not_good_object(): nn = "rnd" enn = EditedNearestNeighbours(n_neighbors=nn, kind_sel="mode") - with pytest.raises(ValueError, match="has to be one of"): + with pytest.raises(ValueError, match="NearestNeighbors object or int"): enn.fit_resample(X, Y) diff --git a/imblearn/under_sampling/_prototype_selection/tests/test_nearmiss.py b/imblearn/under_sampling/_prototype_selection/tests/test_nearmiss.py index f2cab39c8..ae49fabe8 100644 --- a/imblearn/under_sampling/_prototype_selection/tests/test_nearmiss.py +++ b/imblearn/under_sampling/_prototype_selection/tests/test_nearmiss.py @@ -39,14 +39,14 @@ "nearmiss_params, err_msg", [ ({"version": 1000}, "must be 1, 2 or 3"), - ({"version": 1, "n_neighbors": "rnd"}, "has to be one of"), + ({"version": 1, "n_neighbors": "rnd"}, "NearestNeighbors object or int"), ( { "version": 3, "n_neighbors": NearestNeighbors(n_neighbors=3), "n_neighbors_ver3": "rnd", }, - "has to be one of", + "NearestNeighbors object or int", ), ], ) diff --git a/imblearn/under_sampling/_prototype_selection/tests/test_neighbourhood_cleaning_rule.py b/imblearn/under_sampling/_prototype_selection/tests/test_neighbourhood_cleaning_rule.py index fc84cb017..2d5db1b81 100644 --- a/imblearn/under_sampling/_prototype_selection/tests/test_neighbourhood_cleaning_rule.py +++ b/imblearn/under_sampling/_prototype_selection/tests/test_neighbourhood_cleaning_rule.py @@ -37,7 +37,7 @@ [ ({"threshold_cleaning": -10}, "value between 0 and 
1"), ({"threshold_cleaning": 10}, "value between 0 and 1"), - ({"n_neighbors": "rnd"}, "has to be one of"), + ({"n_neighbors": "rnd"}, "NearestNeighbors object or int"), ], ) def test_ncr_error(ncr_params, err_msg): diff --git a/imblearn/utils/tests/test_validation.py b/imblearn/utils/tests/test_validation.py index 30c4a932f..e324e2e99 100644 --- a/imblearn/utils/tests/test_validation.py +++ b/imblearn/utils/tests/test_validation.py @@ -37,7 +37,7 @@ def test_check_neighbors_object(): estimator_cloned = check_neighbors_object(name, estimator) assert estimator.n_neighbors == estimator_cloned.n_neighbors n_neighbors = "rnd" - with pytest.raises(ValueError, match="has to be one of"): + with pytest.raises(ValueError, match="NearestNeighbors object or int"): check_neighbors_object(name, n_neighbors) From 8889cfde97521e1c96ee3d74d1f78da505d6132f Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 7 Dec 2021 18:51:37 +0100 Subject: [PATCH 23/50] duck-typing svm --- imblearn/over_sampling/_smote/filter.py | 8 ++++++++ imblearn/over_sampling/_smote/tests/test_svm_smote.py | 10 ++++++++++ 2 files changed, 18 insertions(+) diff --git a/imblearn/over_sampling/_smote/filter.py b/imblearn/over_sampling/_smote/filter.py index f32ee5fb1..80b5cad50 100644 --- a/imblearn/over_sampling/_smote/filter.py +++ b/imblearn/over_sampling/_smote/filter.py @@ -277,6 +277,8 @@ class SVMSMOTE(BaseSMOTE): svm_estimator : estimator object, default=SVC() A parametrized :class:`~sklearn.svm.SVC` classifier can be passed. + A scikit-learn compatible estimator can be passed but it is required + to expose a `support_` fitted attribute. out_step : float, default=0.5 Step size when extrapolating. @@ -400,6 +402,12 @@ def _fit_resample(self, X, y): X_class = _safe_indexing(X, target_class_indices) self.svm_estimator_.fit(X, y) + if not hasattr(self.svm_estimator_, "support_"): + raise RuntimeError( + "`svm_estimator` is required to exposed a `support_` fitted " + "attribute. 
Such estimator belongs to the familly of Support " + "Vector Machine." + ) support_index = self.svm_estimator_.support_[ y[self.svm_estimator_.support_] == class_sample ] diff --git a/imblearn/over_sampling/_smote/tests/test_svm_smote.py b/imblearn/over_sampling/_smote/tests/test_svm_smote.py index 578ceccde..c6ae61d0c 100644 --- a/imblearn/over_sampling/_smote/tests/test_svm_smote.py +++ b/imblearn/over_sampling/_smote/tests/test_svm_smote.py @@ -1,6 +1,7 @@ import pytest import numpy as np +from sklearn.linear_model import LogisticRegression from sklearn.neighbors import NearestNeighbors from sklearn.svm import SVC @@ -54,3 +55,12 @@ def test_svm_smote(data): assert_allclose(X_res_1, X_res_2) assert_array_equal(y_res_1, y_res_2) + + +def test_svm_smote_not_svm(data): + """Check that we raise a proper error if passing an estimator that does not + expose a `support_` fitted attribute.""" + + err_msg = "`svm_estimator` is required to exposed a `support_` fitted attribute." + with pytest.raises(RuntimeError, match=err_msg): + SVMSMOTE(svm_estimator=LogisticRegression()).fit_resample(*data) From 5e875a0b846707c01031acac4fe9b1071f08dcee Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 7 Dec 2021 19:17:48 +0100 Subject: [PATCH 24/50] TST add couple of tests --- doc/developers_utils.rst | 3 +- imblearn/utils/_validation.py | 42 +++++++++++++++++-------- imblearn/utils/tests/test_validation.py | 20 +++++++++++- 3 files changed, 50 insertions(+), 15 deletions(-) diff --git a/doc/developers_utils.rst b/doc/developers_utils.rst index e7b9bc478..5a1dc5559 100644 --- a/doc/developers_utils.rst +++ b/doc/developers_utils.rst @@ -29,7 +29,8 @@ which accepts arrays, matrices, or sparse matrices as arguments, the following should be used when applicable. - :func:`check_neighbors_object`: Check the objects is consistent to be a NN. -- :func:`check_target_type`: Check the target types to be conform to the current sam plers. 
+- :func:`check_target_type`: Check the target types to be conform to the current + samplers. - :func:`check_sampling_strategy`: Checks that sampling target is onsistent with the type and return a dictionary containing each targeted class with its corresponding number of pixel. diff --git a/imblearn/utils/_validation.py b/imblearn/utils/_validation.py index 4c16e6e19..c3fa53357 100644 --- a/imblearn/utils/_validation.py +++ b/imblearn/utils/_validation.py @@ -64,28 +64,40 @@ def _transfrom_one(self, array, props): return ret -def _is_neighbors_object(kneighbors_estimator): - neighbors_attributes = [ - "kneighbors", - "kneighbors_graph" - ] - return all(hasattr(kneighbors_estimator, attr) for attr in neighbors_attributes) +def _is_neighbors_object(estimator): + """Check that the estimator exposes a KNeighborsMixin-like API. + + A KNeighborsMixin-like API exposes the following methods: (i) `kneighbors`, + (ii) `kneighbors_graph`. + + Parameters + ---------- + estimator : object + A scikit-learn compatible estimator. + + Returns + ------- + is_neighbors_object : bool + True if the estimator exposes a KNeighborsMixin-like API. + """ + neighbors_attributes = ["kneighbors", "kneighbors_graph"] + return all(hasattr(estimator, attr) for attr in neighbors_attributes) def check_neighbors_object(nn_name, nn_object, additional_neighbor=0): - """Check the objects is consistent to be a NN. + """Check the objects is consistent to be a k nearest neighbors. - Several methods in imblearn relies on NN. These objects can - be passed at initialisation as an integer or as an object - that has KNeighborsMixin-like attributes. This utility will - create or clone said object, ensuring it is KNeighbors-like. + Several methods in `imblearn` relies on k nearest neighbors. These objects + can be passed at initialisation as an integer or as an object that has + KNeighborsMixin-like attributes. This utility will create or clone said + object, ensuring it is KNeighbors-like. 
Parameters ---------- nn_name : str The name associated to the object to raise an error if needed. - nn_object : int or KNeighborsMixin, + nn_object : int or KNeighborsMixin The object to be checked. additional_neighbor : int, default=0 @@ -101,7 +113,11 @@ def check_neighbors_object(nn_name, nn_object, additional_neighbor=0): elif _is_neighbors_object(nn_object): return clone(nn_object) else: - raise ValueError("nn_object must be NearestNeighbors object or int") + raise ValueError( + f"{nn_name} must be an interger or an object compatible with the " + "KNeighborsMixin API of scikit-learn (i.e. implementing `kneighbors` " + "method)." + ) def _count_class_sample(y): diff --git a/imblearn/utils/tests/test_validation.py b/imblearn/utils/tests/test_validation.py index dd92c728c..284abb3a6 100644 --- a/imblearn/utils/tests/test_validation.py +++ b/imblearn/utils/tests/test_validation.py @@ -9,6 +9,7 @@ import pytest import numpy as np +from sklearn.base import BaseEstimator from sklearn.neighbors._base import KNeighborsMixin from sklearn.neighbors import NearestNeighbors from sklearn.utils._testing import assert_array_equal @@ -24,6 +25,16 @@ binary_target = np.array([1] * 25 + [0] * 100) +class KNNLikeEstimator(BaseEstimator): + """A class exposing the same KNeighborsMixin API than KNeighborsClassifier.""" + + def kneighbors(self, X): + return np.ones((len(X), 1)) + + def kneighbors_graph(self, X): + return np.ones((len(X), 1)) + + def test_check_neighbors_object(): name = "n_neighbors" n_neighbors = 1 @@ -36,8 +47,15 @@ def test_check_neighbors_object(): estimator = NearestNeighbors(n_neighbors=n_neighbors) estimator_cloned = check_neighbors_object(name, estimator) assert estimator.n_neighbors == estimator_cloned.n_neighbors + estimator = KNNLikeEstimator() + estimator_cloned = check_neighbors_object(name, estimator) + assert isinstance(estimator_cloned, KNNLikeEstimator) n_neighbors = "rnd" - with pytest.raises(ValueError, match="NearestNeighbors object or int"): + 
err_msg = ( + "n_neighbors must be an interger or an object compatible with the " + "KNeighborsMixin API of scikit-learn" + ) + with pytest.raises(ValueError, match=err_msg): check_neighbors_object(name, n_neighbors) From 9545172d1dde7c236abab05a3abbe244829f3a29 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 7 Dec 2021 22:54:53 +0100 Subject: [PATCH 25/50] better error message with duck-typing --- .../over_sampling/_smote/tests/test_smote.py | 42 +++++++-------- imblearn/over_sampling/tests/test_adasyn.py | 52 ++++++++++--------- .../_cluster_centroids.py | 13 ++++- .../tests/test_cluster_centroids.py | 30 +++++++++++ 4 files changed, 91 insertions(+), 46 deletions(-) diff --git a/imblearn/over_sampling/_smote/tests/test_smote.py b/imblearn/over_sampling/_smote/tests/test_smote.py index abce2c5ca..1cda3e5b8 100644 --- a/imblearn/over_sampling/_smote/tests/test_smote.py +++ b/imblearn/over_sampling/_smote/tests/test_smote.py @@ -173,34 +173,34 @@ def test_sample_cuml_with_nn(): X_resampled, y_resampled = smote.fit_resample(X, Y) X_gt = np.array( [ - [ 0.11622591, -0.0317206], - [ 0.77481731, 0.60935141], - [ 1.25192108, -0.22367336], - [ 0.53366841, -0.30312976], - [ 1.52091956, -0.49283504], + [0.11622591, -0.0317206], + [0.77481731, 0.60935141], + [1.25192108, -0.22367336], + [0.53366841, -0.30312976], + [1.52091956, -0.49283504], [-0.28162401, -2.10400981], - [ 0.83680821, 1.72827342], - [ 0.3084254 , 0.33299982], - [ 0.70472253, -0.73309052], - [ 0.28893132, -0.38761769], - [ 1.15514042, 0.0129463], - [ 0.88407872, 0.35454207], - [ 1.31301027, -0.92648734], + [0.83680821, 1.72827342], + [0.3084254, 0.33299982], + [0.70472253, -0.73309052], + [0.28893132, -0.38761769], + [1.15514042, 0.0129463], + [0.88407872, 0.35454207], + [1.31301027, -0.92648734], [-1.11515198, -0.93689695], [-0.18410027, -0.45194484], - [ 0.9281014 , 0.53085498], - [-0.14374509, 0.27370049], + [0.9281014, 0.53085498], + [-0.14374509, 0.27370049], [-0.41635887, -0.38299653], - [ 
0.08711622, 0.93259929], - [ 1.70580611, -0.11219234], - [ 1.10580062, 0.00601499], - [ 1.60506454, -0.31959815], - [ 1.40109204, -0.74276846], - [ 0.38584956, -0.20702218] + [0.08711622, 0.93259929], + [1.70580611, -0.11219234], + [1.10580062, 0.00601499], + [1.60506454, -0.31959815], + [1.40109204, -0.74276846], + [0.38584956, -0.20702218], ] ) y_gt = np.array( [0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0] ) assert_allclose(X_resampled, X_gt, rtol=R_TOL) - assert_array_equal(y_resampled, y_gt) \ No newline at end of file + assert_array_equal(y_resampled, y_gt) diff --git a/imblearn/over_sampling/tests/test_adasyn.py b/imblearn/over_sampling/tests/test_adasyn.py index aec72d955..4d7bd38fd 100644 --- a/imblearn/over_sampling/tests/test_adasyn.py +++ b/imblearn/over_sampling/tests/test_adasyn.py @@ -131,7 +131,11 @@ def test_ada_fit_resample_nn_obj(): {"sampling_strategy": {0: 9, 1: 12}}, "No samples will be generated.", ), - ({"n_neighbors": "rnd"}, "NearestNeighbors object or int"), + ( + {"n_neighbors": "rnd"}, + "n_neighbors must be an interger or an object compatible with the " + "KNeighborsMixin API of scikit-learn", + ), ], ) def test_adasyn_error(adasyn_params, err_msg): @@ -145,33 +149,33 @@ def test_ada_fit_resample_cuml_nn_obj(): nn = cuml.neighbors.NearestNeighbors(n_neighbors=2) ada = ADASYN(random_state=RND_SEED, n_neighbors=nn) X_resampled, y_resampled = ada.fit_resample(X, Y) - X_gt = np.array( + X_gt = np.array( [ - [ 0.11622591, -0.0317206], - [ 0.77481731, 0.60935141], - [ 1.25192108, -0.22367336], - [ 0.53366841, -0.30312976], - [ 1.52091956, -0.49283504], + [0.11622591, -0.0317206], + [0.77481731, 0.60935141], + [1.25192108, -0.22367336], + [0.53366841, -0.30312976], + [1.52091956, -0.49283504], [-0.28162401, -2.10400981], - [ 0.83680821, 1.72827342], - [ 0.3084254 , 0.33299982], - [ 0.70472253, -0.73309052], - [ 0.28893132, -0.38761769], - [ 1.15514042, 0.0129463], - [ 0.88407872, 0.35454207], - [ 1.31301027, 
-0.92648734], + [0.83680821, 1.72827342], + [0.3084254, 0.33299982], + [0.70472253, -0.73309052], + [0.28893132, -0.38761769], + [1.15514042, 0.0129463], + [0.88407872, 0.35454207], + [1.31301027, -0.92648734], [-1.11515198, -0.93689695], [-0.18410027, -0.45194484], - [ 0.9281014 , 0.53085498], - [-0.14374509, 0.27370049], + [0.9281014, 0.53085498], + [-0.14374509, 0.27370049], [-0.41635887, -0.38299653], - [ 0.08711622, 0.93259929], - [ 1.70580611, -0.11219234], - [ 0.34532399, -0.18067361], - [ 1.44430593, -0.41617493], - [ 0.28204936, -0.13953426], - [ 1.08450984, 0.03948221], - [-0.19072677, -0.2341768] + [0.08711622, 0.93259929], + [1.70580611, -0.11219234], + [0.34532399, -0.18067361], + [1.44430593, -0.41617493], + [0.28204936, -0.13953426], + [1.08450984, 0.03948221], + [-0.19072677, -0.2341768], ] ) @@ -180,4 +184,4 @@ def test_ada_fit_resample_cuml_nn_obj(): ) assert_allclose(X_resampled, X_gt, rtol=R_TOL) - assert_array_equal(y_resampled, y_gt) \ No newline at end of file + assert_array_equal(y_resampled, y_gt) diff --git a/imblearn/under_sampling/_prototype_generation/_cluster_centroids.py b/imblearn/under_sampling/_prototype_generation/_cluster_centroids.py index ee3f77cca..b7f56274e 100644 --- a/imblearn/under_sampling/_prototype_generation/_cluster_centroids.py +++ b/imblearn/under_sampling/_prototype_generation/_cluster_centroids.py @@ -49,7 +49,8 @@ class ClusterCentroids(BaseUnderSampler): {random_state} estimator : estimator object, default=None - Pass a :class:`~sklearn.cluster.KMeans` estimator. By default, it will + A scikit-learn compatible clustering method that exposes a `n_clusters` + parameter and a `cluster_centers_` fitted attribute. By default, it will be a default :class:`~sklearn.cluster.KMeans` estimator. 
voting : {{"hard", "soft", "auto"}}, default='auto' @@ -143,6 +144,11 @@ def _validate_estimator(self): self.estimator_ = KMeans(random_state=self.random_state) else: self.estimator_ = clone(self.estimator) + if "n_clusters" not in self.estimator_.get_params(): + raise ValueError( + "`estimator` should be a clustering estimator exposing a parameter" + " `n_clusters` and a fitted parameter `cluster_centers_`." + ) def _generate_sample(self, X, y, centroids, target_class): if self.voting_ == "hard": @@ -183,6 +189,11 @@ def _fit_resample(self, X, y): n_samples = self.sampling_strategy_[target_class] self.estimator_.set_params(**{"n_clusters": n_samples}) self.estimator_.fit(_safe_indexing(X, target_class_indices)) + if not hasattr(self.estimator_, "cluster_centers_"): + raise RuntimeError( + "`estimator` should be a clustering estimator exposing a " + "fitted parameter `cluster_centers_`." + ) X_new, y_new = self._generate_sample( _safe_indexing(X, target_class_indices), _safe_indexing(y, target_class_indices), diff --git a/imblearn/under_sampling/_prototype_generation/tests/test_cluster_centroids.py b/imblearn/under_sampling/_prototype_generation/tests/test_cluster_centroids.py index f71c90d4d..bb8c9bd0c 100644 --- a/imblearn/under_sampling/_prototype_generation/tests/test_cluster_centroids.py +++ b/imblearn/under_sampling/_prototype_generation/tests/test_cluster_centroids.py @@ -5,6 +5,8 @@ import numpy as np from scipy import sparse +from sklearn.base import BaseEstimator +from sklearn.linear_model import LogisticRegression from sklearn.cluster import KMeans from sklearn.datasets import make_classification @@ -151,3 +153,31 @@ def test_cluster_centroids_hard_target_class(): for minority_sample in X_minority_class ] assert sum(sample_from_minority_in_majority) == 0 + + +class FakeCluster(BaseEstimator): + """Class that mimics a cluster that does not expose `cluster_centers_`.""" + + def __init__(self, n_clusters=1): + self.n_clusters = n_clusters + + def fit(self, 
X, y=None): + return self + + +def test_cluster_centroids_error_estimator(): + """Check that an error is raised when estimator does not have a cluster API.""" + + err_msg = ( + "`estimator` should be a clustering estimator exposing a parameter " + "`n_clusters` and a fitted parameter `cluster_centers_`." + ) + with pytest.raises(ValueError, match=err_msg): + ClusterCentroids(estimator=LogisticRegression()).fit_resample(X, Y) + + err_msg = ( + "`estimator` should be a clustering estimator exposing a fitted parameter " + "`cluster_centers_`." + ) + with pytest.raises(RuntimeError, match=err_msg): + ClusterCentroids(estimator=FakeCluster()).fit_resample(X, Y) From 29a414b6afac3312048e2d112f92f05afb30df25 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 7 Dec 2021 22:59:36 +0100 Subject: [PATCH 26/50] iter --- .../tests/test_edited_nearest_neighbours.py | 6 +++++- .../_prototype_selection/tests/test_nearmiss.py | 7 +++++-- .../tests/test_neighbourhood_cleaning_rule.py | 5 ++++- 3 files changed, 14 insertions(+), 4 deletions(-) diff --git a/imblearn/under_sampling/_prototype_selection/tests/test_edited_nearest_neighbours.py b/imblearn/under_sampling/_prototype_selection/tests/test_edited_nearest_neighbours.py index 4a66b9e4e..50680f632 100644 --- a/imblearn/under_sampling/_prototype_selection/tests/test_edited_nearest_neighbours.py +++ b/imblearn/under_sampling/_prototype_selection/tests/test_edited_nearest_neighbours.py @@ -126,7 +126,11 @@ def test_enn_fit_resample_with_nn_object(): def test_enn_not_good_object(): nn = "rnd" enn = EditedNearestNeighbours(n_neighbors=nn, kind_sel="mode") - with pytest.raises(ValueError, match="NearestNeighbors object or int"): + err_msg = ( + "n_neighbors must be an interger or an object compatible with the " + "KNeighborsMixin API of scikit-learn" + ) + with pytest.raises(ValueError, match=err_msg): enn.fit_resample(X, Y) diff --git a/imblearn/under_sampling/_prototype_selection/tests/test_nearmiss.py 
b/imblearn/under_sampling/_prototype_selection/tests/test_nearmiss.py index 7317121af..6a3fc04fb 100644 --- a/imblearn/under_sampling/_prototype_selection/tests/test_nearmiss.py +++ b/imblearn/under_sampling/_prototype_selection/tests/test_nearmiss.py @@ -39,14 +39,17 @@ "nearmiss_params, err_msg", [ ({"version": 1000}, "must be 1, 2 or 3"), - ({"version": 1, "n_neighbors": "rnd"}, "NearestNeighbors object or int"), + ( + {"version": 1, "n_neighbors": "rnd"}, + "n_neighbors must be an interger or an object compatible", + ), ( { "version": 3, "n_neighbors": NearestNeighbors(n_neighbors=3), "n_neighbors_ver3": "rnd", }, - "NearestNeighbors object or int", + "n_neighbors_ver3 must be an interger or an object compatible", ), ], ) diff --git a/imblearn/under_sampling/_prototype_selection/tests/test_neighbourhood_cleaning_rule.py b/imblearn/under_sampling/_prototype_selection/tests/test_neighbourhood_cleaning_rule.py index 2d5db1b81..78119dde7 100644 --- a/imblearn/under_sampling/_prototype_selection/tests/test_neighbourhood_cleaning_rule.py +++ b/imblearn/under_sampling/_prototype_selection/tests/test_neighbourhood_cleaning_rule.py @@ -37,7 +37,10 @@ [ ({"threshold_cleaning": -10}, "value between 0 and 1"), ({"threshold_cleaning": 10}, "value between 0 and 1"), - ({"n_neighbors": "rnd"}, "NearestNeighbors object or int"), + ( + {"n_neighbors": "rnd"}, + "n_neighbors must be an interger or an object compatible", + ), ], ) def test_ncr_error(ncr_params, err_msg): From 12991ba83fd89b87d3f2f9209bf4c6113cbb07f4 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 7 Dec 2021 23:26:41 +0100 Subject: [PATCH 27/50] CI let's try a run on CircleCI with cuML --- azure-pipelines.yml | 9 +++++++++ build_tools/azure/install.sh | 10 ++++++++++ imblearn/_min_dependencies.py | 6 ++++++ 3 files changed, 25 insertions(+) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index d614ff1b3..d5c5e1fa6 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -207,6 +207,15 @@ 
jobs: TEST_DOCS: 'true' TEST_DOCSTRINGS: 'false' # it is going to fail because of scikit-learn inheritance CHECK_WARNINGS: 'true' + pylatest_conda_cuml: + DISTRIB: 'conda-cuml' + CONDA_CHANNEL: 'conda-forge' + PYTHON_VERSION: '3.8' + BLAZINGSQL_VERSION: 'min' + CUML_VERSION: 'min' + CUDATOOLKIT_VERSION: 'min' + TEST_DOCS: 'true' + TEST_DOCSTRINGS: 'true' - template: build_tools/azure/posix-docker.yml parameters: diff --git a/build_tools/azure/install.sh b/build_tools/azure/install.sh index 4fa68fbaa..da05af3be 100755 --- a/build_tools/azure/install.sh +++ b/build_tools/azure/install.sh @@ -104,6 +104,16 @@ elif [[ "$DISTRIB" == "conda-minimum-keras" ]]; then TO_INSTALL="$TO_INSTALL $(get_dep keras $KERAS_VERSION)" make_conda $TO_INSTALL +elif [[ "$DISTRIB" == "conda-cuml" ]]; then + TO_INSTALL="-c rapidsai -c nvidia python=$PYTHON_VERSION" + TO_INSTALL="$TO_INSTALL $(get_dep numpy $NUMPY_VERSION)" + TO_INSTALL="$TO_INSTALL $(get_dep scipy $SCIPY_VERSION)" + TO_INSTALL="$TO_INSTALL $(get_dep scikit-learn $SKLEARN_VERSION)" + TO_INSTALL="$TO_INSTALL $(get_dep blazingsql $BLAZINGSQL_VERSION)" + TO_INSTALL="$TO_INSTALL $(get_dep cuml $CUML_VERSION)" + TO_INSTALL="$TO_INSTALL $(get_dep cudatoolkit $CUDATOOLKIT_VERSION)" + make_conda $TO_INSTALL + elif [[ "$DISTRIB" == "conda-pip-scipy-dev" ]]; then make_conda "python=$PYTHON_VERSION" python -m pip install -U pip diff --git a/imblearn/_min_dependencies.py b/imblearn/_min_dependencies.py index 11849ab85..2437cc5e8 100644 --- a/imblearn/_min_dependencies.py +++ b/imblearn/_min_dependencies.py @@ -18,6 +18,9 @@ JOBLIB_MIN_VERSION = "0.11" THREADPOOLCTL_MIN_VERSION = "2.0.0" PYTEST_MIN_VERSION = "5.0.1" +CUML_MIN_VERSION = "21.10" +BLAZINGSQL_MIN_VERSION = "21.10" +CUDATOOLKIT_MIN_VERSION = "11.0" # 'build' and 'install' is included to have structured metadata for CI. 
@@ -32,6 +35,9 @@ "pandas": (PANDAS_MIN_VERSION, "optional, docs, examples, tests"), "tensorflow": (TENSORFLOW_MIN_VERSION, "optional, docs, examples, tests"), "keras": (KERAS_MIN_VERSION, "optional, docs, examples, tests"), + "cuml": (CUML_MIN_VERSION, "optional, docs, examples, tests"), + "blazingsql": (BLAZINGSQL_MIN_VERSION, "optional, docs, examples, tests"), + "cudatoolkit": (CUDATOOLKIT_MIN_VERSION, "optional, docs, examples, tests"), "matplotlib": ("2.2.3", "docs, examples"), "seaborn": ("0.9.0", "docs, examples"), "memory_profiler": ("0.57.0", "docs"), From e24ee066b19666552ba1680d666c0ba7c958b69d Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 7 Dec 2021 23:38:02 +0100 Subject: [PATCH 28/50] iter --- build_tools/azure/install.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build_tools/azure/install.sh b/build_tools/azure/install.sh index da05af3be..04f9474bc 100755 --- a/build_tools/azure/install.sh +++ b/build_tools/azure/install.sh @@ -105,7 +105,7 @@ elif [[ "$DISTRIB" == "conda-minimum-keras" ]]; then make_conda $TO_INSTALL elif [[ "$DISTRIB" == "conda-cuml" ]]; then - TO_INSTALL="-c rapidsai -c nvidia python=$PYTHON_VERSION" + TO_INSTALL="-c $CONDA_CHANNEL -c rapidsai -c nvidia python=$PYTHON_VERSION" TO_INSTALL="$TO_INSTALL $(get_dep numpy $NUMPY_VERSION)" TO_INSTALL="$TO_INSTALL $(get_dep scipy $SCIPY_VERSION)" TO_INSTALL="$TO_INSTALL $(get_dep scikit-learn $SKLEARN_VERSION)" From 525002fc27f09afc6fdce47bdd17932cbb107994 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 7 Dec 2021 23:48:07 +0100 Subject: [PATCH 29/50] iter --- build_tools/azure/install.sh | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/build_tools/azure/install.sh b/build_tools/azure/install.sh index 04f9474bc..56f3492df 100755 --- a/build_tools/azure/install.sh +++ b/build_tools/azure/install.sh @@ -106,14 +106,13 @@ elif [[ "$DISTRIB" == "conda-minimum-keras" ]]; then elif [[ "$DISTRIB" == "conda-cuml" ]]; then 
TO_INSTALL="-c $CONDA_CHANNEL -c rapidsai -c nvidia python=$PYTHON_VERSION" - TO_INSTALL="$TO_INSTALL $(get_dep numpy $NUMPY_VERSION)" - TO_INSTALL="$TO_INSTALL $(get_dep scipy $SCIPY_VERSION)" - TO_INSTALL="$TO_INSTALL $(get_dep scikit-learn $SKLEARN_VERSION)" TO_INSTALL="$TO_INSTALL $(get_dep blazingsql $BLAZINGSQL_VERSION)" TO_INSTALL="$TO_INSTALL $(get_dep cuml $CUML_VERSION)" TO_INSTALL="$TO_INSTALL $(get_dep cudatoolkit $CUDATOOLKIT_VERSION)" make_conda $TO_INSTALL + python -m pip install numpy scipy scikit-learn + elif [[ "$DISTRIB" == "conda-pip-scipy-dev" ]]; then make_conda "python=$PYTHON_VERSION" python -m pip install -U pip From 189f0e932a34569dc07c60b51b737ec7aa12acff Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 7 Dec 2021 23:58:09 +0100 Subject: [PATCH 30/50] iter --- azure-pipelines.yml | 2 +- build_tools/azure/install.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index d5c5e1fa6..e76edc681 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -209,7 +209,7 @@ jobs: CHECK_WARNINGS: 'true' pylatest_conda_cuml: DISTRIB: 'conda-cuml' - CONDA_CHANNEL: 'conda-forge' + CONDA_CHANNEL: 'defaults' PYTHON_VERSION: '3.8' BLAZINGSQL_VERSION: 'min' CUML_VERSION: 'min' diff --git a/build_tools/azure/install.sh b/build_tools/azure/install.sh index 56f3492df..be4d95368 100755 --- a/build_tools/azure/install.sh +++ b/build_tools/azure/install.sh @@ -105,7 +105,7 @@ elif [[ "$DISTRIB" == "conda-minimum-keras" ]]; then make_conda $TO_INSTALL elif [[ "$DISTRIB" == "conda-cuml" ]]; then - TO_INSTALL="-c $CONDA_CHANNEL -c rapidsai -c nvidia python=$PYTHON_VERSION" + TO_INSTALL="-c rapidsai -c nvidia -c conda-forge python=$PYTHON_VERSION" TO_INSTALL="$TO_INSTALL $(get_dep blazingsql $BLAZINGSQL_VERSION)" TO_INSTALL="$TO_INSTALL $(get_dep cuml $CUML_VERSION)" TO_INSTALL="$TO_INSTALL $(get_dep cudatoolkit $CUDATOOLKIT_VERSION)" From 2cbe273010f0740c294106250b693ead88299f39 Mon Sep 17 
00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 8 Dec 2021 00:05:37 +0100 Subject: [PATCH 31/50] iter --- build_tools/azure/install.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build_tools/azure/install.sh b/build_tools/azure/install.sh index be4d95368..482f294b2 100755 --- a/build_tools/azure/install.sh +++ b/build_tools/azure/install.sh @@ -108,7 +108,7 @@ elif [[ "$DISTRIB" == "conda-cuml" ]]; then TO_INSTALL="-c rapidsai -c nvidia -c conda-forge python=$PYTHON_VERSION" TO_INSTALL="$TO_INSTALL $(get_dep blazingsql $BLAZINGSQL_VERSION)" TO_INSTALL="$TO_INSTALL $(get_dep cuml $CUML_VERSION)" - TO_INSTALL="$TO_INSTALL $(get_dep cudatoolkit $CUDATOOLKIT_VERSION)" + TO_INSTALL="$TO_INSTALL cudatoolkit" make_conda $TO_INSTALL python -m pip install numpy scipy scikit-learn From cc7fae9450aabd86babb491f87e35f8e226d6957 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 8 Dec 2021 00:16:37 +0100 Subject: [PATCH 32/50] iter --- build_tools/azure/install.sh | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/build_tools/azure/install.sh b/build_tools/azure/install.sh index 482f294b2..1d01fc79b 100755 --- a/build_tools/azure/install.sh +++ b/build_tools/azure/install.sh @@ -6,6 +6,7 @@ set -x UNAMESTR=`uname` make_conda() { + conda update -yq conda TO_INSTALL="$@" if [[ "$DISTRIB" == *"mamba"* ]]; then mamba create -n $VIRTUALENV --yes $TO_INSTALL @@ -106,13 +107,14 @@ elif [[ "$DISTRIB" == "conda-minimum-keras" ]]; then elif [[ "$DISTRIB" == "conda-cuml" ]]; then TO_INSTALL="-c rapidsai -c nvidia -c conda-forge python=$PYTHON_VERSION" + TO_INSTALL="$TO_INSTALL $(get_dep numpy $NUMPY_VERSION)" + TO_INSTALL="$TO_INSTALL $(get_dep scipy $SCIPY_VERSION)" + TO_INSTALL="$TO_INSTALL $(get_dep scikit-learn $SKLEARN_VERSION)" TO_INSTALL="$TO_INSTALL $(get_dep blazingsql $BLAZINGSQL_VERSION)" TO_INSTALL="$TO_INSTALL $(get_dep cuml $CUML_VERSION)" - TO_INSTALL="$TO_INSTALL cudatoolkit" + TO_INSTALL="$TO_INSTALL $(get_dep 
cudatoolkit $CUDATOOLKIT_VERSION)" make_conda $TO_INSTALL - python -m pip install numpy scipy scikit-learn - elif [[ "$DISTRIB" == "conda-pip-scipy-dev" ]]; then make_conda "python=$PYTHON_VERSION" python -m pip install -U pip From 29e461970167d30b4bccad359a4c323f21152205 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 8 Dec 2021 00:35:34 +0100 Subject: [PATCH 33/50] ITER --- azure-pipelines.yml | 2 +- build_tools/azure/install.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index e76edc681..b48c1e2d0 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -208,7 +208,7 @@ jobs: TEST_DOCSTRINGS: 'false' # it is going to fail because of scikit-learn inheritance CHECK_WARNINGS: 'true' pylatest_conda_cuml: - DISTRIB: 'conda-cuml' + DISTRIB: 'mamba-cuml' CONDA_CHANNEL: 'defaults' PYTHON_VERSION: '3.8' BLAZINGSQL_VERSION: 'min' diff --git a/build_tools/azure/install.sh b/build_tools/azure/install.sh index 1d01fc79b..5db9fc64b 100755 --- a/build_tools/azure/install.sh +++ b/build_tools/azure/install.sh @@ -105,7 +105,7 @@ elif [[ "$DISTRIB" == "conda-minimum-keras" ]]; then TO_INSTALL="$TO_INSTALL $(get_dep keras $KERAS_VERSION)" make_conda $TO_INSTALL -elif [[ "$DISTRIB" == "conda-cuml" ]]; then +elif [[ "$DISTRIB" == "mamba-cuml" ]]; then TO_INSTALL="-c rapidsai -c nvidia -c conda-forge python=$PYTHON_VERSION" TO_INSTALL="$TO_INSTALL $(get_dep numpy $NUMPY_VERSION)" TO_INSTALL="$TO_INSTALL $(get_dep scipy $SCIPY_VERSION)" From a098e84d9015ccd1f510cf43fe384452057f798f Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 8 Dec 2021 00:39:16 +0100 Subject: [PATCH 34/50] iter --- build_tools/azure/install.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/build_tools/azure/install.sh b/build_tools/azure/install.sh index 5db9fc64b..60e3073cc 100755 --- a/build_tools/azure/install.sh +++ b/build_tools/azure/install.sh @@ -9,6 +9,7 @@ make_conda() { conda update -yq conda 
TO_INSTALL="$@" if [[ "$DISTRIB" == *"mamba"* ]]; then + conda install mamba mamba create -n $VIRTUALENV --yes $TO_INSTALL else conda config --show From 0aa328ecedfc5741d70b51144e162c386611e273 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 8 Dec 2021 00:46:09 +0100 Subject: [PATCH 35/50] iter --- build_tools/azure/install.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build_tools/azure/install.sh b/build_tools/azure/install.sh index 60e3073cc..415bd9736 100755 --- a/build_tools/azure/install.sh +++ b/build_tools/azure/install.sh @@ -9,7 +9,7 @@ make_conda() { conda update -yq conda TO_INSTALL="$@" if [[ "$DISTRIB" == *"mamba"* ]]; then - conda install mamba + conda install mamba -c conda-forge mamba create -n $VIRTUALENV --yes $TO_INSTALL else conda config --show From 8cce474824bc632fa7aed43304003f15b3c2ac47 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 8 Dec 2021 00:54:24 +0100 Subject: [PATCH 36/50] dbg --- build_tools/azure/install.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build_tools/azure/install.sh b/build_tools/azure/install.sh index 415bd9736..fca03ee7c 100755 --- a/build_tools/azure/install.sh +++ b/build_tools/azure/install.sh @@ -9,7 +9,7 @@ make_conda() { conda update -yq conda TO_INSTALL="$@" if [[ "$DISTRIB" == *"mamba"* ]]; then - conda install mamba -c conda-forge + conda install -yq mamba -c conda-forge mamba create -n $VIRTUALENV --yes $TO_INSTALL else conda config --show From 8d4ff31b760fa4309919c85b6c26756d2c3cd8f4 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 8 Dec 2021 01:04:49 +0100 Subject: [PATCH 37/50] dbg --- azure-pipelines.yml | 2 +- build_tools/azure/install.sh | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index b48c1e2d0..e76edc681 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -208,7 +208,7 @@ jobs: TEST_DOCSTRINGS: 'false' # it is going to fail because of 
scikit-learn inheritance CHECK_WARNINGS: 'true' pylatest_conda_cuml: - DISTRIB: 'mamba-cuml' + DISTRIB: 'conda-cuml' CONDA_CHANNEL: 'defaults' PYTHON_VERSION: '3.8' BLAZINGSQL_VERSION: 'min' diff --git a/build_tools/azure/install.sh b/build_tools/azure/install.sh index fca03ee7c..1d01fc79b 100755 --- a/build_tools/azure/install.sh +++ b/build_tools/azure/install.sh @@ -9,7 +9,6 @@ make_conda() { conda update -yq conda TO_INSTALL="$@" if [[ "$DISTRIB" == *"mamba"* ]]; then - conda install -yq mamba -c conda-forge mamba create -n $VIRTUALENV --yes $TO_INSTALL else conda config --show @@ -106,7 +105,7 @@ elif [[ "$DISTRIB" == "conda-minimum-keras" ]]; then TO_INSTALL="$TO_INSTALL $(get_dep keras $KERAS_VERSION)" make_conda $TO_INSTALL -elif [[ "$DISTRIB" == "mamba-cuml" ]]; then +elif [[ "$DISTRIB" == "conda-cuml" ]]; then TO_INSTALL="-c rapidsai -c nvidia -c conda-forge python=$PYTHON_VERSION" TO_INSTALL="$TO_INSTALL $(get_dep numpy $NUMPY_VERSION)" TO_INSTALL="$TO_INSTALL $(get_dep scipy $SCIPY_VERSION)" From 0ceacfbb42ad25f631c181f0bea9b7ce29f38dc0 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 8 Dec 2021 10:41:41 +0100 Subject: [PATCH 38/50] MNT move to circleci --- .circleci/config.yml | 28 +++++++++++++++++++++ build_tools/circle/build_test_cuml.sh | 35 +++++++++++++++++++++++++++ 2 files changed, 63 insertions(+) create mode 100644 build_tools/circle/build_test_cuml.sh diff --git a/.circleci/config.yml b/.circleci/config.yml index 80d55554b..6e3c7df63 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -51,8 +51,36 @@ jobs: branches: ignore: gh-pages + cuml: + machine: + image: ubuntu-2004:202101-01 + resource_class: arm.medium + environment: + # Use the latest supported version of python + - PYTHON_VERSION: '3.8' + - NUMPY_VERSION: 'latest' + - SCIPY_VERSION: 'latest' + - SKLEARN_VERSION: 'latest' + - BLAZINGSQL_VERSION: 'min' + - CUML_VERSION: 'min' + - CUDATOOLKIT_VERSION: 'min' + steps: + - checkout + - run: 
./build_tools/circle/checkout_merge_commit.sh + - restore_cache: + key: cuml-{{ .Branch }} + - run: ./build_tools/circle/build_test_cuml.sh + - save_cache: + key: cuml-{{ .Branch }} + paths: + - ~/.cache/pip + - ~/scikit_learn_data + workflows: version: 2 build-doc-and-deploy: jobs: - doc + cuml: + jobs: + - cuml diff --git a/build_tools/circle/build_test_cuml.sh b/build_tools/circle/build_test_cuml.sh new file mode 100644 index 000000000..50120067b --- /dev/null +++ b/build_tools/circle/build_test_cuml.sh @@ -0,0 +1,35 @@ + +#!/usr/bin/env bash +set -x +set -e + +# Install dependencies with miniconda +wget https://github.com/conda-forge/miniforge/releases/latest/download/Mambaforge-Linux-x86_64.sh \ + -O miniconda.sh +chmod +x miniconda.sh && ./miniconda.sh -b -p $MINICONDA_PATH +export PATH="$MINICONDA_PATH/bin:$PATH" +conda update --yes --quiet conda + +# imports get_dep +source build_tools/shared.sh + +# packaging won't be needed once setuptools starts shipping packaging>=17.0 +mamba create -n $CONDA_ENV_NAME --yes --quiet \ + python="${PYTHON_VERSION:-*}" \ + "$(get_dep numpy $NUMPY_VERSION)" \ + "$(get_dep scipy $SCIPY_VERSION)" \ + "$(get_dep scikit-learn $SKLEARN_VERSION)" \ + "$(get_dep blazingsql $BLAZINGSQL_VERSION)" \ + "$(get_dep cuml $CUML_VERSION)" \ + "$(get_dep cudatoolkit $CUDATOOLKIT_VERSION)" \ + "$(get_dep pytest $CUDATOOLKIT_VERSION)" \ + pytest coverage pytest-cov pytest-xdist + +source activate $CONDA_ENV_NAME + +# Build and install imbalanced-learn in dev mode +ls -l +pip install -e . 
--no-build-isolation + +# Test the install +pytest -v --cov=imblearn imblearn -n 2 From ee6b7b07b58a45be38d5e7eb2976bceb28916eb6 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Wed, 8 Dec 2021 10:51:36 +0100 Subject: [PATCH 39/50] iter --- .../workflows/circleci-artifacts-redirector.yml | 14 -------------- build_tools/circle/build_test_cuml.sh | 1 - 2 files changed, 15 deletions(-) delete mode 100644 .github/workflows/circleci-artifacts-redirector.yml diff --git a/.github/workflows/circleci-artifacts-redirector.yml b/.github/workflows/circleci-artifacts-redirector.yml deleted file mode 100644 index 71f0cd932..000000000 --- a/.github/workflows/circleci-artifacts-redirector.yml +++ /dev/null @@ -1,14 +0,0 @@ -name: circleci-artifacts-redirector - -on: [status] -jobs: - circleci_artifacts_redirector_job: - runs-on: ubuntu-latest - name: Run CircleCI artifacts redirector - steps: - - name: GitHub Action step - uses: larsoner/circleci-artifacts-redirector-action@master - with: - repo-token: ${{ secrets.GITHUB_TOKEN }} - artifact-path: doc/index.html - circleci-jobs: documentation diff --git a/build_tools/circle/build_test_cuml.sh b/build_tools/circle/build_test_cuml.sh index 50120067b..55bd20152 100644 --- a/build_tools/circle/build_test_cuml.sh +++ b/build_tools/circle/build_test_cuml.sh @@ -1,4 +1,3 @@ - #!/usr/bin/env bash set -x set -e From d089b7b1a03d887f33473c79b62531ac64a5a2d2 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sat, 15 Jan 2022 12:21:06 +0100 Subject: [PATCH 40/50] iter --- .circleci/config.yml | 28 ---------------------- azure-pipelines.yml | 9 ------- build_tools/azure/install.sh | 10 -------- build_tools/circle/build_test_cuml.sh | 34 --------------------------- imblearn/_min_dependencies.py | 6 ----- 5 files changed, 87 deletions(-) delete mode 100644 build_tools/circle/build_test_cuml.sh diff --git a/.circleci/config.yml b/.circleci/config.yml index 6e3c7df63..80d55554b 100644 --- a/.circleci/config.yml +++ 
b/.circleci/config.yml @@ -51,36 +51,8 @@ jobs: branches: ignore: gh-pages - cuml: - machine: - image: ubuntu-2004:202101-01 - resource_class: arm.medium - environment: - # Use the latest supported version of python - - PYTHON_VERSION: '3.8' - - NUMPY_VERSION: 'latest' - - SCIPY_VERSION: 'latest' - - SKLEARN_VERSION: 'latest' - - BLAZINGSQL_VERSION: 'min' - - CUML_VERSION: 'min' - - CUDATOOLKIT_VERSION: 'min' - steps: - - checkout - - run: ./build_tools/circle/checkout_merge_commit.sh - - restore_cache: - key: cuml-{{ .Branch }} - - run: ./build_tools/circle/build_test_cuml.sh - - save_cache: - key: cuml-{{ .Branch }} - paths: - - ~/.cache/pip - - ~/scikit_learn_data - workflows: version: 2 build-doc-and-deploy: jobs: - doc - cuml: - jobs: - - cuml diff --git a/azure-pipelines.yml b/azure-pipelines.yml index e76edc681..d614ff1b3 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -207,15 +207,6 @@ jobs: TEST_DOCS: 'true' TEST_DOCSTRINGS: 'false' # it is going to fail because of scikit-learn inheritance CHECK_WARNINGS: 'true' - pylatest_conda_cuml: - DISTRIB: 'conda-cuml' - CONDA_CHANNEL: 'defaults' - PYTHON_VERSION: '3.8' - BLAZINGSQL_VERSION: 'min' - CUML_VERSION: 'min' - CUDATOOLKIT_VERSION: 'min' - TEST_DOCS: 'true' - TEST_DOCSTRINGS: 'true' - template: build_tools/azure/posix-docker.yml parameters: diff --git a/build_tools/azure/install.sh b/build_tools/azure/install.sh index 1d01fc79b..932c5b150 100755 --- a/build_tools/azure/install.sh +++ b/build_tools/azure/install.sh @@ -105,16 +105,6 @@ elif [[ "$DISTRIB" == "conda-minimum-keras" ]]; then TO_INSTALL="$TO_INSTALL $(get_dep keras $KERAS_VERSION)" make_conda $TO_INSTALL -elif [[ "$DISTRIB" == "conda-cuml" ]]; then - TO_INSTALL="-c rapidsai -c nvidia -c conda-forge python=$PYTHON_VERSION" - TO_INSTALL="$TO_INSTALL $(get_dep numpy $NUMPY_VERSION)" - TO_INSTALL="$TO_INSTALL $(get_dep scipy $SCIPY_VERSION)" - TO_INSTALL="$TO_INSTALL $(get_dep scikit-learn $SKLEARN_VERSION)" - TO_INSTALL="$TO_INSTALL 
$(get_dep blazingsql $BLAZINGSQL_VERSION)" - TO_INSTALL="$TO_INSTALL $(get_dep cuml $CUML_VERSION)" - TO_INSTALL="$TO_INSTALL $(get_dep cudatoolkit $CUDATOOLKIT_VERSION)" - make_conda $TO_INSTALL - elif [[ "$DISTRIB" == "conda-pip-scipy-dev" ]]; then make_conda "python=$PYTHON_VERSION" python -m pip install -U pip diff --git a/build_tools/circle/build_test_cuml.sh b/build_tools/circle/build_test_cuml.sh deleted file mode 100644 index 55bd20152..000000000 --- a/build_tools/circle/build_test_cuml.sh +++ /dev/null @@ -1,34 +0,0 @@ -#!/usr/bin/env bash -set -x -set -e - -# Install dependencies with miniconda -wget https://github.com/conda-forge/miniforge/releases/latest/download/Mambaforge-Linux-x86_64.sh \ - -O miniconda.sh -chmod +x miniconda.sh && ./miniconda.sh -b -p $MINICONDA_PATH -export PATH="$MINICONDA_PATH/bin:$PATH" -conda update --yes --quiet conda - -# imports get_dep -source build_tools/shared.sh - -# packaging won't be needed once setuptools starts shipping packaging>=17.0 -mamba create -n $CONDA_ENV_NAME --yes --quiet \ - python="${PYTHON_VERSION:-*}" \ - "$(get_dep numpy $NUMPY_VERSION)" \ - "$(get_dep scipy $SCIPY_VERSION)" \ - "$(get_dep scikit-learn $SKLEARN_VERSION)" \ - "$(get_dep blazingsql $BLAZINGSQL_VERSION)" \ - "$(get_dep cuml $CUML_VERSION)" \ - "$(get_dep cudatoolkit $CUDATOOLKIT_VERSION)" \ - "$(get_dep pytest $CUDATOOLKIT_VERSION)" \ - pytest coverage pytest-cov pytest-xdist - -source activate $CONDA_ENV_NAME - -# Build and install imbalanced-learn in dev mode -ls -l -pip install -e . 
--no-build-isolation - -# Test the install -pytest -v --cov=imblearn imblearn -n 2 diff --git a/imblearn/_min_dependencies.py b/imblearn/_min_dependencies.py index 2437cc5e8..11849ab85 100644 --- a/imblearn/_min_dependencies.py +++ b/imblearn/_min_dependencies.py @@ -18,9 +18,6 @@ JOBLIB_MIN_VERSION = "0.11" THREADPOOLCTL_MIN_VERSION = "2.0.0" PYTEST_MIN_VERSION = "5.0.1" -CUML_MIN_VERSION = "21.10" -BLAZINGSQL_MIN_VERSION = "21.10" -CUDATOOLKIT_MIN_VERSION = "11.0" # 'build' and 'install' is included to have structured metadata for CI. @@ -35,9 +32,6 @@ "pandas": (PANDAS_MIN_VERSION, "optional, docs, examples, tests"), "tensorflow": (TENSORFLOW_MIN_VERSION, "optional, docs, examples, tests"), "keras": (KERAS_MIN_VERSION, "optional, docs, examples, tests"), - "cuml": (CUML_MIN_VERSION, "optional, docs, examples, tests"), - "blazingsql": (BLAZINGSQL_MIN_VERSION, "optional, docs, examples, tests"), - "cudatoolkit": (CUDATOOLKIT_MIN_VERSION, "optional, docs, examples, tests"), "matplotlib": ("2.2.3", "docs, examples"), "seaborn": ("0.9.0", "docs, examples"), "memory_profiler": ("0.57.0", "docs"), From d815e2d9a49af84718d088c68c361f549455ef74 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sat, 15 Jan 2022 13:10:38 +0100 Subject: [PATCH 41/50] create custom NN class --- imblearn/utils/testing.py | 28 ++++++++++++++++++++++++++-- imblearn/utils/tests/test_testing.py | 28 +++++++++++++++++++++++++++- 2 files changed, 53 insertions(+), 3 deletions(-) diff --git a/imblearn/utils/testing.py b/imblearn/utils/testing.py index eaad65efb..5b200c064 100644 --- a/imblearn/utils/testing.py +++ b/imblearn/utils/testing.py @@ -9,13 +9,14 @@ import warnings from contextlib import contextmanager from importlib import import_module -from re import compile +from operator import itemgetter from pathlib import Path +from re import compile -from operator import itemgetter from pytest import warns as _warns from sklearn.base import BaseEstimator +from sklearn.neighbors import KDTree 
from sklearn.utils._testing import ignore_warnings @@ -164,3 +165,26 @@ def warns(expected_warning, match=None): assert False, msg else: pass + + +class CustomNearestNeighbors(BaseEstimator): + """Basic implementation of nearest neighbors not relying on scikit-learn.""" + + def __init__(self, n_neighbors=1): + self.n_neighbors = n_neighbors + + def fit(self, X, y=None): + self._kd_tree = KDTree(X) + return self + + def kneighbors(self, X, n_neighbors=None, return_distance=True): + n_neighbors = n_neighbors if n_neighbors is not None else self.n_neighbors + distances, indices = self._kd_tree.query(X, k=n_neighbors) + if return_distance: + return distances, indices + return indices + + def kneighbors_graph(X=None, n_neighbors=None, mode="connectivity"): + """This method is not used within imblearn but it is required for + duck-typing.""" + pass diff --git a/imblearn/utils/tests/test_testing.py b/imblearn/utils/tests/test_testing.py index 7f5e302c3..eb3b09ac7 100644 --- a/imblearn/utils/tests/test_testing.py +++ b/imblearn/utils/tests/test_testing.py @@ -5,8 +5,12 @@ import pytest +import numpy as np + +from sklearn.neighbors._base import KNeighborsMixin + from imblearn.base import SamplerMixin -from imblearn.utils.testing import all_estimators +from imblearn.utils.testing import all_estimators, CustomNearestNeighbors from imblearn.utils.testing import warns @@ -59,3 +63,25 @@ def test_warns_deprecation(): with warns(UserWarning): warnings.warn("value must be 42") assert "The warns function is deprecated" in str(record[0].message) + + +def test_custom_nearest_neighbors(): + """Check that our custom nearest neighbors can be used for our internal + duck-typing.""" + + neareat_neighbors = CustomNearestNeighbors(n_neighbors=3) + + assert not isinstance(neareat_neighbors, KNeighborsMixin) + assert hasattr(neareat_neighbors, "kneighbors") + assert hasattr(neareat_neighbors, "kneighbors_graph") + + rng = np.random.RandomState(42) + X = rng.randn(150, 3) + y = rng.randint(0, 
2, 150) + neareat_neighbors.fit(X, y) + + distances, indices = neareat_neighbors.kneighbors(X) + assert distances.shape == (150, 3) + assert indices.shape == (150, 3) + np.testing.assert_allclose(distances[:, 0], 0.0) + np.testing.assert_allclose(indices[:, 0], np.arange(150)) From 964d082064a8818c23a2d57c845b362723638074 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sun, 16 Jan 2022 12:15:33 +0100 Subject: [PATCH 42/50] add test no dependent on cupy --- imblearn/over_sampling/_smote/filter.py | 2 - .../over_sampling/_smote/tests/test_common.py | 114 ++++++++++++++++++ .../over_sampling/_smote/tests/test_smote.py | 54 --------- imblearn/utils/testing.py | 11 +- 4 files changed, 123 insertions(+), 58 deletions(-) create mode 100644 imblearn/over_sampling/_smote/tests/test_common.py diff --git a/imblearn/over_sampling/_smote/filter.py b/imblearn/over_sampling/_smote/filter.py index 80b5cad50..3d303993b 100644 --- a/imblearn/over_sampling/_smote/filter.py +++ b/imblearn/over_sampling/_smote/filter.py @@ -154,7 +154,6 @@ def _validate_estimator(self): self.nn_m_ = check_neighbors_object( "m_neighbors", self.m_neighbors, additional_neighbor=1 ) - self.nn_m_.set_params(**{"n_jobs": self.n_jobs}) if self.kind not in ("borderline-1", "borderline-2"): raise ValueError( f'The possible "kind" of algorithm are ' @@ -382,7 +381,6 @@ def _validate_estimator(self): self.nn_m_ = check_neighbors_object( "m_neighbors", self.m_neighbors, additional_neighbor=1 ) - self.nn_m_.set_params(**{"n_jobs": self.n_jobs}) if self.svm_estimator is None: self.svm_estimator_ = SVC(gamma="scale", random_state=self.random_state) diff --git a/imblearn/over_sampling/_smote/tests/test_common.py b/imblearn/over_sampling/_smote/tests/test_common.py new file mode 100644 index 000000000..dea8046b0 --- /dev/null +++ b/imblearn/over_sampling/_smote/tests/test_common.py @@ -0,0 +1,114 @@ +from collections import Counter + +import pytest +import numpy as np + +from imblearn.over_sampling import ( + 
BorderlineSMOTE, + KMeansSMOTE, + SMOTE, + SMOTEN, + SMOTENC, + SVMSMOTE, +) +from imblearn.utils.testing import CustomNearestNeighbors + + +@pytest.fixture +def numerical_data(): + rng = np.random.RandomState(0) + X = rng.randn(100, 2) + y = np.repeat([0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0], 5) + + return X, y + + +@pytest.fixture +def categorical_data(): + rng = np.random.RandomState(0) + + feature_1 = ["A"] * 10 + ["B"] * 20 + ["C"] * 30 + feature_2 = ["A"] * 40 + ["B"] * 20 + feature_3 = ["A"] * 20 + ["B"] * 20 + ["C"] * 10 + ["D"] * 10 + X = np.array([feature_1, feature_2, feature_3], dtype=object).T + rng.shuffle(X) + y = np.array([0] * 20 + [1] * 40, dtype=np.int32) + y_labels = np.array(["not apple", "apple"], dtype=object) + y = y_labels[y] + return X, y + + +@pytest.fixture +def heterogeneous_data(): + rng = np.random.RandomState(42) + X = np.empty((30, 4), dtype=object) + X[:, :2] = rng.randn(30, 2) + X[:, 2] = rng.choice(["a", "b", "c"], size=30).astype(object) + X[:, 3] = rng.randint(3, size=30) + y = np.array([0] * 10 + [1] * 20) + return X, y, [2, 3] + + +@pytest.mark.parametrize( + "smote", [BorderlineSMOTE(), SVMSMOTE()], ids=["borderline", "svm"] +) +def test_smote_m_neighbors(numerical_data, smote): + # check that m_neighbors is properly set. 
Regression test for: + # https://github.com/scikit-learn-contrib/imbalanced-learn/issues/568 + X, y = numerical_data + _ = smote.fit_resample(X, y) + assert smote.nn_k_.n_neighbors == 6 + assert smote.nn_m_.n_neighbors == 11 + + +@pytest.mark.parametrize( + "smote", + [ + BorderlineSMOTE(random_state=0), + KMeansSMOTE(random_state=1), + SMOTE(random_state=0), + SVMSMOTE(random_state=0), + ], + ids=["borderline", "kmeans", "smote", "svm"], +) +def test_numerical_smote_k_custom_nn(numerical_data, smote): + X, y = numerical_data + smote.set_params(k_neighbors=CustomNearestNeighbors(n_neighbors=5)) + X_res, y_res = smote.fit_resample(X, y) + + assert X_res.shape == (120, 2) + assert Counter(y_res) == {0: 60, 1: 60} + + +def test_categorical_smote_k_custom_nn(categorical_data): + X, y = categorical_data + smote = SMOTEN(k_neighbors=CustomNearestNeighbors(n_neighbors=5)) + X_res, y_res = smote.fit_resample(X, y) + + assert X_res.shape == (80, 3) + assert Counter(y_res) == {"apple": 40, "not apple": 40} + + +def test_heterogeneous_smote_k_custom_nn(heterogeneous_data): + X, y, categorical_features = heterogeneous_data + smote = SMOTENC( + categorical_features, k_neighbors=CustomNearestNeighbors(n_neighbors=5) + ) + X_res, y_res = smote.fit_resample(X, y) + + assert X_res.shape == (40, 4) + assert Counter(y_res) == {0: 20, 1: 20} + + +@pytest.mark.parametrize( + "smote", + [BorderlineSMOTE(random_state=0), SVMSMOTE(random_state=0)], + ids=["borderline", "svm"], +) +def test_numerical_smote_extra_custom_nn(numerical_data, smote): + X, y = numerical_data + smote.set_params(m_neighbors=CustomNearestNeighbors(n_neighbors=5)) + X_res, y_res = smote.fit_resample(X, y) + + assert X_res.shape == (120, 2) + assert Counter(y_res) == {0: 60, 1: 60} diff --git a/imblearn/over_sampling/_smote/tests/test_smote.py b/imblearn/over_sampling/_smote/tests/test_smote.py index 1cda3e5b8..27e8e8b79 100644 --- a/imblearn/over_sampling/_smote/tests/test_smote.py +++ 
b/imblearn/over_sampling/_smote/tests/test_smote.py @@ -4,15 +4,12 @@ # License: MIT import numpy as np -import pytest from sklearn.utils._testing import assert_allclose from sklearn.utils._testing import assert_array_equal from sklearn.neighbors import NearestNeighbors from imblearn.over_sampling import SMOTE -from imblearn.over_sampling import SVMSMOTE -from imblearn.over_sampling import BorderlineSMOTE RND_SEED = 0 @@ -153,54 +150,3 @@ def test_sample_regular_with_nn(): ) assert_allclose(X_resampled, X_gt, rtol=R_TOL) assert_array_equal(y_resampled, y_gt) - - -@pytest.mark.parametrize( - "smote", [BorderlineSMOTE(), SVMSMOTE()], ids=["borderline", "svm"] -) -def test_smote_m_neighbors(smote): - # check that m_neighbors is properly set. Regression test for: - # https://github.com/scikit-learn-contrib/imbalanced-learn/issues/568 - _ = smote.fit_resample(X, Y) - assert smote.nn_k_.n_neighbors == 6 - assert smote.nn_m_.n_neighbors == 11 - - -def test_sample_cuml_with_nn(): - cuml = pytest.importorskip("cuml") - nn_k = cuml.neighbors.NearestNeighbors(n_neighbors=2) - smote = SMOTE(random_state=RND_SEED, k_neighbors=nn_k) - X_resampled, y_resampled = smote.fit_resample(X, Y) - X_gt = np.array( - [ - [0.11622591, -0.0317206], - [0.77481731, 0.60935141], - [1.25192108, -0.22367336], - [0.53366841, -0.30312976], - [1.52091956, -0.49283504], - [-0.28162401, -2.10400981], - [0.83680821, 1.72827342], - [0.3084254, 0.33299982], - [0.70472253, -0.73309052], - [0.28893132, -0.38761769], - [1.15514042, 0.0129463], - [0.88407872, 0.35454207], - [1.31301027, -0.92648734], - [-1.11515198, -0.93689695], - [-0.18410027, -0.45194484], - [0.9281014, 0.53085498], - [-0.14374509, 0.27370049], - [-0.41635887, -0.38299653], - [0.08711622, 0.93259929], - [1.70580611, -0.11219234], - [1.10580062, 0.00601499], - [1.60506454, -0.31959815], - [1.40109204, -0.74276846], - [0.38584956, -0.20702218], - ] - ) - y_gt = np.array( - [0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 
0, 0] - ) - assert_allclose(X_resampled, X_gt, rtol=R_TOL) - assert_array_equal(y_resampled, y_gt) diff --git a/imblearn/utils/testing.py b/imblearn/utils/testing.py index 5b200c064..c3ce54016 100644 --- a/imblearn/utils/testing.py +++ b/imblearn/utils/testing.py @@ -13,6 +13,7 @@ from pathlib import Path from re import compile +from scipy import sparse from pytest import warns as _warns from sklearn.base import BaseEstimator @@ -168,17 +169,23 @@ def warns(expected_warning, match=None): class CustomNearestNeighbors(BaseEstimator): - """Basic implementation of nearest neighbors not relying on scikit-learn.""" + """Basic implementation of nearest neighbors not relying on scikit-learn. - def __init__(self, n_neighbors=1): + `kneighbors_graph` is ignored and `metric` does not have any impact. + """ + + def __init__(self, n_neighbors=1, metric="euclidean"): self.n_neighbors = n_neighbors + self.metric = metric def fit(self, X, y=None): + X = X.toarray() if sparse.issparse(X) else X self._kd_tree = KDTree(X) return self def kneighbors(self, X, n_neighbors=None, return_distance=True): n_neighbors = n_neighbors if n_neighbors is not None else self.n_neighbors + X = X.toarray() if sparse.issparse(X) else X distances, indices = self._kd_tree.query(X, k=n_neighbors) if return_distance: return distances, indices From 99d52063732424682d7553eb26c5ea918d6dbf66 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sun, 16 Jan 2022 12:35:02 +0100 Subject: [PATCH 43/50] update documentation --- imblearn/over_sampling/_smote/base.py | 45 ++++++++++---- imblearn/over_sampling/_smote/cluster.py | 15 +++-- imblearn/over_sampling/_smote/filter.py | 60 ++++++++++++++----- .../tests/test_cluster_centroids.py | 14 +---- imblearn/utils/testing.py | 10 ++++ 5 files changed, 100 insertions(+), 44 deletions(-) diff --git a/imblearn/over_sampling/_smote/base.py b/imblearn/over_sampling/_smote/base.py index 23bff84c1..01540e19f 100644 --- a/imblearn/over_sampling/_smote/base.py +++ 
b/imblearn/over_sampling/_smote/base.py @@ -224,10 +224,17 @@ class SMOTE(BaseSMOTE): {random_state} k_neighbors : int or object, default=5 - If ``int``, number of nearest neighbours to used to construct synthetic - samples. If object, an estimator that inherits from - :class:`~sklearn.neighbors.base.KNeighborsMixin` that will be used to - find the k_neighbors. + The nearest neighbors used to define the neighborhood of samples to use + to generate the synthetic samples. You can pass: + + - an `int` corresponding to the number of neighbors to use. A + `~sklearn.neighbors.NearestNeighbors` instance will be fitted in this + case. + - an instance of a compatible nearest neighbors algorithm that should + implement both methods `kneighbors` and `kneighbors_graph`. For + instance, it could correspond to a + :class:`~sklearn.neighbors.NearestNeighbors` but could be extended to + any compatible class. {n_jobs} @@ -367,10 +374,17 @@ class SMOTENC(SMOTE): {random_state} k_neighbors : int or object, default=5 - If ``int``, number of nearest neighbours to used to construct synthetic - samples. If object, an estimator that inherits from - :class:`~sklearn.neighbors.base.KNeighborsMixin` that will be used to - find the k_neighbors. + The nearest neighbors used to define the neighborhood of samples to use + to generate the synthetic samples. You can pass: + + - an `int` corresponding to the number of neighbors to use. A + `~sklearn.neighbors.NearestNeighbors` instance will be fitted in this + case. + - an instance of a compatible nearest neighbors algorithm that should + implement both methods `kneighbors` and `kneighbors_graph`. For + instance, it could correspond to a + :class:`~sklearn.neighbors.NearestNeighbors` but could be extended to + any compatible class. {n_jobs} @@ -636,10 +650,17 @@ class SMOTEN(SMOTE): {random_state} k_neighbors : int or object, default=5 - If ``int``, number of nearest neighbours to used to construct synthetic - samples. 
If object, an estimator that inherits from - :class:`~sklearn.neighbors.base.KNeighborsMixin` that will be used to - find the k_neighbors. + The nearest neighbors used to define the neighborhood of samples to use + to generate the synthetic samples. You can pass: + + - an `int` corresponding to the number of neighbors to use. A + `~sklearn.neighbors.NearestNeighbors` instance will be fitted in this + case. + - an instance of a compatible nearest neighbors algorithm that should + implement both methods `kneighbors` and `kneighbors_graph`. For + instance, it could correspond to a + :class:`~sklearn.neighbors.NearestNeighbors` but could be extended to + any compatible class. {n_jobs} diff --git a/imblearn/over_sampling/_smote/cluster.py b/imblearn/over_sampling/_smote/cluster.py index c18e9b7db..871577486 100644 --- a/imblearn/over_sampling/_smote/cluster.py +++ b/imblearn/over_sampling/_smote/cluster.py @@ -45,10 +45,17 @@ class KMeansSMOTE(BaseSMOTE): {random_state} k_neighbors : int or object, default=2 - If ``int``, number of nearest neighbours to used to construct synthetic - samples. If object, an estimator that inherits from - :class:`~sklearn.neighbors.base.KNeighborsMixin` that will be used to - find the k_neighbors. + The nearest neighbors used to define the neighborhood of samples to use + to generate the synthetic samples. You can pass: + + - an `int` corresponding to the number of neighbors to use. A + `~sklearn.neighbors.NearestNeighbors` instance will be fitted in this + case. + - an instance of a compatible nearest neighbors algorithm that should + implement both methods `kneighbors` and `kneighbors_graph`. For + instance, it could correspond to a + :class:`~sklearn.neighbors.NearestNeighbors` but could be extended to + any compatible class. 
{n_jobs} diff --git a/imblearn/over_sampling/_smote/filter.py b/imblearn/over_sampling/_smote/filter.py index 3d303993b..93aebc235 100644 --- a/imblearn/over_sampling/_smote/filter.py +++ b/imblearn/over_sampling/_smote/filter.py @@ -47,18 +47,32 @@ class BorderlineSMOTE(BaseSMOTE): {random_state} k_neighbors : int or object, default=5 - If ``int``, number of nearest neighbours to used to construct synthetic - samples. If object, an estimator that inherits from - :class:`~sklearn.neighbors.base.KNeighborsMixin` that will be used to - find the k_neighbors. + The nearest neighbors used to define the neighborhood of samples to use + to generate the synthetic samples. You can pass: + + - an `int` corresponding to the number of neighbors to use. A + `~sklearn.neighbors.NearestNeighbors` instance will be fitted in this + case. + - an instance of a compatible nearest neighbors algorithm that should + implement both methods `kneighbors` and `kneighbors_graph`. For + instance, it could correspond to a + :class:`~sklearn.neighbors.NearestNeighbors` but could be extended to + any compatible class. {n_jobs} m_neighbors : int or object, default=10 - If int, number of nearest neighbours to use to determine if a minority - sample is in danger. If object, an estimator that inherits - from :class:`~sklearn.neighbors.base.KNeighborsMixin` that will be used - to find the m_neighbors. + The nearest neighbors used to determine if a minority sample is in + "danger". You can pass: + + - an `int` corresponding to the number of neighbors to use. A + `~sklearn.neighbors.NearestNeighbors` instance will be fitted in this + case. + - an instance of a compatible nearest neighbors algorithm that should + implement both methods `kneighbors` and `kneighbors_graph`. For + instance, it could correspond to a + :class:`~sklearn.neighbors.NearestNeighbors` but could be extended to + any compatible class. 
kind : {{"borderline-1", "borderline-2"}}, default='borderline-1' The type of SMOTE algorithm to use one of the following options: @@ -261,18 +275,32 @@ class SVMSMOTE(BaseSMOTE): {random_state} k_neighbors : int or object, default=5 - If ``int``, number of nearest neighbours to used to construct synthetic - samples. If object, an estimator that inherits from - :class:`~sklearn.neighbors.base.KNeighborsMixin` that will be used to - find the k_neighbors. + The nearest neighbors used to define the neighborhood of samples to use + to generate the synthetic samples. You can pass: + + - an `int` corresponding to the number of neighbors to use. A + `~sklearn.neighbors.NearestNeighbors` instance will be fitted in this + case. + - an instance of a compatible nearest neighbors algorithm that should + implement both methods `kneighbors` and `kneighbors_graph`. For + instance, it could correspond to a + :class:`~sklearn.neighbors.NearestNeighbors` but could be extended to + any compatible class. {n_jobs} m_neighbors : int or object, default=10 - If int, number of nearest neighbours to use to determine if a minority - sample is in danger. If object, an estimator that inherits from - :class:`~sklearn.neighbors.base.KNeighborsMixin` that will be used to - find the m_neighbors. + The nearest neighbors used to determine if a minority sample is in + "danger". You can pass: + + - an `int` corresponding to the number of neighbors to use. A + `~sklearn.neighbors.NearestNeighbors` instance will be fitted in this + case. + - an instance of a compatible nearest neighbors algorithm that should + implement both methods `kneighbors` and `kneighbors_graph`. For + instance, it could correspond to a + :class:`~sklearn.neighbors.NearestNeighbors` but could be extended to + any compatible class. svm_estimator : estimator object, default=SVC() A parametrized :class:`~sklearn.svm.SVC` classifier can be passed. 
diff --git a/imblearn/under_sampling/_prototype_generation/tests/test_cluster_centroids.py b/imblearn/under_sampling/_prototype_generation/tests/test_cluster_centroids.py index bb8c9bd0c..291522930 100644 --- a/imblearn/under_sampling/_prototype_generation/tests/test_cluster_centroids.py +++ b/imblearn/under_sampling/_prototype_generation/tests/test_cluster_centroids.py @@ -5,12 +5,12 @@ import numpy as np from scipy import sparse -from sklearn.base import BaseEstimator from sklearn.linear_model import LogisticRegression from sklearn.cluster import KMeans from sklearn.datasets import make_classification from imblearn.under_sampling import ClusterCentroids +from imblearn.utils.testing import CustomClusterer RND_SEED = 0 X = np.array( @@ -155,16 +155,6 @@ def test_cluster_centroids_hard_target_class(): assert sum(sample_from_minority_in_majority) == 0 -class FakeCluster(BaseEstimator): - """Class that mimics a cluster that does not expose `cluster_centers_`.""" - - def __init__(self, n_clusters=1): - self.n_clusters = n_clusters - - def fit(self, X, y=None): - return self - - def test_cluster_centroids_error_estimator(): """Check that an error is raised when estimator does not have a cluster API.""" @@ -180,4 +170,4 @@ def test_cluster_centroids_error_estimator(): "`cluster_centers_`." 
) with pytest.raises(RuntimeError, match=err_msg): - ClusterCentroids(estimator=FakeCluster()).fit_resample(X, Y) + ClusterCentroids(estimator=CustomClusterer()).fit_resample(X, Y) diff --git a/imblearn/utils/testing.py b/imblearn/utils/testing.py index c3ce54016..12d09e8b5 100644 --- a/imblearn/utils/testing.py +++ b/imblearn/utils/testing.py @@ -195,3 +195,13 @@ def kneighbors_graph(X=None, n_neighbors=None, mode="connectivity"): """This method is not used within imblearn but it is required for duck-typing.""" pass + + +class CustomClusterer(BaseEstimator): + """Class that mimics a cluster that does not expose `cluster_centers_`.""" + + def __init__(self, n_clusters=1): + self.n_clusters = n_clusters + + def fit(self, X, y=None): + return self From 48d1fd5741646f16cb8f276926a2b66a6b29f30f Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sun, 16 Jan 2022 13:09:48 +0100 Subject: [PATCH 44/50] iter --- imblearn/over_sampling/_adasyn.py | 16 ++- .../over_sampling/_smote/tests/test_common.py | 114 ------------------ 2 files changed, 11 insertions(+), 119 deletions(-) delete mode 100644 imblearn/over_sampling/_smote/tests/test_common.py diff --git a/imblearn/over_sampling/_adasyn.py b/imblearn/over_sampling/_adasyn.py index cbfeeda22..bd50378c2 100644 --- a/imblearn/over_sampling/_adasyn.py +++ b/imblearn/over_sampling/_adasyn.py @@ -39,10 +39,17 @@ class ADASYN(BaseOverSampler): {random_state} n_neighbors : int or estimator object, default=5 - If ``int``, number of nearest neighbours to used to construct synthetic - samples. If object, an estimator that inherits from - :class:`~sklearn.neighbors.base.KNeighborsMixin` that will be used to - find the k_neighbors. + The nearest neighbors used to define the neighborhood of samples to use + to generate the synthetic samples. You can pass: + + - an `int` corresponding to the number of neighbors to use. A + `~sklearn.neighbors.NearestNeighbors` instance will be fitted in this + case. 
+ - an instance of a compatible nearest neighbors algorithm that should + implement both methods `kneighbors` and `kneighbors_graph`. For + instance, it could correspond to a + :class:`~sklearn.neighbors.NearestNeighbors` but could be extended to + any compatible class. {n_jobs} @@ -124,7 +131,6 @@ def _validate_estimator(self): self.nn_ = check_neighbors_object( "n_neighbors", self.n_neighbors, additional_neighbor=1 ) - self.nn_.set_params(**{"n_jobs": self.n_jobs}) def _fit_resample(self, X, y): self._validate_estimator() diff --git a/imblearn/over_sampling/_smote/tests/test_common.py b/imblearn/over_sampling/_smote/tests/test_common.py deleted file mode 100644 index dea8046b0..000000000 --- a/imblearn/over_sampling/_smote/tests/test_common.py +++ /dev/null @@ -1,114 +0,0 @@ -from collections import Counter - -import pytest -import numpy as np - -from imblearn.over_sampling import ( - BorderlineSMOTE, - KMeansSMOTE, - SMOTE, - SMOTEN, - SMOTENC, - SVMSMOTE, -) -from imblearn.utils.testing import CustomNearestNeighbors - - -@pytest.fixture -def numerical_data(): - rng = np.random.RandomState(0) - X = rng.randn(100, 2) - y = np.repeat([0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0], 5) - - return X, y - - -@pytest.fixture -def categorical_data(): - rng = np.random.RandomState(0) - - feature_1 = ["A"] * 10 + ["B"] * 20 + ["C"] * 30 - feature_2 = ["A"] * 40 + ["B"] * 20 - feature_3 = ["A"] * 20 + ["B"] * 20 + ["C"] * 10 + ["D"] * 10 - X = np.array([feature_1, feature_2, feature_3], dtype=object).T - rng.shuffle(X) - y = np.array([0] * 20 + [1] * 40, dtype=np.int32) - y_labels = np.array(["not apple", "apple"], dtype=object) - y = y_labels[y] - return X, y - - -@pytest.fixture -def heterogeneous_data(): - rng = np.random.RandomState(42) - X = np.empty((30, 4), dtype=object) - X[:, :2] = rng.randn(30, 2) - X[:, 2] = rng.choice(["a", "b", "c"], size=30).astype(object) - X[:, 3] = rng.randint(3, size=30) - y = np.array([0] * 10 + [1] * 20) - return X, y, [2, 
3] - - -@pytest.mark.parametrize( - "smote", [BorderlineSMOTE(), SVMSMOTE()], ids=["borderline", "svm"] -) -def test_smote_m_neighbors(numerical_data, smote): - # check that m_neighbors is properly set. Regression test for: - # https://github.com/scikit-learn-contrib/imbalanced-learn/issues/568 - X, y = numerical_data - _ = smote.fit_resample(X, y) - assert smote.nn_k_.n_neighbors == 6 - assert smote.nn_m_.n_neighbors == 11 - - -@pytest.mark.parametrize( - "smote", - [ - BorderlineSMOTE(random_state=0), - KMeansSMOTE(random_state=1), - SMOTE(random_state=0), - SVMSMOTE(random_state=0), - ], - ids=["borderline", "kmeans", "smote", "svm"], -) -def test_numerical_smote_k_custom_nn(numerical_data, smote): - X, y = numerical_data - smote.set_params(k_neighbors=CustomNearestNeighbors(n_neighbors=5)) - X_res, y_res = smote.fit_resample(X, y) - - assert X_res.shape == (120, 2) - assert Counter(y_res) == {0: 60, 1: 60} - - -def test_categorical_smote_k_custom_nn(categorical_data): - X, y = categorical_data - smote = SMOTEN(k_neighbors=CustomNearestNeighbors(n_neighbors=5)) - X_res, y_res = smote.fit_resample(X, y) - - assert X_res.shape == (80, 3) - assert Counter(y_res) == {"apple": 40, "not apple": 40} - - -def test_heterogeneous_smote_k_custom_nn(heterogeneous_data): - X, y, categorical_features = heterogeneous_data - smote = SMOTENC( - categorical_features, k_neighbors=CustomNearestNeighbors(n_neighbors=5) - ) - X_res, y_res = smote.fit_resample(X, y) - - assert X_res.shape == (40, 4) - assert Counter(y_res) == {0: 20, 1: 20} - - -@pytest.mark.parametrize( - "smote", - [BorderlineSMOTE(random_state=0), SVMSMOTE(random_state=0)], - ids=["borderline", "svm"], -) -def test_numerical_smote_extra_custom_nn(numerical_data, smote): - X, y = numerical_data - smote.set_params(m_neighbors=CustomNearestNeighbors(n_neighbors=5)) - X_res, y_res = smote.fit_resample(X, y) - - assert X_res.shape == (120, 2) - assert Counter(y_res) == {0: 60, 1: 60} From 
18b60572dcb0386e4ff7f3ee6b90bc7c9a3f28e4 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sun, 16 Jan 2022 13:09:58 +0100 Subject: [PATCH 45/50] iter --- imblearn/over_sampling/tests/test_common.py | 118 ++++++++++++++++++++ 1 file changed, 118 insertions(+) create mode 100644 imblearn/over_sampling/tests/test_common.py diff --git a/imblearn/over_sampling/tests/test_common.py b/imblearn/over_sampling/tests/test_common.py new file mode 100644 index 000000000..57bc6cbe3 --- /dev/null +++ b/imblearn/over_sampling/tests/test_common.py @@ -0,0 +1,118 @@ +from collections import Counter + +import pytest +import numpy as np + +from imblearn.over_sampling import ( + ADASYN, + BorderlineSMOTE, + KMeansSMOTE, + SMOTE, + SMOTEN, + SMOTENC, + SVMSMOTE, +) +from imblearn.utils.testing import CustomNearestNeighbors + + +@pytest.fixture +def numerical_data(): + rng = np.random.RandomState(0) + X = rng.randn(100, 2) + y = np.repeat([0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0], 5) + + return X, y + + +@pytest.fixture +def categorical_data(): + rng = np.random.RandomState(0) + + feature_1 = ["A"] * 10 + ["B"] * 20 + ["C"] * 30 + feature_2 = ["A"] * 40 + ["B"] * 20 + feature_3 = ["A"] * 20 + ["B"] * 20 + ["C"] * 10 + ["D"] * 10 + X = np.array([feature_1, feature_2, feature_3], dtype=object).T + rng.shuffle(X) + y = np.array([0] * 20 + [1] * 40, dtype=np.int32) + y_labels = np.array(["not apple", "apple"], dtype=object) + y = y_labels[y] + return X, y + + +@pytest.fixture +def heterogeneous_data(): + rng = np.random.RandomState(42) + X = np.empty((30, 4), dtype=object) + X[:, :2] = rng.randn(30, 2) + X[:, 2] = rng.choice(["a", "b", "c"], size=30).astype(object) + X[:, 3] = rng.randint(3, size=30) + y = np.array([0] * 10 + [1] * 20) + return X, y, [2, 3] + + +@pytest.mark.parametrize( + "smote", [BorderlineSMOTE(), SVMSMOTE()], ids=["borderline", "svm"] +) +def test_smote_m_neighbors(numerical_data, smote): + # check that m_neighbors is properly set. 
Regression test for: + # https://github.com/scikit-learn-contrib/imbalanced-learn/issues/568 + X, y = numerical_data + _ = smote.fit_resample(X, y) + assert smote.nn_k_.n_neighbors == 6 + assert smote.nn_m_.n_neighbors == 11 + + +@pytest.mark.parametrize( + "smote, neighbor_estimator_name", + [ + (ADASYN(random_state=0), "n_neighbors"), + (BorderlineSMOTE(random_state=0), "k_neighbors"), + (KMeansSMOTE(random_state=1), "k_neighbors"), + (SMOTE(random_state=0), "k_neighbors"), + (SVMSMOTE(random_state=0), "k_neighbors"), + ], + ids=["adasyn", "borderline", "kmeans", "smote", "svm"], +) +def test_numerical_smote_custom_nn(numerical_data, smote, neighbor_estimator_name): + X, y = numerical_data + params = { + neighbor_estimator_name: CustomNearestNeighbors(n_neighbors=5), + } + smote.set_params(**params) + X_res, _ = smote.fit_resample(X, y) + + assert X_res.shape[0] >= 120 + + +def test_categorical_smote_k_custom_nn(categorical_data): + X, y = categorical_data + smote = SMOTEN(k_neighbors=CustomNearestNeighbors(n_neighbors=5)) + X_res, y_res = smote.fit_resample(X, y) + + assert X_res.shape == (80, 3) + assert Counter(y_res) == {"apple": 40, "not apple": 40} + + +def test_heterogeneous_smote_k_custom_nn(heterogeneous_data): + X, y, categorical_features = heterogeneous_data + smote = SMOTENC( + categorical_features, k_neighbors=CustomNearestNeighbors(n_neighbors=5) + ) + X_res, y_res = smote.fit_resample(X, y) + + assert X_res.shape == (40, 4) + assert Counter(y_res) == {0: 20, 1: 20} + + +@pytest.mark.parametrize( + "smote", + [BorderlineSMOTE(random_state=0), SVMSMOTE(random_state=0)], + ids=["borderline", "svm"], +) +def test_numerical_smote_extra_custom_nn(numerical_data, smote): + X, y = numerical_data + smote.set_params(m_neighbors=CustomNearestNeighbors(n_neighbors=5)) + X_res, y_res = smote.fit_resample(X, y) + + assert X_res.shape == (120, 2) + assert Counter(y_res) == {0: 60, 1: 60} From 76fbd596cc5bb9c42382a5b34356370e2e8d7892 Mon Sep 17 00:00:00 2001 From: 
Guillaume Lemaitre Date: Sun, 16 Jan 2022 13:11:38 +0100 Subject: [PATCH 46/50] revert redirector --- .../workflows/circleci-artifacts-redirector.yml | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 .github/workflows/circleci-artifacts-redirector.yml diff --git a/.github/workflows/circleci-artifacts-redirector.yml b/.github/workflows/circleci-artifacts-redirector.yml new file mode 100644 index 000000000..71f0cd932 --- /dev/null +++ b/.github/workflows/circleci-artifacts-redirector.yml @@ -0,0 +1,14 @@ +name: circleci-artifacts-redirector + +on: [status] +jobs: + circleci_artifacts_redirector_job: + runs-on: ubuntu-latest + name: Run CircleCI artifacts redirector + steps: + - name: GitHub Action step + uses: larsoner/circleci-artifacts-redirector-action@master + with: + repo-token: ${{ secrets.GITHUB_TOKEN }} + artifact-path: doc/index.html + circleci-jobs: documentation From 8fa97ed64cbff65a256b8adfa52d25e37bb6a3ba Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sun, 16 Jan 2022 13:15:43 +0100 Subject: [PATCH 47/50] add changelog --- doc/whats_new/v0.10.rst | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/doc/whats_new/v0.10.rst b/doc/whats_new/v0.10.rst index ea585d986..ff37d151a 100644 --- a/doc/whats_new/v0.10.rst +++ b/doc/whats_new/v0.10.rst @@ -5,3 +5,11 @@ Version 0.10.0 (ongoing) Changelog --------- + +Enhancements +............ + +- Add support to accept compatible `NearestNeighbors` objects by only + duck-typing. For instance, it allows to accept cuML instances. + :pr:`858` by :user:`NV-jpt <NV-jpt>` and + :user:`Guillaume Lemaitre <glemaitre>`.
From 615a2bf1ef8243c04b879c4964fdae44fdc4d553 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sun, 16 Jan 2022 13:16:44 +0100 Subject: [PATCH 48/50] remove duplicated test --- imblearn/over_sampling/tests/test_adasyn.py | 43 --------------------- 1 file changed, 43 deletions(-) diff --git a/imblearn/over_sampling/tests/test_adasyn.py b/imblearn/over_sampling/tests/test_adasyn.py index 4d7bd38fd..65ce69182 100644 --- a/imblearn/over_sampling/tests/test_adasyn.py +++ b/imblearn/over_sampling/tests/test_adasyn.py @@ -142,46 +142,3 @@ def test_adasyn_error(adasyn_params, err_msg): adasyn = ADASYN(**adasyn_params) with pytest.raises(ValueError, match=err_msg): adasyn.fit_resample(X, Y) - - -def test_ada_fit_resample_cuml_nn_obj(): - cuml = pytest.importorskip("cuml") - nn = cuml.neighbors.NearestNeighbors(n_neighbors=2) - ada = ADASYN(random_state=RND_SEED, n_neighbors=nn) - X_resampled, y_resampled = ada.fit_resample(X, Y) - X_gt = np.array( - [ - [0.11622591, -0.0317206], - [0.77481731, 0.60935141], - [1.25192108, -0.22367336], - [0.53366841, -0.30312976], - [1.52091956, -0.49283504], - [-0.28162401, -2.10400981], - [0.83680821, 1.72827342], - [0.3084254, 0.33299982], - [0.70472253, -0.73309052], - [0.28893132, -0.38761769], - [1.15514042, 0.0129463], - [0.88407872, 0.35454207], - [1.31301027, -0.92648734], - [-1.11515198, -0.93689695], - [-0.18410027, -0.45194484], - [0.9281014, 0.53085498], - [-0.14374509, 0.27370049], - [-0.41635887, -0.38299653], - [0.08711622, 0.93259929], - [1.70580611, -0.11219234], - [0.34532399, -0.18067361], - [1.44430593, -0.41617493], - [0.28204936, -0.13953426], - [1.08450984, 0.03948221], - [-0.19072677, -0.2341768], - ] - ) - - y_gt = np.array( - [0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0] - ) - - assert_allclose(X_resampled, X_gt, rtol=R_TOL) - assert_array_equal(y_resampled, y_gt) From b75b77d10f9ecdc36fee0d897d0f672b6cf01c1f Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sun, 16 Jan 
2022 13:44:58 +0100 Subject: [PATCH 49/50] make testing function private --- imblearn/over_sampling/tests/test_common.py | 10 +++++----- .../tests/test_cluster_centroids.py | 4 ++-- imblearn/utils/testing.py | 4 ++-- imblearn/utils/tests/test_testing.py | 4 ++-- 4 files changed, 11 insertions(+), 11 deletions(-) diff --git a/imblearn/over_sampling/tests/test_common.py b/imblearn/over_sampling/tests/test_common.py index 57bc6cbe3..804382c08 100644 --- a/imblearn/over_sampling/tests/test_common.py +++ b/imblearn/over_sampling/tests/test_common.py @@ -12,7 +12,7 @@ SMOTENC, SVMSMOTE, ) -from imblearn.utils.testing import CustomNearestNeighbors +from imblearn.utils.testing import _CustomNearestNeighbors @pytest.fixture @@ -76,7 +76,7 @@ def test_smote_m_neighbors(numerical_data, smote): def test_numerical_smote_custom_nn(numerical_data, smote, neighbor_estimator_name): X, y = numerical_data params = { - neighbor_estimator_name: CustomNearestNeighbors(n_neighbors=5), + neighbor_estimator_name: _CustomNearestNeighbors(n_neighbors=5), } smote.set_params(**params) X_res, _ = smote.fit_resample(X, y) @@ -86,7 +86,7 @@ def test_numerical_smote_custom_nn(numerical_data, smote, neighbor_estimator_nam def test_categorical_smote_k_custom_nn(categorical_data): X, y = categorical_data - smote = SMOTEN(k_neighbors=CustomNearestNeighbors(n_neighbors=5)) + smote = SMOTEN(k_neighbors=_CustomNearestNeighbors(n_neighbors=5)) X_res, y_res = smote.fit_resample(X, y) assert X_res.shape == (80, 3) @@ -96,7 +96,7 @@ def test_categorical_smote_k_custom_nn(categorical_data): def test_heterogeneous_smote_k_custom_nn(heterogeneous_data): X, y, categorical_features = heterogeneous_data smote = SMOTENC( - categorical_features, k_neighbors=CustomNearestNeighbors(n_neighbors=5) + categorical_features, k_neighbors=_CustomNearestNeighbors(n_neighbors=5) ) X_res, y_res = smote.fit_resample(X, y) @@ -111,7 +111,7 @@ def test_heterogeneous_smote_k_custom_nn(heterogeneous_data): ) def 
test_numerical_smote_extra_custom_nn(numerical_data, smote): X, y = numerical_data - smote.set_params(m_neighbors=CustomNearestNeighbors(n_neighbors=5)) + smote.set_params(m_neighbors=_CustomNearestNeighbors(n_neighbors=5)) X_res, y_res = smote.fit_resample(X, y) assert X_res.shape == (120, 2) diff --git a/imblearn/under_sampling/_prototype_generation/tests/test_cluster_centroids.py b/imblearn/under_sampling/_prototype_generation/tests/test_cluster_centroids.py index 291522930..d7399dffd 100644 --- a/imblearn/under_sampling/_prototype_generation/tests/test_cluster_centroids.py +++ b/imblearn/under_sampling/_prototype_generation/tests/test_cluster_centroids.py @@ -10,7 +10,7 @@ from sklearn.datasets import make_classification from imblearn.under_sampling import ClusterCentroids -from imblearn.utils.testing import CustomClusterer +from imblearn.utils.testing import _CustomClusterer RND_SEED = 0 X = np.array( @@ -170,4 +170,4 @@ def test_cluster_centroids_error_estimator(): "`cluster_centers_`." ) with pytest.raises(RuntimeError, match=err_msg): - ClusterCentroids(estimator=CustomClusterer()).fit_resample(X, Y) + ClusterCentroids(estimator=_CustomClusterer()).fit_resample(X, Y) diff --git a/imblearn/utils/testing.py b/imblearn/utils/testing.py index 12d09e8b5..88abebd1b 100644 --- a/imblearn/utils/testing.py +++ b/imblearn/utils/testing.py @@ -168,7 +168,7 @@ def warns(expected_warning, match=None): pass -class CustomNearestNeighbors(BaseEstimator): +class _CustomNearestNeighbors(BaseEstimator): """Basic implementation of nearest neighbors not relying on scikit-learn. `kneighbors_graph` is ignored and `metric` does not have any impact. 
@@ -197,7 +197,7 @@ def kneighbors_graph(X=None, n_neighbors=None, mode="connectivity"): pass -class CustomClusterer(BaseEstimator): +class _CustomClusterer(BaseEstimator): """Class that mimics a cluster that does not expose `cluster_centers_`.""" def __init__(self, n_clusters=1): diff --git a/imblearn/utils/tests/test_testing.py b/imblearn/utils/tests/test_testing.py index eb3b09ac7..5d4ce2bde 100644 --- a/imblearn/utils/tests/test_testing.py +++ b/imblearn/utils/tests/test_testing.py @@ -10,7 +10,7 @@ from sklearn.neighbors._base import KNeighborsMixin from imblearn.base import SamplerMixin -from imblearn.utils.testing import all_estimators, CustomNearestNeighbors +from imblearn.utils.testing import all_estimators, _CustomNearestNeighbors from imblearn.utils.testing import warns @@ -69,7 +69,7 @@ def test_custom_nearest_neighbors(): """Check that our custom nearest neighbors can be used for our internal duck-typing.""" - neareat_neighbors = CustomNearestNeighbors(n_neighbors=3) + neareat_neighbors = _CustomNearestNeighbors(n_neighbors=3) assert not isinstance(neareat_neighbors, KNeighborsMixin) assert hasattr(neareat_neighbors, "kneighbors") From b627cf1cc471256c3fce3275401e947315024f28 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sun, 16 Jan 2022 14:07:52 +0100 Subject: [PATCH 50/50] iter --- imblearn/utils/tests/test_validation.py | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) diff --git a/imblearn/utils/tests/test_validation.py b/imblearn/utils/tests/test_validation.py index 284abb3a6..81db35a37 100644 --- a/imblearn/utils/tests/test_validation.py +++ b/imblearn/utils/tests/test_validation.py @@ -9,15 +9,14 @@ import pytest import numpy as np -from sklearn.base import BaseEstimator from sklearn.neighbors._base import KNeighborsMixin from sklearn.neighbors import NearestNeighbors from sklearn.utils._testing import assert_array_equal -from imblearn.utils.testing import warns from imblearn.utils import check_neighbors_object 
from imblearn.utils import check_sampling_strategy from imblearn.utils import check_target_type +from imblearn.utils.testing import warns, _CustomNearestNeighbors from imblearn.utils._validation import ArraysTransformer from imblearn.utils._validation import _deprecate_positional_args @@ -25,16 +24,6 @@ binary_target = np.array([1] * 25 + [0] * 100) -class KNNLikeEstimator(BaseEstimator): - """A class exposing the same KNeighborsMixin API than KNeighborsClassifier.""" - - def kneighbors(self, X): - return np.ones((len(X), 1)) - - def kneighbors_graph(self, X): - return np.ones((len(X), 1)) - - def test_check_neighbors_object(): name = "n_neighbors" n_neighbors = 1 @@ -47,9 +36,9 @@ def test_check_neighbors_object(): estimator = NearestNeighbors(n_neighbors=n_neighbors) estimator_cloned = check_neighbors_object(name, estimator) assert estimator.n_neighbors == estimator_cloned.n_neighbors - estimator = KNNLikeEstimator() + estimator = _CustomNearestNeighbors() estimator_cloned = check_neighbors_object(name, estimator) - assert isinstance(estimator_cloned, KNNLikeEstimator) + assert isinstance(estimator_cloned, _CustomNearestNeighbors) n_neighbors = "rnd" err_msg = ( "n_neighbors must be an interger or an object compatible with the "