From 520c69feffa72b8989b1c6b14aef45d739bb99e5 Mon Sep 17 00:00:00 2001 From: Frits Hermans Date: Fri, 31 Jan 2025 16:20:59 +0100 Subject: [PATCH 01/18] implement InstanceHardnessCV --- imblearn/cross_validation/__init__.py | 3 + .../cross_validation/_cross_validation.py | 68 +++++++++++++++++++ imblearn/cross_validation/tests/__init__.py | 0 .../tests/test_instance_hardness.py | 30 ++++++++ 4 files changed, 101 insertions(+) create mode 100644 imblearn/cross_validation/__init__.py create mode 100644 imblearn/cross_validation/_cross_validation.py create mode 100644 imblearn/cross_validation/tests/__init__.py create mode 100644 imblearn/cross_validation/tests/test_instance_hardness.py diff --git a/imblearn/cross_validation/__init__.py b/imblearn/cross_validation/__init__.py new file mode 100644 index 000000000..b6f646989 --- /dev/null +++ b/imblearn/cross_validation/__init__.py @@ -0,0 +1,3 @@ +from ._cross_validation import InstanceHardnessCV + +__all__ = ["InstanceHardnessCV"] diff --git a/imblearn/cross_validation/_cross_validation.py b/imblearn/cross_validation/_cross_validation.py new file mode 100644 index 000000000..b5ea76667 --- /dev/null +++ b/imblearn/cross_validation/_cross_validation.py @@ -0,0 +1,68 @@ +import numpy as np +import pandas as pd +from sklearn.ensemble import RandomForestClassifier +from sklearn.model_selection import StratifiedGroupKFold, cross_val_predict + + +class InstanceHardnessCV: + """Instance-hardness CV splitter + + CV splitter that distributes samples with large instance hardness equally + over the folds + + Parameters + ---------- + n_splits : int, default=5 + Number of folds. Must be at least 2. + + clf : classifier, default=None + Classifier used to determine instance hardness. Defaults to + RandomForestClassifier when set to `None` + + random_state : int, RandomState instance, default=None + Determines random_state for reproducible results across multiple calls. 
+ + Examples + -------- + >>> from imblearn.cross_validation import InstanceHardnessCV + >>> from sklearn.datasets import make_classification + >>> from sklearn.model_selection import cross_validate + >>> from sklearn.linear_model import LogisticRegression + >>> X, y = make_classification(weights=[0.9, 0.1], class_sep=2, + ... n_informative=3, n_redundant=1, flip_y=0.05, n_samples=1000, random_state=10) + >>> ih_cv = InstanceHardnessCV(n_splits=5, random_state=10) + >>> clf = LogisticRegression(random_state=10) + >>> cv_result = cross_validate(clf, X, y, cv=ih_cv) + >>> print(f"Standard deviation of test_scores: {cv_result['test_score'].std():.3f}") + Standard deviation of test_scores: 0.005 + """ + + def __init__(self, n_splits=5, clf=None, random_state=None): + self.n_splits = n_splits + self.clf = clf + self.random_state = random_state + + def split(self, X, y, groups=None): + df = pd.DataFrame(X) + features = df.columns + df["y"] = y + if self.clf is not None: + self.clf_ = self.clf + else: + self.clf_ = RandomForestClassifier( + n_jobs=-1, class_weight="balanced", random_state=self.random_state + ) + df["proba"] = cross_val_predict( + self.clf_, df[features], df["y"], cv=self.n_splits, method="predict_proba" + )[:, 1] + df["hardness"] = abs(df["y"] - df["proba"]) + df = df.sort_values("hardness") + df["group"] = np.arange(len(df)) % self.n_splits + cv = StratifiedGroupKFold( + n_splits=self.n_splits, shuffle=True, random_state=self.random_state + ) + for train_index, test_index in cv.split(df[features], df["y"], df["group"]): + yield train_index, test_index + + def get_n_splits(self, X=None, y=None, groups=None): + return self.n_splits diff --git a/imblearn/cross_validation/tests/__init__.py b/imblearn/cross_validation/tests/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/imblearn/cross_validation/tests/test_instance_hardness.py b/imblearn/cross_validation/tests/test_instance_hardness.py new file mode 100644 index 000000000..7e3b91fed 
--- /dev/null +++ b/imblearn/cross_validation/tests/test_instance_hardness.py @@ -0,0 +1,30 @@ +import pytest +from sklearn.datasets import make_classification +from sklearn.linear_model import LogisticRegression +from sklearn.model_selection import cross_validate +from sklearn.utils._testing import assert_almost_equal + +from imblearn.cross_validation import InstanceHardnessCV + +X, y = make_classification( + weights=[0.9, 0.1], + class_sep=2, + n_informative=3, + n_redundant=1, + flip_y=0.05, + n_samples=1000, + random_state=10, +) + + +def test_instancehardness_cv(): + ih_cv = InstanceHardnessCV() + clf = LogisticRegression(random_state=10) + cv_result = cross_validate(clf, X, y, cv=ih_cv) + assert_almost_equal(cv_result["test_score"].std(), 0.005, decimal=3) + + +@pytest.mark.parametrize("n_splits", [2, 3, 4]) +def test_instancehardness_cv_n_splits(n_splits): + ih_cv = InstanceHardnessCV(n_splits=n_splits, random_state=10) + assert ih_cv.get_n_splits() == n_splits From f54f0352a4f86c8a84e946abbf6e64dc382cfc20 Mon Sep 17 00:00:00 2001 From: fritshermans Date: Sat, 1 Feb 2025 13:15:40 +0100 Subject: [PATCH 02/18] add documentation --- doc/references/index.rst | 1 + examples/cross_validation/README.txt | 6 + .../plot_instance_hardness_cv.py | 105 ++++++++++++++++++ 3 files changed, 112 insertions(+) create mode 100644 examples/cross_validation/README.txt create mode 100644 examples/cross_validation/plot_instance_hardness_cv.py diff --git a/doc/references/index.rst b/doc/references/index.rst index f5fe3bf53..be102052a 100644 --- a/doc/references/index.rst +++ b/doc/references/index.rst @@ -18,5 +18,6 @@ This is the full API documentation of the `imbalanced-learn` toolbox. miscellaneous pipeline metrics + cross_validation datasets utils diff --git a/examples/cross_validation/README.txt b/examples/cross_validation/README.txt new file mode 100644 index 000000000..ee39d15c3 --- /dev/null +++ b/examples/cross_validation/README.txt @@ -0,0 +1,6 @@ +.. 
_cross_validation_examples: + +Example using cross validation classes +====================================== + +Cross validation classes to be used for classification problems with imbalanced class distributions diff --git a/examples/cross_validation/plot_instance_hardness_cv.py b/examples/cross_validation/plot_instance_hardness_cv.py new file mode 100644 index 000000000..a48a39d6a --- /dev/null +++ b/examples/cross_validation/plot_instance_hardness_cv.py @@ -0,0 +1,105 @@ +""" +=================================================== +Distribute hard-to-classify datapoint over CV folds +=================================================== + +'Instance hardness' refers to the difficulty to classify an instance. The way +hard-to-classify instances are distributed over train and test sets has +significant effect on the test set performance metrics. In this example we +show how to deal with this problem. We are making the comparison with normal +StratifiedKFold cv splitting. +""" + +# Authors: Frits Hermans, https://fritshermans.github.io +# License: MIT + +# %% +print(__doc__) + +# %% [markdown] +# Create an imbalanced dataset with instance hardness +# --------------------------------------------------- +# +# We will create an imbalanced dataset with using scikit-learn's `make_blobs` +# function and the `make_imbalance` function. The imbalancedness is set to +# 0.1; only 10% of the labels is positive. 
+ + +import numpy as np +from matplotlib import pyplot as plt +from sklearn.datasets import make_blobs + +from imblearn.datasets import make_imbalance + +X, y = make_blobs(n_samples=1000, centers=((-3, 0), (3, 0)), random_state=10) + + +# %% +def sampling_strategy(ratio): + def strategy(y): + return {0: sum(y), 1: int(ratio * sum(y) / (1 - ratio))} + + return strategy + + +X, y = make_imbalance(X, y, sampling_strategy=sampling_strategy(0.1), random_state=10) +plt.scatter(X[:, 0], X[:, 1], c=y) +plt.show() + +# %% +# To introduce instance hardness in our dataset, we flip the labels at the +# boundaries of the feature space +y[np.argsort(X[:, 0])[:5]] = 1 +y[np.argsort(X[:, 0])[-5:]] = 0 +plt.scatter(X[:, 0], X[:, 1], c=y) +plt.show() + +# %% [markdown] +# Compare cross validation scores using StratifiedKFold and InstanceHardnessCV +# ---------------------------------------------------------------------------- +# +# We calculate cross validation scores using `cross_validate` and a +# `LogisticRegression` classifier. We compare the results using a +# `StratifiedKFold` cv splitter and an `InstanceHardnessCV` splitter. +# As we are dealing with an imbalanced classification problem, we +# use `average_precision` for scoring. + +from sklearn.linear_model import LogisticRegression +from sklearn.model_selection import StratifiedKFold, cross_validate + +from imblearn.cross_validation import InstanceHardnessCV + +# %% +clf = LogisticRegression() + +# %% +skf_cv = StratifiedKFold(n_splits=5) +skf_result = cross_validate(clf, X, y, cv=skf_cv, scoring="average_precision") + +# %% +ih_cv = InstanceHardnessCV(n_splits=5, random_state=10) +ih_result = cross_validate(clf, X, y, cv=ih_cv) + +# %% +# The boxplot below shows that the `InstanceHardnessCV` splitter results +# in less variation of average precision than `StratifiedKFold` splitter. +# When doing hyperparameter tuning or feature selection using a wrapper +# method (like `RFECV`) this will give more stable results. 
+ +import pandas as pd + +ax = ( + pd.concat( + (pd.DataFrame(skf_result), pd.DataFrame(ih_result)), + axis=1, + keys=["StratifiedKFold", "InstanceHardnessCV"], + ) + .swaplevel(axis="columns")["test_score"] + .plot.box( + color={"whiskers": "black", "medians": "black", "caps": "black"}, vert=False + ) +) +plt.xlabel("Average precision") +_ = plt.title("Test score via cross-validation") +plt.tight_layout() +plt.show() From 4b12063a21aa9d6c4473d26747a15cc6aa64ef36 Mon Sep 17 00:00:00 2001 From: fritshermans Date: Sat, 1 Feb 2025 13:24:22 +0100 Subject: [PATCH 03/18] add cross_validation.rst --- doc/references/cross_validation.rst | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 doc/references/cross_validation.rst diff --git a/doc/references/cross_validation.rst b/doc/references/cross_validation.rst new file mode 100644 index 000000000..3e8889407 --- /dev/null +++ b/doc/references/cross_validation.rst @@ -0,0 +1,23 @@ +.. _cross_validation_ref: + +Cross validation methods +======================== + +.. automodule:: imblearn.cross_validation + :no-members: + :no-inherited-members: + +CV splitters +-------------------- + +.. automodule:: imblearn.cross_validation._cross_validation + :no-members: + :no-inherited-members: + +.. currentmodule:: imblearn.cross_validation + +.. 
autosummary:: + :toctree: generated/ + :template: class.rst + + InstanceHardnessCV From 0e98d1f94cfff17bce536c9724ea196ef143a061 Mon Sep 17 00:00:00 2001 From: fritshermans Date: Sat, 1 Feb 2025 15:45:30 +0100 Subject: [PATCH 04/18] fix plot_instance_hardness_cv.py --- examples/cross_validation/plot_instance_hardness_cv.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/cross_validation/plot_instance_hardness_cv.py b/examples/cross_validation/plot_instance_hardness_cv.py index a48a39d6a..6c9c393ef 100644 --- a/examples/cross_validation/plot_instance_hardness_cv.py +++ b/examples/cross_validation/plot_instance_hardness_cv.py @@ -78,7 +78,7 @@ def strategy(y): # %% ih_cv = InstanceHardnessCV(n_splits=5, random_state=10) -ih_result = cross_validate(clf, X, y, cv=ih_cv) +ih_result = cross_validate(clf, X, y, cv=ih_cv, scoring="average_precision") # %% # The boxplot below shows that the `InstanceHardnessCV` splitter results From 2ceea7f371b7cd2caaa4c94cd1781e6cdb3a63c7 Mon Sep 17 00:00:00 2001 From: fritshermans Date: Sun, 2 Feb 2025 10:12:07 +0100 Subject: [PATCH 05/18] add initial documentation --- doc/cross_validation.rst | 18 ++++++++++++++++++ doc/user_guide.rst | 1 + imblearn/cross_validation/_cross_validation.py | 2 ++ 3 files changed, 21 insertions(+) create mode 100644 doc/cross_validation.rst diff --git a/doc/cross_validation.rst b/doc/cross_validation.rst new file mode 100644 index 000000000..58c0300a6 --- /dev/null +++ b/doc/cross_validation.rst @@ -0,0 +1,18 @@ +.. _cross_validation: + +================ +Cross validation +================ + +.. currentmodule:: imblearn.cross_validation + + +.. _instance_hardness_threshold: + +The term instance hardness is used in literature to express the difficulty to +correctly classify an instance. An instance for which the predicted probability +of the true class is low, has large instance hardness. 
The way these +hard-to-classify instances are distributed over train and test sets in cross +validation, has significant effect on the test set performance metrics. The +`InstanceHardnessCV` splitter distributes samples with large instance hardness +equally over the folds, resulting in more robust cross validation. diff --git a/doc/user_guide.rst b/doc/user_guide.rst index bfa8c00f9..5bb1be673 100644 --- a/doc/user_guide.rst +++ b/doc/user_guide.rst @@ -19,6 +19,7 @@ User Guide ensemble.rst miscellaneous.rst metrics.rst + cross_validation.rst common_pitfalls.rst Dataset loading utilities developers_utils.rst diff --git a/imblearn/cross_validation/_cross_validation.py b/imblearn/cross_validation/_cross_validation.py index b5ea76667..a4c4f8f5d 100644 --- a/imblearn/cross_validation/_cross_validation.py +++ b/imblearn/cross_validation/_cross_validation.py @@ -10,6 +10,8 @@ class InstanceHardnessCV: CV splitter that distributes samples with large instance hardness equally over the folds + Read more in the :ref:`User Guide <instance_hardness_threshold>`. + Parameters ---------- n_splits : int, default=5 From 7ef85ff0d4e687acc8c288dc5d9ef4b3a513ec79 Mon Sep 17 00:00:00 2001 From: fritshermans Date: Sun, 2 Feb 2025 10:18:06 +0100 Subject: [PATCH 06/18] add docstrings --- .../cross_validation/_cross_validation.py | 45 +++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/imblearn/cross_validation/_cross_validation.py b/imblearn/cross_validation/_cross_validation.py index a4c4f8f5d..1dfc6a8a8 100644 --- a/imblearn/cross_validation/_cross_validation.py +++ b/imblearn/cross_validation/_cross_validation.py @@ -45,6 +45,31 @@ def __init__(self, n_splits=5, clf=None, random_state=None): self.random_state = random_state def split(self, X, y, groups=None): + """ + Generate indices to split data into training and test set. + + Parameters + ---------- + X: array-like of shape (n_samples, n_features) + Training data, where n_samples is the number of samples and + n_features is the number of features. 
+ + y: array-like of shape (n_samples,) + The target variable. + + groups: object + Always ignored, exists for compatibility. + + Yields + ------ + + train: ndarray + The training set indices for that split. + + test: ndarray + The testing set indices for that split. + + """ df = pd.DataFrame(X) features = df.columns df["y"] = y @@ -67,4 +92,24 @@ def split(self, X, y, groups=None): yield train_index, test_index def get_n_splits(self, X=None, y=None, groups=None): + """ + Returns the number of splitting iterations in the cross-validator. + + Parameters + ---------- + X: object + Always ignored, exists for compatibility. + + y: object + Always ignored, exists for compatibility. + + groups: object + Always ignored, exists for compatibility. + + Returns + ------- + n_splits: int + Returns the number of splitting iterations in the cross-validator. + + """ return self.n_splits From cc611e221649965c680c3eaf1b4d1dcfffbc1314 Mon Sep 17 00:00:00 2001 From: fritshermans Date: Wed, 26 Mar 2025 08:58:08 +0100 Subject: [PATCH 07/18] fix random seed in unit test --- imblearn/cross_validation/tests/test_instance_hardness.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/imblearn/cross_validation/tests/test_instance_hardness.py b/imblearn/cross_validation/tests/test_instance_hardness.py index 7e3b91fed..a53e7d7f2 100644 --- a/imblearn/cross_validation/tests/test_instance_hardness.py +++ b/imblearn/cross_validation/tests/test_instance_hardness.py @@ -1,8 +1,9 @@ import pytest + from sklearn.datasets import make_classification from sklearn.linear_model import LogisticRegression from sklearn.model_selection import cross_validate -from sklearn.utils._testing import assert_almost_equal +from sklearn.utils._testing import assert_array_equal from imblearn.cross_validation import InstanceHardnessCV @@ -18,10 +19,10 @@ def test_instancehardness_cv(): - ih_cv = InstanceHardnessCV() + ih_cv = InstanceHardnessCV(random_state=10) clf = 
LogisticRegression(random_state=10) cv_result = cross_validate(clf, X, y, cv=ih_cv) - assert_almost_equal(cv_result["test_score"].std(), 0.005, decimal=3) + assert_array_equal(cv_result['test_score'], [0.965, 0.965, 0.96, 0.965, 0.955]) @pytest.mark.parametrize("n_splits", [2, 3, 4]) From f0c03bb742c0ae974645addcb8d327ba914383e9 Mon Sep 17 00:00:00 2001 From: fritshermans Date: Wed, 26 Mar 2025 09:06:54 +0100 Subject: [PATCH 08/18] refactor the way groups are assigned by instance hardness in InstanceHardnessCV --- .../cross_validation/_cross_validation.py | 24 ++++++++----------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/imblearn/cross_validation/_cross_validation.py b/imblearn/cross_validation/_cross_validation.py index 1dfc6a8a8..a5e930676 100644 --- a/imblearn/cross_validation/_cross_validation.py +++ b/imblearn/cross_validation/_cross_validation.py @@ -1,7 +1,6 @@ import numpy as np -import pandas as pd from sklearn.ensemble import RandomForestClassifier -from sklearn.model_selection import StratifiedGroupKFold, cross_val_predict +from sklearn.model_selection import LeaveOneGroupOut, cross_val_predict class InstanceHardnessCV: @@ -70,25 +69,22 @@ def split(self, X, y, groups=None): The testing set indices for that split. 
""" - df = pd.DataFrame(X) - features = df.columns - df["y"] = y if self.clf is not None: self.clf_ = self.clf else: self.clf_ = RandomForestClassifier( n_jobs=-1, class_weight="balanced", random_state=self.random_state ) - df["proba"] = cross_val_predict( - self.clf_, df[features], df["y"], cv=self.n_splits, method="predict_proba" - )[:, 1] - df["hardness"] = abs(df["y"] - df["proba"]) - df = df.sort_values("hardness") - df["group"] = np.arange(len(df)) % self.n_splits - cv = StratifiedGroupKFold( - n_splits=self.n_splits, shuffle=True, random_state=self.random_state + probas = cross_val_predict( + self.clf_, X, y, cv=self.n_splits, method="predict_proba" ) - for train_index, test_index in cv.split(df[features], df["y"], df["group"]): + # by sorting first on y then on proba rows are ordered by instance hardness + # within the group having the same label + sorted_indices = np.lexsort((probas[:, 1], y)) + groups = np.zeros(len(X), dtype=int) + groups[sorted_indices] = np.arange(len(X)) % self.n_splits + cv = LeaveOneGroupOut() + for train_index, test_index in cv.split(X, y, groups): yield train_index, test_index def get_n_splits(self, X=None, y=None, groups=None): From 018df65a6c329c0cc42d79366bae2f1b8e5880dd Mon Sep 17 00:00:00 2001 From: fritshermans Date: Wed, 26 Mar 2025 09:13:28 +0100 Subject: [PATCH 09/18] simplify plotting code in plot_instance_hardness_cv.py --- .../plot_instance_hardness_cv.py | 18 +++--------------- 1 file changed, 3 insertions(+), 15 deletions(-) diff --git a/examples/cross_validation/plot_instance_hardness_cv.py b/examples/cross_validation/plot_instance_hardness_cv.py index 6c9c393ef..dcd0144c0 100644 --- a/examples/cross_validation/plot_instance_hardness_cv.py +++ b/examples/cross_validation/plot_instance_hardness_cv.py @@ -86,20 +86,8 @@ def strategy(y): # When doing hyperparameter tuning or feature selection using a wrapper # method (like `RFECV`) this will give more stable results. 
-import pandas as pd - -ax = ( - pd.concat( - (pd.DataFrame(skf_result), pd.DataFrame(ih_result)), - axis=1, - keys=["StratifiedKFold", "InstanceHardnessCV"], - ) - .swaplevel(axis="columns")["test_score"] - .plot.box( - color={"whiskers": "black", "medians": "black", "caps": "black"}, vert=False - ) -) -plt.xlabel("Average precision") -_ = plt.title("Test score via cross-validation") +# %% +plt.boxplot([skf_result['test_score'], ih_result['test_score']], + labels=["StratifiedKFold", "InstanceHardnessCV"], vert=False) plt.tight_layout() plt.show() From 2fdca6f202b9a09da70557231c3b1830da934f9b Mon Sep 17 00:00:00 2001 From: fritshermans Date: Wed, 26 Mar 2025 09:18:45 +0100 Subject: [PATCH 10/18] update docstring --- imblearn/cross_validation/_cross_validation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/imblearn/cross_validation/_cross_validation.py b/imblearn/cross_validation/_cross_validation.py index a5e930676..db60f0b05 100644 --- a/imblearn/cross_validation/_cross_validation.py +++ b/imblearn/cross_validation/_cross_validation.py @@ -35,7 +35,7 @@ class InstanceHardnessCV: >>> clf = LogisticRegression(random_state=10) >>> cv_result = cross_validate(clf, X, y, cv=ih_cv) >>> print(f"Standard deviation of test_scores: {cv_result['test_score'].std():.3f}") - Standard deviation of test_scores: 0.005 + Standard deviation of test_scores: 0.004 """ def __init__(self, n_splits=5, clf=None, random_state=None): From 0ce2eb3bc4705a6439a21d213ed1a3c623fc465a Mon Sep 17 00:00:00 2001 From: fritshermans Date: Wed, 26 Mar 2025 09:21:09 +0100 Subject: [PATCH 11/18] update 'labels' to 'tick_labels' in boxplot code --- examples/cross_validation/plot_instance_hardness_cv.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/cross_validation/plot_instance_hardness_cv.py b/examples/cross_validation/plot_instance_hardness_cv.py index dcd0144c0..3fa2c1dc6 100644 --- a/examples/cross_validation/plot_instance_hardness_cv.py +++ 
b/examples/cross_validation/plot_instance_hardness_cv.py @@ -88,6 +88,6 @@ def strategy(y): # %% plt.boxplot([skf_result['test_score'], ih_result['test_score']], - labels=["StratifiedKFold", "InstanceHardnessCV"], vert=False) + tick_labels=["StratifiedKFold", "InstanceHardnessCV"], vert=False) plt.tight_layout() plt.show() From a394cf2c4bdccf2e1d1fbef63cf05682a6f0939d Mon Sep 17 00:00:00 2001 From: fritshermans Date: Wed, 26 Mar 2025 09:50:52 +0100 Subject: [PATCH 12/18] rename clf to estimator --- imblearn/cross_validation/_cross_validation.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/imblearn/cross_validation/_cross_validation.py b/imblearn/cross_validation/_cross_validation.py index db60f0b05..f4c2b24d1 100644 --- a/imblearn/cross_validation/_cross_validation.py +++ b/imblearn/cross_validation/_cross_validation.py @@ -16,7 +16,7 @@ class InstanceHardnessCV: n_splits : int, default=5 Number of folds. Must be at least 2. - clf : classifier, default=None + estimator : classifier, default=None Classifier used to determine instance hardness. Defaults to RandomForestClassifier when set to `None` @@ -32,15 +32,15 @@ class InstanceHardnessCV: >>> X, y = make_classification(weights=[0.9, 0.1], class_sep=2, ... 
n_informative=3, n_redundant=1, flip_y=0.05, n_samples=1000, random_state=10) >>> ih_cv = InstanceHardnessCV(n_splits=5, random_state=10) - >>> clf = LogisticRegression(random_state=10) - >>> cv_result = cross_validate(clf, X, y, cv=ih_cv) + >>> estimator = LogisticRegression(random_state=10) + >>> cv_result = cross_validate(estimator, X, y, cv=ih_cv) >>> print(f"Standard deviation of test_scores: {cv_result['test_score'].std():.3f}") Standard deviation of test_scores: 0.004 """ - def __init__(self, n_splits=5, clf=None, random_state=None): + def __init__(self, n_splits=5, estimator=None, random_state=None): self.n_splits = n_splits - self.clf = clf + self.estimator = estimator self.random_state = random_state def split(self, X, y, groups=None): @@ -69,14 +69,14 @@ def split(self, X, y, groups=None): The testing set indices for that split. """ - if self.clf is not None: - self.clf_ = self.clf + if self.estimator is not None: + self.estimator_ = self.estimator else: - self.clf_ = RandomForestClassifier( + self.estimator_ = RandomForestClassifier( n_jobs=-1, class_weight="balanced", random_state=self.random_state ) probas = cross_val_predict( - self.clf_, X, y, cv=self.n_splits, method="predict_proba" + self.estimator_, X, y, cv=self.n_splits, method="predict_proba" ) # by sorting first on y then on proba rows are ordered by instance hardness # within the group having the same label From d06c5802b38326674d01becc8f1dd211b3f4bd6e Mon Sep 17 00:00:00 2001 From: fritshermans Date: Wed, 26 Mar 2025 10:38:53 +0100 Subject: [PATCH 13/18] change data generation in plot_instance_hardness_cv.py --- .../plot_instance_hardness_cv.py | 34 ++++++------------- 1 file changed, 11 insertions(+), 23 deletions(-) diff --git a/examples/cross_validation/plot_instance_hardness_cv.py b/examples/cross_validation/plot_instance_hardness_cv.py index 3fa2c1dc6..1192052ad 100644 --- a/examples/cross_validation/plot_instance_hardness_cv.py +++ 
b/examples/cross_validation/plot_instance_hardness_cv.py @@ -21,36 +21,24 @@ # --------------------------------------------------- # # We will create an imbalanced dataset with using scikit-learn's `make_blobs` -# function and the `make_imbalance` function. The imbalancedness is set to -# 0.1; only 10% of the labels is positive. +# function and set the imbalancedness to 5%; only 5% of the labels is positive. import numpy as np from matplotlib import pyplot as plt from sklearn.datasets import make_blobs -from imblearn.datasets import make_imbalance - -X, y = make_blobs(n_samples=1000, centers=((-3, 0), (3, 0)), random_state=10) - - -# %% -def sampling_strategy(ratio): - def strategy(y): - return {0: sum(y), 1: int(ratio * sum(y) / (1 - ratio))} - - return strategy - - -X, y = make_imbalance(X, y, sampling_strategy=sampling_strategy(0.1), random_state=10) +X, y = make_blobs(n_samples=[950,50], centers=((-3, 0), (3, 0)), random_state=10) plt.scatter(X[:, 0], X[:, 1], c=y) plt.show() # %% -# To introduce instance hardness in our dataset, we flip the labels at the -# boundaries of the feature space -y[np.argsort(X[:, 0])[:5]] = 1 -y[np.argsort(X[:, 0])[-5:]] = 0 +# To introduce instance hardness in our dataset, we add some hard to classify samples: +X_hard, y_hard = make_blobs(n_samples=10, centers=((3, 0), (-3, 0)), + cluster_std=1, + random_state=10) +X = np.vstack((X, X_hard)) +y = np.hstack((y, y_hard)) plt.scatter(X[:, 0], X[:, 1], c=y) plt.show() @@ -70,14 +58,14 @@ def strategy(y): from imblearn.cross_validation import InstanceHardnessCV # %% -clf = LogisticRegression() +clf = LogisticRegression(random_state=10) # %% -skf_cv = StratifiedKFold(n_splits=5) +skf_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=10) skf_result = cross_validate(clf, X, y, cv=skf_cv, scoring="average_precision") # %% -ih_cv = InstanceHardnessCV(n_splits=5, random_state=10) +ih_cv = InstanceHardnessCV(n_splits=5, estimator=clf, random_state=10) ih_result = cross_validate(clf, 
X, y, cv=ih_cv, scoring="average_precision") # %% From 38509ddef88c94ba5ff32965b09a764952b1007c Mon Sep 17 00:00:00 2001 From: fritshermans Date: Wed, 26 Mar 2025 10:43:48 +0100 Subject: [PATCH 14/18] describe InstanceHardnessCV in User Guide --- doc/cross_validation.rst | 67 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 67 insertions(+) diff --git a/doc/cross_validation.rst b/doc/cross_validation.rst index 58c0300a6..e9692e328 100644 --- a/doc/cross_validation.rst +++ b/doc/cross_validation.rst @@ -16,3 +16,70 @@ hard-to-classify instances are distributed over train and test sets in cross validation, has significant effect on the test set performance metrics. The `InstanceHardnessCV` splitter distributes samples with large instance hardness equally over the folds, resulting in more robust cross validation. + +We will discuss instance hardness in this document and explain how to use the +`InstanceHardnessCV` splitter. + +Instance hardness and average precision +======================================= + +Let’s start by creating a dataset to work with. We create a dataset with 5% class +imbalance using scikit-learn’s `make_blobs` function. + + >>> import numpy as np + >>> from matplotlib import pyplot as plt + >>> from sklearn.datasets import make_blobs + >>> from imblearn.datasets import make_imbalance + >>> random_state = 10 + >>> X, y = make_blobs(n_samples=[950, 50], centers=((-3, 0), (3, 0)), + ... random_state=random_state) + >>> plt.scatter(X[:, 0], X[:, 1], c=y) + >>> plt.show() + +.. image:: ./auto_examples/cross_validation/images/sphx_glr_plot_instance_hardness_cv_001.png + :target: ./auto_examples/cross_validation/plot_instance_hardness_cv.html + :align: center + +Now we add some samples with large instance hardness + + >>> X_hard, y_hard = make_blobs(n_samples=10, centers=((3, 0), (-3, 0)), + ... cluster_std=1, + ... 
random_state=random_state) + >>> X = np.vstack((X, X_hard)) + >>> y = np.hstack((y, y_hard)) + >>> plt.scatter(X[:, 0], X[:, 1], c=y) + >>> plt.show() + +.. image:: ./auto_examples/cross_validation/images/sphx_glr_plot_instance_hardness_cv_002.png + :target: ./auto_examples/cross_validation/plot_instance_hardness_cv.html + :align: center + +Then we take a `LogisticRegression` classifier and assess the cross validation +performance using a `StratifiedKFold` cv splitter and the `cross_validate` +function. + + >>> from sklearn.linear_model import LogisticRegression + >>> clf = LogisticRegression(random_state=random_state) + >>> skf_cv = StratifiedKFold(n_splits=5, shuffle=True, + ... random_state=random_state) + >>> skf_result = cross_validate(clf, X, y, cv=skf_cv, scoring="average_precision") + +Now, we do the same using an `InstanceHardnessCV` splitter. We provide our +classifier to the splitter to calculate instance hardness and distribute samples +with large instance hardness equally over the folds. + + >>> ih_cv = InstanceHardnessCV(n_splits=5, estimator=clf, + ... random_state=random_state) + >>> ih_result = cross_validate(clf, X, y, cv=ih_cv, scoring="average_precision") + +When we plot the test scores for both cv splitters, we see that the variance using +the `InstanceHardnessCV` splitter is lower than for the `StratifiedKFold` splitter. + + >>> plt.boxplot([skf_result['test_score'], ih_result['test_score']], + ... tick_labels=["StratifiedKFold", "InstanceHardnessCV"], + ... vert=False) + >>> plt.tight_layout() + +.. 
image:: ./auto_examples/cross_validation/images/sphx_glr_plot_instance_hardness_cv_003.png + :target: ./auto_examples/cross_validation/plot_instance_hardness_cv.html + :align: center \ No newline at end of file From aded9e991752de07bcf99f18d75ac99745c37beb Mon Sep 17 00:00:00 2001 From: fritshermans Date: Wed, 26 Mar 2025 13:22:33 +0100 Subject: [PATCH 15/18] add x label to boxplot --- examples/cross_validation/plot_instance_hardness_cv.py | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/cross_validation/plot_instance_hardness_cv.py b/examples/cross_validation/plot_instance_hardness_cv.py index 1192052ad..8b4858439 100644 --- a/examples/cross_validation/plot_instance_hardness_cv.py +++ b/examples/cross_validation/plot_instance_hardness_cv.py @@ -77,5 +77,6 @@ # %% plt.boxplot([skf_result['test_score'], ih_result['test_score']], tick_labels=["StratifiedKFold", "InstanceHardnessCV"], vert=False) +plt.xlabel('Average precision') plt.tight_layout() plt.show() From 4647a2b9b34c1426050dd077ec94bf15ef5c68f2 Mon Sep 17 00:00:00 2001 From: fritshermans Date: Sat, 29 Mar 2025 17:17:36 +0100 Subject: [PATCH 16/18] fix typo --- examples/cross_validation/plot_instance_hardness_cv.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/cross_validation/plot_instance_hardness_cv.py b/examples/cross_validation/plot_instance_hardness_cv.py index 8b4858439..cd708bb01 100644 --- a/examples/cross_validation/plot_instance_hardness_cv.py +++ b/examples/cross_validation/plot_instance_hardness_cv.py @@ -1,7 +1,7 @@ """ -=================================================== -Distribute hard-to-classify datapoint over CV folds -=================================================== +==================================================== +Distribute hard-to-classify datapoints over CV folds +==================================================== 'Instance hardness' refers to the difficulty to classify an instance. 
The way hard-to-classify instances are distributed over train and test sets has From 636dc5b3cc2691bc75e24bac0a011a94f2fea083 Mon Sep 17 00:00:00 2001 From: fritshermans Date: Sat, 29 Mar 2025 17:17:57 +0100 Subject: [PATCH 17/18] explain instance hardness in user guide --- doc/cross_validation.rst | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/doc/cross_validation.rst b/doc/cross_validation.rst index e9692e328..0c51e3e9b 100644 --- a/doc/cross_validation.rst +++ b/doc/cross_validation.rst @@ -22,6 +22,36 @@ We will discuss instance hardness in this document and explain how to use the Instance hardness and average precision ======================================= +Instance hardness is defined as 1 minus the probability of the most probable class: + +.. math:: + + H(x) = 1 - P(\hat{y}|x) + +In this equation :math:`H(x)` is the instance hardness for a sample with features +:math:`x` and :math:`P(\hat{y}|x)` the probability of predicted label :math:`\hat{y}` +given the features. If the model predicts label 0 and gives a `predict_proba` output +of [0.9, 0.1], the probability of the most probable class (0) is 0.9 and the +instance hardness is 1-0.9=0.1. + +Samples with large instance hardness have a significant effect on the area under the +precision-recall curve, or average precision. Especially samples with label 0 +with large instance hardness (so the model predicts label 1) reduce the average +precision a lot as these points affect the precision-recall curve on the left +where the area is largest; the precision is lowered in the range of low recall +and high thresholds. When doing cross validation, e.g. in case of hyperparameter +tuning or recursive feature elimination, random gathering of these points in +some folds introduces variance in CV results that deteriorates the robustness of the +cross validation task. 
The `InstanceHardnessCV` +splitter aims to distribute the samples with large instance hardness over the +folds in order to reduce undesired variance. Note that one should use this +splitter to make model *selection* tasks robust like hyperparameter tuning and +feature selection but not for model *performance estimation* for which you also +want to know the variance of performance to be expected in production. + + +Create imbalanced dataset with samples with large instance hardness +=================================================================== Let’s start by creating a dataset to work with. We create a dataset with 5% class imbalance using scikit-learn’s `make_blobs` function. @@ -54,6 +84,9 @@ Now we add some samples with large instance hardness :target: ./auto_examples/cross_validation/plot_instance_hardness_cv.html :align: center +Assess cross validation performance variance using InstanceHardnessCV splitter +============================================================================== + Then we take a `LogisticRegressionClassifier` and assess the cross validation performance using a `StratifiedKFold` cv splitter and the `cross_validate` function. @@ -78,6 +111,7 @@ the `InstanceHardnessCV` splitter is lower than for the `StratifiedKFold` splitt >>> plt.boxplot([skf_result['test_score'], ih_result['test_score']], ... tick_labels=["StratifiedKFold", "InstanceHardnessCV"], ... vert=False) + >>> plt.xlabel('Average precision') >>> plt.tight_layout() .. 
image:: ./auto_examples/cross_validation/images/sphx_glr_plot_instance_hardness_cv_003.png From 1c642cebf04e34e795a9d35804da18598c9f809d Mon Sep 17 00:00:00 2001 From: fritshermans Date: Sat, 29 Mar 2025 17:27:43 +0100 Subject: [PATCH 18/18] remove default random forest as estimator for InstanceHardnessCV --- doc/cross_validation.rst | 2 +- .../cross_validation/plot_instance_hardness_cv.py | 2 +- imblearn/cross_validation/_cross_validation.py | 12 ++++++------ .../cross_validation/tests/test_instance_hardness.py | 7 ++++--- 4 files changed, 12 insertions(+), 11 deletions(-) diff --git a/doc/cross_validation.rst b/doc/cross_validation.rst index 0c51e3e9b..7759a91b8 100644 --- a/doc/cross_validation.rst +++ b/doc/cross_validation.rst @@ -101,7 +101,7 @@ Now, we do the same using an `InstanceHardnessCV` splitter. We use provide our classifier to the splitter to calculate instance hardness and distribute samples with large instance hardness equally over the folds. - >>> ih_cv = InstanceHardnessCV(n_splits=5, estimator=clf, + >>> ih_cv = InstanceHardnessCV(estimator=clf, n_splits=5, ... 
random_state=random_state) >>> ih_result = cross_validate(clf, X, y, cv=ih_cv, scoring="average_precision") diff --git a/examples/cross_validation/plot_instance_hardness_cv.py b/examples/cross_validation/plot_instance_hardness_cv.py index cd708bb01..5e7a2202c 100644 --- a/examples/cross_validation/plot_instance_hardness_cv.py +++ b/examples/cross_validation/plot_instance_hardness_cv.py @@ -65,7 +65,7 @@ skf_result = cross_validate(clf, X, y, cv=skf_cv, scoring="average_precision") # %% -ih_cv = InstanceHardnessCV(n_splits=5, estimator=clf, random_state=10) +ih_cv = InstanceHardnessCV(estimator=clf, n_splits=5, random_state=10) ih_result = cross_validate(clf, X, y, cv=ih_cv, scoring="average_precision") # %% diff --git a/imblearn/cross_validation/_cross_validation.py b/imblearn/cross_validation/_cross_validation.py index f4c2b24d1..c8808ebc6 100644 --- a/imblearn/cross_validation/_cross_validation.py +++ b/imblearn/cross_validation/_cross_validation.py @@ -13,13 +13,13 @@ class InstanceHardnessCV: Parameters ---------- + estimator : estimator object + Classifier to be used to estimate instance hardness of the samples. + This classifier should implement `predict_proba`. + n_splits : int, default=5 Number of folds. Must be at least 2. - estimator : classifier, default=None - Classifier used to determine instance hardness. Defaults to - RandomForestClassifier when set to `None` - random_state : int, RandomState instance, default=None Determines random_state for reproducible results across multiple calls. @@ -31,14 +31,14 @@ class InstanceHardnessCV: >>> from sklearn.linear_model import LogisticRegression >>> X, y = make_classification(weights=[0.9, 0.1], class_sep=2, ... 
n_informative=3, n_redundant=1, flip_y=0.05, n_samples=1000, random_state=10) - >>> ih_cv = InstanceHardnessCV(n_splits=5, random_state=10) >>> estimator = LogisticRegression(random_state=10) + >>> ih_cv = InstanceHardnessCV(estimator=estimator, n_splits=5,random_state=10) >>> cv_result = cross_validate(estimator, X, y, cv=ih_cv) >>> print(f"Standard deviation of test_scores: {cv_result['test_score'].std():.3f}") Standard deviation of test_scores: 0.004 """ - def __init__(self, n_splits=5, estimator=None, random_state=None): + def __init__(self, estimator, n_splits=5, random_state=None): self.n_splits = n_splits self.estimator = estimator self.random_state = random_state diff --git a/imblearn/cross_validation/tests/test_instance_hardness.py b/imblearn/cross_validation/tests/test_instance_hardness.py index a53e7d7f2..096c9259b 100644 --- a/imblearn/cross_validation/tests/test_instance_hardness.py +++ b/imblearn/cross_validation/tests/test_instance_hardness.py @@ -19,13 +19,14 @@ def test_instancehardness_cv(): - ih_cv = InstanceHardnessCV(random_state=10) clf = LogisticRegression(random_state=10) + ih_cv = InstanceHardnessCV(estimator=clf, random_state=10) cv_result = cross_validate(clf, X, y, cv=ih_cv) - assert_array_equal(cv_result['test_score'], [0.965, 0.965, 0.96, 0.965, 0.955]) + assert_array_equal(cv_result['test_score'], [0.975, 0.965, 0.96, 0.955, 0.965]) @pytest.mark.parametrize("n_splits", [2, 3, 4]) def test_instancehardness_cv_n_splits(n_splits): - ih_cv = InstanceHardnessCV(n_splits=n_splits, random_state=10) + clf = LogisticRegression(random_state=10) + ih_cv = InstanceHardnessCV(estimator=clf, n_splits=n_splits, random_state=10) assert ih_cv.get_n_splits() == n_splits