Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: ZeroInflatedRegressor.score_samples(...) #680

Merged
merged 6 commits into from
Jul 8, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 13 additions & 5 deletions docs/_scripts/meta-models.py
Original file line number Diff line number Diff line change
Expand Up @@ -400,23 +400,31 @@ def false_negatives(mod, x, y):

# --8<-- [start:zero-inflated]
import numpy as np
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.ensemble import ExtraTreesClassifier, ExtraTreesRegressor
from sklearn.model_selection import cross_val_score

from sklego.meta import ZeroInflatedRegressor

np.random.seed(0)
X = np.random.randn(10000, 4)
y = ((X[:, 0]>0) & (X[:, 1]>0)) * np.abs(X[:, 2] * X[:, 3]**2) # many zeroes here, in about 75% of the cases.
y = ((X[:, 0]>0) & (X[:, 1]>0)) * np.abs(X[:, 2] * X[:, 3]**2)

zir = ZeroInflatedRegressor(
classifier=RandomForestClassifier(random_state=0),
regressor=RandomForestRegressor(random_state=0)
classifier=ExtraTreesClassifier(random_state=0, max_depth=10),
regressor=ExtraTreesRegressor(random_state=0)
)

print("ZIR (RFC+RFR) r²:", cross_val_score(zir, X, y).mean())
print("RFR r²:", cross_val_score(RandomForestRegressor(random_state=0), X, y).mean())
print("RFR r²:", cross_val_score(ExtraTreesRegressor(random_state=0), X, y).mean())
# --8<-- [end:zero-inflated]


# --8<-- [start:zero-inflated-score-samples]
_ = zir.fit(X, y)
print(f"Predict={zir.predict(X[:5]).round(2)}")
print(f"Scores={zir.score_samples(X[:5]).round(2)}")
# --8<-- [end:zero-inflated-score-samples]

# --8<-- [start:outlier-classifier]
import numpy as np
from sklego.meta.outlier_classifier import OutlierClassifier
Expand Down
26 changes: 22 additions & 4 deletions docs/user-guide/meta-models.md
Original file line number Diff line number Diff line change
Expand Up @@ -374,8 +374,8 @@ Sure, you can get regions where you are close to zero, but modelling an output o

What we can do to circumvent these problems is the following:

1. Train a classifier to tell us whether the target is zero, or not.
2. Train a regressor on all samples with a non-zero target.
1. Train a **classifier** to tell us whether the target is zero, or not.
2. Train a **regressor** on all samples with a non-zero target.

By putting these two together in an obvious way, we get the [`ZeroInflatedRegressor`][zero-inflated-api]. You can use it like this:

Expand All @@ -384,8 +384,26 @@ By putting these two together in an obvious way, we get the [`ZeroInflatedRegres
```

```console
ZIR (RFC+RFR) r²: 0.8992404366385873
RFR r²: 0.8516522752031502
ZIR (RFC+RFR) r²: 0.8579468997736154
RFR r²: 0.7691291933110612
```

If the underlying classifier is able to predict the _probability_ of a sample being zero (i.e. it implements a `predict_proba` method), then the `ZeroInflatedRegressor` can be used to predict the probability of a sample being non-zero _times_ the expected value of such sample.

This quantity is sometimes called _risk estimate_ or _expected impact_, however, to adhere to scikit-learn convention, we made it accessible via the `score_samples` method.

!!! warning "About `predict_proba`"
The `predict_proba` method of the classifier does not always return actual probabilities.

For this reason if you want to use the `score_samples` method, it is recommended to train with a classifier wrapped by the [`CalibratedClassifierCV`][calibrated-classifier-api] class from scikit-learn to calibrate the probabilities.

```py title="score_samples"
--8<-- "docs/_scripts/meta-models.py:zero-inflated-score-samples"
```

```console
Predict=[4.91 0. 0. 0.05 0. ]
Scores=[3.73 0. 0.11 0.03 0.06]
```

## Outlier Classifier
Expand Down
56 changes: 46 additions & 10 deletions sklego/meta/zero_inflated_regressor.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import numpy as np
from sklearn.base import BaseEstimator, MetaEstimatorMixin, RegressorMixin, clone, is_classifier, is_regressor
from sklearn.exceptions import NotFittedError
from sklearn.utils.metaestimators import available_if
from sklearn.utils.validation import _check_sample_weight, check_array, check_is_fitted, check_X_y


Expand All @@ -12,9 +13,9 @@ class ZeroInflatedRegressor(BaseEstimator, RegressorMixin, MetaEstimatorMixin):

`ZeroInflatedRegressor` consists of a classifier and a regressor.

- The classifier's task is to find of if the target is zero or not.
- The regressor's task is to output a (usually positive) prediction whenever the classifier indicates that the
there should be a non-zero prediction.
- The classifier's task is to find out if the target is zero or not.
- The regressor's task is to output a (usually positive) prediction whenever the classifier indicates that
there should be a non-zero prediction.

The regressor is only trained on examples where the target is non-zero, which makes it easier for it to focus.

Expand Down Expand Up @@ -46,17 +47,17 @@ class ZeroInflatedRegressor(BaseEstimator, RegressorMixin, MetaEstimatorMixin):
np.random.seed(0)
X = np.random.randn(10000, 4)
y = ((X[:, 0]>0) & (X[:, 1]>0)) * np.abs(X[:, 2] * X[:, 3]**2)

model = ZeroInflatedRegressor(
classifier=ExtraTreesClassifier(random_state=0),
classifier=ExtraTreesClassifier(random_state=0, max_depth=10),
regressor=ExtraTreesRegressor(random_state=0)
)

model.fit(X, y)
# ZeroInflatedRegressor(classifier=ExtraTreesClassifier(random_state=0),
# regressor=ExtraTreesRegressor(random_state=0))
).fit(X, y)

model.predict(X)[:5]
model.predict(X[:5])
# array([4.91483294, 0. , 0. , 0.04941909, 0. ])

model.score_samples(X[:5]).round(2)
# array([3.73, 0. , 0.11, 0.03, 0.06])
```
"""

Expand Down Expand Up @@ -165,3 +166,38 @@ def predict(self, X):
output[non_zero_indices] = self.regressor_.predict(X[non_zero_indices])

return output

@available_if(lambda self: hasattr(self.classifier_, "predict_proba"))
def score_samples(self, X):
    r"""Predict risk estimate of `X` as the probability of `X` to not be zero times the expected value of `X`:

    $$\text{score\_samples}(X) = (1-P(X=0)) \cdot E[X]$$

    where:

    - $P(X=0)$ is calculated using the `.predict_proba()` method of the underlying classifier.
    - $E[X]$ is the regressor prediction on `X`.

    !!! info

        This method requires the underlying classifier to implement the `.predict_proba()` method.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        The data to predict.

    Returns
    -------
    array-like of shape (n_samples,)
        The predicted risk.
    """

    check_is_fitted(self)
    X = check_array(X)
    # Validate the feature count against the fitted estimator rather than
    # overwrite it: reset=True would silently record a wrong-width X as the
    # new n_features_in_. Only `fit` should reset this attribute.
    self._check_n_features(X, reset=False)

    # NOTE(review): column 1 is assumed to be the probability of the
    # non-zero class — consistent with the classifier being fitted on a
    # binary zero/non-zero target; confirm against `fit`.
    non_zero_proba = self.classifier_.predict_proba(X)[:, 1]
    expected_impact = self.regressor_.predict(X)

    # Risk estimate: P(non-zero) * E[value | non-zero model].
    return non_zero_proba * expected_impact
45 changes: 45 additions & 0 deletions tests/test_meta/test_zero_inflated_regressor.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import numpy as np
import pytest
from sklearn.ensemble import ExtraTreesClassifier, ExtraTreesRegressor
from sklearn.linear_model import RidgeClassifier
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.utils.estimator_checks import parametrize_with_checks

Expand Down Expand Up @@ -81,3 +82,47 @@ def test_wrong_estimators_exceptions():
with pytest.raises(ValueError, match="`regressor` has to be a regressor."):
zir = ZeroInflatedRegressor(ExtraTreesClassifier(), ExtraTreesClassifier())
zir.fit(X, y)


def approx_lte(x, y):
    """True when every element of `x` is below or approximately equal to `y`."""
    close_enough = np.isclose(x, y)
    return ((x <= y) | close_enough).all()


def approx_gte(x, y):
    """True when every element of `x` is above or approximately equal to `y`."""
    close_enough = np.isclose(x, y)
    return ((x >= y) | close_enough).all()


def test_score_samples():
    """`score_samples` should scale predictions down where the classifier says non-zero,
    and stay non-negative where the classifier says zero."""
    np.random.seed(0)
    features = np.random.randn(1_000, 4)
    target = ((features[:, 0] > 0) & (features[:, 1] > 0)) * np.abs(features[:, 2] * features[:, 3] ** 2)

    model = ZeroInflatedRegressor(
        classifier=ExtraTreesClassifier(max_depth=20, random_state=0, n_jobs=-1),
        regressor=ExtraTreesRegressor(max_depth=20, random_state=0, n_jobs=-1),
    ).fit(features, target)

    risk_scores = model.score_samples(features)
    point_preds = model.predict(features)

    non_zero_mask = model.classifier_.predict(features)

    # Classified non-zero: score = probability * prediction, so score <= prediction.
    assert approx_lte(risk_scores[non_zero_mask], point_preds[non_zero_mask])
    # Classified zero: prediction is 0 while the score is a non-negative risk, so score >= prediction.
    assert approx_gte(risk_scores[~non_zero_mask], point_preds[~non_zero_mask])

def test_no_predict_proba():
    """`score_samples` must be unavailable when the classifier lacks `predict_proba`."""
    np.random.seed(0)
    features = np.random.randn(1_000, 4)
    target = ((features[:, 0] > 0) & (features[:, 1] > 0)) * np.abs(features[:, 2] * features[:, 3] ** 2)

    # RidgeClassifier has no predict_proba, so `available_if` should hide the method.
    model = ZeroInflatedRegressor(
        classifier=RidgeClassifier(),
        regressor=ExtraTreesRegressor(max_depth=20, random_state=0, n_jobs=-1),
    ).fit(features, target)

    expected_message = "This 'ZeroInflatedRegressor' has no attribute 'score_samples'"
    with pytest.raises(AttributeError, match=expected_message):
        model.score_samples(features)
Comment on lines +126 to +127
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Wondering if this is the best error message to give to the user - automatically generated from available_if decorator


Loading