Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: ZeroInflatedRegressor.score_samples(...) #680

Merged
merged 6 commits into from
Jul 8, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 13 additions & 5 deletions docs/_scripts/meta-models.py
Original file line number Diff line number Diff line change
Expand Up @@ -400,23 +400,31 @@ def false_negatives(mod, x, y):

# --8<-- [start:zero-inflated]
import numpy as np
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.ensemble import ExtraTreesClassifier, ExtraTreesRegressor
from sklearn.model_selection import cross_val_score

from sklego.meta import ZeroInflatedRegressor

np.random.seed(0)
X = np.random.randn(10000, 4)
y = ((X[:, 0]>0) & (X[:, 1]>0)) * np.abs(X[:, 2] * X[:, 3]**2) # many zeroes here, in about 75% of the cases.
y = ((X[:, 0]>0) & (X[:, 1]>0)) * np.abs(X[:, 2] * X[:, 3]**2)

zir = ZeroInflatedRegressor(
classifier=RandomForestClassifier(random_state=0),
regressor=RandomForestRegressor(random_state=0)
classifier=ExtraTreesClassifier(random_state=0, max_depth=10),
regressor=ExtraTreesRegressor(random_state=0)
)

print("ZIR (RFC+RFR) r²:", cross_val_score(zir, X, y).mean())
print("RFR r²:", cross_val_score(RandomForestRegressor(random_state=0), X, y).mean())
print("RFR r²:", cross_val_score(ExtraTreesRegressor(random_state=0), X, y).mean())
# --8<-- [end:zero-inflated]


# --8<-- [start:zero-inflated-score-samples]
_ = zir.fit(X, y)
print(f"Predict={zir.predict(X[:5]).round(2)}")
print(f"Scores={zir.score_samples(X[:5]).round(2)}")
# --8<-- [end:zero-inflated-score-samples]

# --8<-- [start:outlier-classifier]
import numpy as np
from sklego.meta.outlier_classifier import OutlierClassifier
Expand Down
26 changes: 22 additions & 4 deletions docs/user-guide/meta-models.md
Original file line number Diff line number Diff line change
Expand Up @@ -374,8 +374,8 @@ Sure, you can get regions where you are close to zero, but modelling an output o

What we can do to circumvent these problems is the following:

1. Train a classifier to tell us whether the target is zero, or not.
2. Train a regressor on all samples with a non-zero target.
1. Train a **classifier** to tell us whether the target is zero, or not.
2. Train a **regressor** on all samples with a non-zero target.

By putting these two together in an obvious way, we get the [`ZeroInflatedRegressor`][zero-inflated-api]. You can use it like this:

Expand All @@ -384,8 +384,26 @@ By putting these two together in an obvious way, we get the [`ZeroInflatedRegres
```

```console
ZIR (RFC+RFR) r²: 0.8992404366385873
RFR r²: 0.8516522752031502
ZIR (RFC+RFR) r²: 0.8579468997736154
RFR r²: 0.7691291933110612
```

If the underlying classifier is able to predict the _probability_ of a sample being zero (i.e. it implements a `predict_proba` method), then the `ZeroInflatedRegressor` can be used to predict the probability of a sample being non-zero _times_ the expected value of such sample.

This quantity is sometimes called _risk estimate_ or _expected impact_, however, to adhere to scikit-learn convention, we made it accessible via the `score_samples` method.

!!! warning "About `predict_proba`"
The `predict_proba` method of the classifier does not always return actual probabilities.

For this reason if you want to use the `score_samples` method, it is recommended to train with a classifier wrapped by the [`CalibratedClassifierCV`][calibrated-classifier-api] class from scikit-learn to calibrate the probabilities.

```py title="score_samples"
--8<-- "docs/_scripts/meta-models.py:zero-inflated-score-samples"
```

```console
Predict=[4.91 0. 0. 0.05 0. ]
Scores=[3.73 0. 0.11 0.03 0.06]
```

## Outlier Classifier
Expand Down
56 changes: 46 additions & 10 deletions sklego/meta/zero_inflated_regressor.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import numpy as np
from sklearn.base import BaseEstimator, MetaEstimatorMixin, RegressorMixin, clone, is_classifier, is_regressor
from sklearn.exceptions import NotFittedError
from sklearn.utils.metaestimators import available_if
from sklearn.utils.validation import _check_sample_weight, check_array, check_is_fitted, check_X_y


Expand All @@ -12,9 +13,9 @@ class ZeroInflatedRegressor(BaseEstimator, RegressorMixin, MetaEstimatorMixin):

`ZeroInflatedRegressor` consists of a classifier and a regressor.

- The classifier's task is to find of if the target is zero or not.
- The regressor's task is to output a (usually positive) prediction whenever the classifier indicates that the
there should be a non-zero prediction.
- The classifier's task is to find out if the target is zero or not.
- The regressor's task is to output a (usually positive) prediction whenever the classifier indicates that
there should be a non-zero prediction.

The regressor is only trained on examples where the target is non-zero, which makes it easier for it to focus.

Expand Down Expand Up @@ -46,17 +47,17 @@ class ZeroInflatedRegressor(BaseEstimator, RegressorMixin, MetaEstimatorMixin):
np.random.seed(0)
X = np.random.randn(10000, 4)
y = ((X[:, 0]>0) & (X[:, 1]>0)) * np.abs(X[:, 2] * X[:, 3]**2)

model = ZeroInflatedRegressor(
classifier=ExtraTreesClassifier(random_state=0),
classifier=ExtraTreesClassifier(random_state=0, max_depth=10),
regressor=ExtraTreesRegressor(random_state=0)
)

model.fit(X, y)
# ZeroInflatedRegressor(classifier=ExtraTreesClassifier(random_state=0),
# regressor=ExtraTreesRegressor(random_state=0))
).fit(X, y)

model.predict(X)[:5]
model.predict(X[:5])
# array([4.91483294, 0. , 0. , 0.04941909, 0. ])

model.score_samples(X[:5]).round(2)
# array([3.73, 0. , 0.11, 0.03, 0.06])
```
"""

Expand Down Expand Up @@ -165,3 +166,38 @@ def predict(self, X):
output[non_zero_indices] = self.regressor_.predict(X[non_zero_indices])

return output

@available_if(lambda self: hasattr(self.classifier_, "predict_proba"))
def score_samples(self, X):
    r"""Predict risk estimate of `X` as the probability of `X` to not be zero times the expected value of `X`:

    $$\text{score\_samples}(X) = (1-P(X=0)) \cdot E[X]$$

    where:

    - $P(X=0)$ is calculated using the `.predict_proba()` method of the underlying classifier.
    - $E[X]$ is the regressor prediction on `X`.

    !!! info

        This method requires the underlying classifier to implement the `.predict_proba()` method.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        The data to predict.

    Returns
    -------
    array-like of shape (n_samples,)
        The predicted risk.
    """

    check_is_fitted(self)
    X = check_array(X)
    # Validate the feature count against the fitted estimator rather than
    # overwrite it: reset=True would silently record a wrong-width X as the
    # new n_features_in_. Only `fit` should reset this attribute.
    self._check_n_features(X, reset=False)

    # NOTE(review): column 1 is assumed to be the probability of the
    # non-zero class — consistent with the classifier being fitted on a
    # binary zero/non-zero target; confirm against `fit`.
    non_zero_proba = self.classifier_.predict_proba(X)[:, 1]
    expected_impact = self.regressor_.predict(X)

    # Risk estimate: P(non-zero) * E[value | non-zero model].
    return non_zero_proba * expected_impact
45 changes: 45 additions & 0 deletions tests/test_meta/test_zero_inflated_regressor.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import numpy as np
import pytest
from sklearn.ensemble import ExtraTreesClassifier, ExtraTreesRegressor
from sklearn.linear_model import RidgeClassifier
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.utils.estimator_checks import parametrize_with_checks

Expand Down Expand Up @@ -81,3 +82,47 @@ def test_wrong_estimators_exceptions():
with pytest.raises(ValueError, match="`regressor` has to be a regressor."):
zir = ZeroInflatedRegressor(ExtraTreesClassifier(), ExtraTreesClassifier())
zir.fit(X, y)


def approx_lte(x, y):
    """True when every element of `x` is below or approximately equal to `y`."""
    close_enough = np.isclose(x, y)
    return ((x <= y) | close_enough).all()


def approx_gte(x, y):
    """True when every element of `x` is above or approximately equal to `y`."""
    close_enough = np.isclose(x, y)
    return ((x >= y) | close_enough).all()


def test_score_samples():
    """`score_samples` should scale predictions down where the classifier says non-zero,
    and stay non-negative where the classifier says zero."""
    np.random.seed(0)
    features = np.random.randn(1_000, 4)
    target = ((features[:, 0] > 0) & (features[:, 1] > 0)) * np.abs(features[:, 2] * features[:, 3] ** 2)

    model = ZeroInflatedRegressor(
        classifier=ExtraTreesClassifier(max_depth=20, random_state=0, n_jobs=-1),
        regressor=ExtraTreesRegressor(max_depth=20, random_state=0, n_jobs=-1),
    ).fit(features, target)

    risk_scores = model.score_samples(features)
    point_preds = model.predict(features)

    non_zero_mask = model.classifier_.predict(features)

    # Classified non-zero: score = probability * prediction, so score <= prediction.
    assert approx_lte(risk_scores[non_zero_mask], point_preds[non_zero_mask])
    # Classified zero: prediction is 0 while the score is a non-negative risk, so score >= prediction.
    assert approx_gte(risk_scores[~non_zero_mask], point_preds[~non_zero_mask])

def test_no_predict_proba():
    """`score_samples` must be unavailable when the classifier lacks `predict_proba`."""
    np.random.seed(0)
    features = np.random.randn(1_000, 4)
    target = ((features[:, 0] > 0) & (features[:, 1] > 0)) * np.abs(features[:, 2] * features[:, 3] ** 2)

    # RidgeClassifier has no predict_proba, so `available_if` should hide the method.
    model = ZeroInflatedRegressor(
        classifier=RidgeClassifier(),
        regressor=ExtraTreesRegressor(max_depth=20, random_state=0, n_jobs=-1),
    ).fit(features, target)

    expected_message = "This 'ZeroInflatedRegressor' has no attribute 'score_samples'"
    with pytest.raises(AttributeError, match=expected_message):
        model.score_samples(features)
Comment on lines +126 to +127
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Wondering if this is the best error message to give to the user - automatically generated from available_if decorator


Loading