Skip to content

Commit

Permalink
WIP
Browse files Browse the repository at this point in the history
  • Loading branch information
FBruzzesi committed Nov 19, 2024
1 parent 1c66894 commit e7baf9a
Show file tree
Hide file tree
Showing 27 changed files with 233 additions and 61 deletions.
4 changes: 4 additions & 0 deletions sklego/__init__.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,13 @@
import re
import sys

if sys.version_info >= (3, 8):
from importlib import metadata
else:
import importlib_metadata as metadata


# Package identity; the version string is read from the installed "scikit-lego" distribution metadata.
__title__ = "sklego"
__version__ = metadata.version("scikit-lego")

# Installed scikit-learn version as a tuple of ints: the version string is split on "." and every
# non-digit character is stripped from each component before conversion (so "dev0" becomes 0).
# NOTE(review): a component containing no digits at all would yield int("") and raise ValueError —
# confirm no such version strings need to be supported.
SKLEARN_VERSION = tuple(int(re.sub(r"\D", "", str(v))) for v in metadata.version("scikit-learn").split("."))
41 changes: 36 additions & 5 deletions sklego/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,13 @@

import numpy as np
import pandas as pd
from sklearn.base import TransformerMixin
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_array, check_is_fitted, check_X_y

from sklego import SKLEARN_VERSION

class TrainOnlyTransformerMixin(TransformerMixin):

class TrainOnlyTransformerMixin(TransformerMixin, BaseEstimator):
"""Mixin class for transformers that can handle training and test data differently.
This mixin allows using a separate function for transforming training and test data.
Expand Down Expand Up @@ -79,9 +81,9 @@ def fit(self, X, y=None):
The fitted transformer.
"""
if y is None:
check_array(X, estimator=self)
validate_data(self, X)
else:
check_X_y(X, y, estimator=self, multi_output=True)
validate_data(self, X, y, multi_output=True)
self.X_hash_ = self._hash(X)
self.n_features_in_ = X.shape[1]
return self
Expand Down Expand Up @@ -145,7 +147,7 @@ def transform(self, X, y=None):
If the input dimension does not match the training dimension.
"""
check_is_fitted(self, ["X_hash_", "n_features_in_"])
check_array(X, estimator=self)
X = validate_data(self, X, reset=False)

if X.shape[1] != self.n_features_in_:
raise ValueError(f"Unexpected input dimension {X.shape[1]}, expected {self.n_features_in_}")
Expand Down Expand Up @@ -339,3 +341,32 @@ def sliding_window(sequence, window_size, step_size):
```
"""
return (sequence[pos : pos + window_size] for pos in range(0, len(sequence), step_size))


def validate_data(
    estimator,
    X="no_validation",
    y="no_validation",
    reset=True,
    validate_separately=False,
    skip_check_array=False,
    **check_params,
):
    """Validate input data, dispatching on the installed scikit-learn version.

    On scikit-learn >= 1.6 this forwards every argument to
    `sklearn.utils.validation.validate_data`. On older versions it falls back to
    `check_array` (when `y` is left at its `"no_validation"` sentinel) or `check_X_y`.

    Parameters
    ----------
    estimator : estimator instance
        The estimator on whose behalf the data is validated.
    X : array-like or "no_validation", default="no_validation"
        The input samples.
    y : array-like or "no_validation", default="no_validation"
        The targets. When left at the sentinel, only `X` is validated.
    reset : bool, default=True
        Forwarded to scikit-learn's `validate_data`; ignored by the fallback branch.
    validate_separately : bool or tuple of dicts, default=False
        Forwarded to scikit-learn's `validate_data`; ignored by the fallback branch.
    skip_check_array : bool, default=False
        Forwarded to scikit-learn's `validate_data`; ignored by the fallback branch.
    **check_params : dict
        Extra keyword arguments passed to the underlying validation function.

    Returns
    -------
    out : validated `X`, or a tuple `(X, y)` when `y` is validated as well
        Whatever the underlying scikit-learn validation function returns.
    """
    if SKLEARN_VERSION >= (1, 6):
        # Aliased so the import does not shadow this wrapper's own name inside the function.
        from sklearn.utils.validation import validate_data as _sklearn_validate_data

        return _sklearn_validate_data(
            estimator,
            X=X,
            y=y,
            reset=reset,
            validate_separately=validate_separately,
            skip_check_array=skip_check_array,
            **check_params,
        )

    # Guard with isinstance: comparing an array-like `y` to a string with `==` is element-wise
    # and its truth value is ambiguous.
    if isinstance(y, str) and y == "no_validation":
        # `check_array` names its first parameter `array`; pass it positionally.
        return check_array(X, estimator=estimator, **check_params)
    return check_X_y(X, y, estimator=estimator, **check_params)
10 changes: 6 additions & 4 deletions sklego/decomposition/pca_reconstruction.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
import numpy as np
from sklearn.base import BaseEstimator, OutlierMixin
from sklearn.decomposition import PCA
from sklearn.utils.validation import FLOAT_DTYPES, check_array, check_is_fitted
from sklearn.utils.validation import FLOAT_DTYPES, check_is_fitted

from sklego.common import validate_data

class PCAOutlierDetection(BaseEstimator, OutlierMixin):

class PCAOutlierDetection(OutlierMixin, BaseEstimator):
"""`PCAOutlierDetection` is an outlier detector based on the reconstruction error from PCA.
If the difference between original and reconstructed data is larger than the `threshold`, the point is
Expand Down Expand Up @@ -94,7 +96,7 @@ def fit(self, X, y=None):
ValueError
If `threshold` is `None`.
"""
X = check_array(X, estimator=self, dtype=FLOAT_DTYPES)
X = validate_data(self, X, dtype=FLOAT_DTYPES)
if not self.threshold:
raise ValueError("The `threshold` value cannot be `None`.")

Expand Down Expand Up @@ -157,7 +159,7 @@ def predict(self, X):
array-like of shape (n_samples,)
The predicted data. 1 for inliers, -1 for outliers.
"""
X = check_array(X, estimator=self, dtype=FLOAT_DTYPES)
X = validate_data(self, X, dtype=FLOAT_DTYPES)
check_is_fitted(self, ["pca_", "offset_"])
result = np.ones(X.shape[0])
result[self.difference(X) > self.threshold] = -1
Expand Down
23 changes: 18 additions & 5 deletions sklego/decomposition/umap_reconstruction.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,12 @@

import numpy as np
from sklearn.base import BaseEstimator, OutlierMixin
from sklearn.utils.validation import FLOAT_DTYPES, check_array, check_is_fitted
from sklearn.utils.validation import FLOAT_DTYPES, check_is_fitted

from sklego.common import validate_data

class UMAPOutlierDetection(BaseEstimator, OutlierMixin):

class UMAPOutlierDetection(OutlierMixin, BaseEstimator):
"""`UMAPOutlierDetection` is an outlier detector based on the reconstruction error from UMAP.
If the difference between original and reconstructed data is larger than the `threshold`, the point is
Expand Down Expand Up @@ -100,9 +102,9 @@ def fit(self, X, y=None):
- If `n_components` is less than 2.
- If `threshold` is `None`.
"""
X = check_array(X, estimator=self, dtype=FLOAT_DTYPES)
X = validate_data(self, X, dtype=FLOAT_DTYPES)
if y is not None:
y = check_array(y, estimator=self, ensure_2d=False)
y = validate_data(self, y, ensure_2d=False)

if not self.threshold:
raise ValueError("The `threshold` value cannot be `None`.")
Expand Down Expand Up @@ -133,6 +135,7 @@ def difference(self, X):
The calculated difference.
"""
check_is_fitted(self, ["umap_", "offset_"])

reduced = self.umap_.transform(X)
diff = np.sum(np.abs(self.umap_.inverse_transform(reduced) - X), axis=1)
if self.variant == "relative":
Expand All @@ -155,7 +158,7 @@ def predict(self, X):
array-like of shape (n_samples,)
The predicted data. 1 for inliers, -1 for outliers.
"""
X = check_array(X, estimator=self, dtype=FLOAT_DTYPES)
X = validate_data(self, X, dtype=FLOAT_DTYPES)
check_is_fitted(self, ["umap_", "offset_"])
result = np.ones(X.shape[0])
result[self.difference(X) > self.threshold] = -1
Expand All @@ -172,3 +175,13 @@ def score_samples(self, X):

def _more_tags(self):
return {"non_deterministic": True}

def __sklearn_tags__(self):
    """Return the estimator tags with `non_deterministic` switched on.

    Returns `None` when the installed scikit-learn is older than 1.6.
    """
    from sklego import SKLEARN_VERSION

    # Guard clause: nothing to build on scikit-learn versions before 1.6.
    if SKLEARN_VERSION < (1, 6):
        return None

    estimator_tags = super().__sklearn_tags__()
    estimator_tags.non_deterministic = True
    return estimator_tags
15 changes: 13 additions & 2 deletions sklego/dummy.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
)


class RandomRegressor(BaseEstimator, RegressorMixin):
class RandomRegressor(RegressorMixin, BaseEstimator):
"""A `RandomRegressor` makes random predictions only based on the `y` value that is seen.
The goal is that such a regressor can be used for benchmarking. It _should be_ easily beatable.
Expand Down Expand Up @@ -101,7 +101,7 @@ def predict(self, X):

X = check_array(X, estimator=self, dtype=FLOAT_DTYPES)
if X.shape[1] != self.n_features_in_:
raise ValueError(f"Unexpected input dimension {X.shape[1]}, expected {self.dim_}")
raise ValueError(f"Unexpected input dimension {X.shape[1]}, expected {self.n_features_in_}")

if self.strategy == "normal":
return rs.normal(self.mu_, self.sigma_, X.shape[0])
Expand All @@ -127,3 +127,14 @@ def allowed_strategies(self):

def _more_tags(self):
return {"poor_score": True, "non_deterministic": True}

def __sklearn_tags__(self):
    """Return the estimator tags flagged as non-deterministic and poor-scoring.

    Returns `None` when the installed scikit-learn is older than 1.6.
    """
    from sklego import SKLEARN_VERSION

    # Guard clause: nothing to build on scikit-learn versions before 1.6.
    if SKLEARN_VERSION < (1, 6):
        return None

    estimator_tags = super().__sklearn_tags__()
    estimator_tags.non_deterministic = True
    estimator_tags.regressor_tags.poor_score = True
    return estimator_tags
7 changes: 5 additions & 2 deletions sklego/feature_selection/mrmr.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,9 @@
from sklearn.base import BaseEstimator
from sklearn.feature_selection import f_classif, f_regression
from sklearn.feature_selection._base import SelectorMixin
from sklearn.utils.validation import check_is_fitted, check_X_y
from sklearn.utils.validation import check_is_fitted

from sklego.common import validate_data


def _redundancy_pearson(X, selected, left):
Expand Down Expand Up @@ -201,7 +203,8 @@ def fit(self, X, y):
k parameter is not integer type or is < n_features_in (X.shape[1]) or < 1
"""
X, y = check_X_y(X, y, dtype="numeric", y_numeric=True)
X, y = validate_data(self, X, y, dtype="numeric", y_numeric=True)

self._y_dtype = y.dtype

relevance = self._get_relevance
Expand Down
29 changes: 20 additions & 9 deletions sklego/linear_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
)


class LowessRegression(BaseEstimator, RegressorMixin):
class LowessRegression(RegressorMixin, BaseEstimator):
"""`LowessRegression` estimator: LOWESS (Locally Weighted Scatterplot Smoothing) is a type of
[local regression](https://en.wikipedia.org/wiki/Local_regression).
Expand Down Expand Up @@ -145,7 +145,7 @@ def predict(self, X):
return results


class ProbWeightRegression(BaseEstimator, RegressorMixin):
class ProbWeightRegression(RegressorMixin, BaseEstimator):
"""`ProbWeightRegression` assumes that all input signals in `X` need to be reweighted with weights that sum up to
one in order to predict `y`.
Expand Down Expand Up @@ -266,7 +266,7 @@ def coefs_(self):
return self.coef_


class DeadZoneRegressor(BaseEstimator, RegressorMixin):
class DeadZoneRegressor(RegressorMixin, BaseEstimator):
r"""The `DeadZoneRegressor` estimator implements a regression model that incorporates a _dead zone effect_ for
improving the robustness of regression predictions.
Expand Down Expand Up @@ -470,7 +470,7 @@ def allowed_effects(self):
return self._ALLOWED_EFFECTS


class _FairClassifier(BaseEstimator, LinearClassifierMixin):
class _FairClassifier(LinearClassifierMixin, BaseEstimator):
"""Base class for fair classifiers that address sensitive attribute fairness.
This base class provides a foundation for fair classifiers that aim to mitigate bias and discrimination by taking
Expand Down Expand Up @@ -671,8 +671,18 @@ def decision_function(self, X):
def _more_tags(self):
return {"poor_score": True}

def __sklearn_tags__(self):
from sklego import SKLEARN_VERSION

class DemographicParityClassifier(BaseEstimator, LinearClassifierMixin):
if SKLEARN_VERSION >= (1, 6):
tags = super().__sklearn_tags__()
tags.classifier_tags.poor_score = True
return tags
else:
pass


class DemographicParityClassifier(LinearClassifierMixin, BaseEstimator):
r"""`DemographicParityClassifier` is a logistic regression classifier which can be constrained on demographic
parity (p% score).
Expand Down Expand Up @@ -790,7 +800,7 @@ def constraints(self, y_hat, y_true, sensitive, n_obs):
return []


class EqualOpportunityClassifier(BaseEstimator, LinearClassifierMixin):
class EqualOpportunityClassifier(LinearClassifierMixin, BaseEstimator):
r"""`EqualOpportunityClassifier` is a logistic regression classifier which can be constrained on equal opportunity
score.
Expand Down Expand Up @@ -904,7 +914,7 @@ def constraints(self, y_hat, y_true, sensitive, n_obs):
return []


class BaseScipyMinimizeRegressor(BaseEstimator, RegressorMixin, ABC):
class BaseScipyMinimizeRegressor(RegressorMixin, BaseEstimator, ABC):
"""Abstract base class for regressors relying on Scipy's
[minimize method](https://docs.scipy.org/doc/scipy/reference/generated/scipy.optimize.minimize.html) to minimize a
(custom) loss function.
Expand Down Expand Up @@ -960,8 +970,6 @@ def __init__(
self.fit_intercept = fit_intercept
self.copy_X = copy_X
self.positive = positive
if method not in ("SLSQP", "TNC", "L-BFGS-B"):
raise ValueError(f'method should be one of "SLSQP", "TNC", "L-BFGS-B", ' f"got {method} instead")
self.method = method

@abstractmethod
Expand Down Expand Up @@ -1011,6 +1019,9 @@ def fit(self, X, y, sample_weight=None):
self : BaseScipyMinimizeRegressor
Fitted linear model.
"""
if self.method not in {"SLSQP", "TNC", "L-BFGS-B"}:
msg = f"method should be one of 'SLSQP', 'TNC', 'L-BFGS-B', got {self.method} instead"
raise ValueError(msg)
X_, grad_loss, loss = self._prepare_inputs(X, sample_weight, y)

d = X_.shape[1] - self.n_features_in_ # This is either zero or one.
Expand Down
2 changes: 1 addition & 1 deletion sklego/meta/confusion_balancer.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from sklego.base import ProbabilisticClassifier


class ConfusionBalancer(BaseEstimator, MetaEstimatorMixin, ClassifierMixin):
class ConfusionBalancer(ClassifierMixin, MetaEstimatorMixin, BaseEstimator):
r"""The `ConfusionBalancer` estimator attempts to give it's child estimator a more balanced output by learning from
the confusion matrix during training.
Expand Down
13 changes: 11 additions & 2 deletions sklego/meta/decay_estimator.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from sklego.meta._decay_utils import exponential_decay, linear_decay, sigmoid_decay, stepwise_decay


class DecayEstimator(BaseEstimator, MetaEstimatorMixin):
class DecayEstimator(MetaEstimatorMixin, BaseEstimator):
"""Morphs an estimator such that the training weights can be adapted to ensure that points that are far away have
less weight.
Expand Down Expand Up @@ -97,10 +97,16 @@ def _is_classifier(self):
"""Checks if the wrapped estimator is a classifier."""
return any(["ClassifierMixin" in p.__name__ for p in type(self.model).__bases__])

def _is_regressor(self):
"""Checks if the wrapped estimator is a regressor."""
return any(["RegressorMixin" in p.__name__ for p in type(self.model).__bases__])

@property
def _estimator_type(self):
"""Computes `_estimator_type` dynamically from the wrapped model."""
return self.model._estimator_type
from sklego import SKLEARN_VERSION

return self.model.__sklearn_tags__().estimator_type if SKLEARN_VERSION >= (1, 6) else self.model._estimator_type

def fit(self, X, y):
"""Fit the underlying estimator on the training data `X` and `y` using the calculated sample weights.
Expand Down Expand Up @@ -165,3 +171,6 @@ def predict(self, X):
def score(self, X, y):
"""Alias for `.score()` method of the underlying estimator."""
return self.estimator_.score(X, y)

def __sklearn_tags__(self):
return self.model.__sklearn_tags__()
14 changes: 12 additions & 2 deletions sklego/meta/grouped_predictor.py
Original file line number Diff line number Diff line change
Expand Up @@ -401,8 +401,18 @@ def _estimator_type(self):
def _more_tags(self):
return {"allow_nan": True}

def __sklearn_tags__(self):
from sklego import SKLEARN_VERSION

class GroupedRegressor(GroupedPredictor, RegressorMixin):
if SKLEARN_VERSION >= (1, 6):
tags = super().__sklearn_tags__()
tags.input_tags.allow_nan = True
return tags
else:
pass


class GroupedRegressor(RegressorMixin, GroupedPredictor):
"""`GroupedRegressor` is a meta-estimator that fits a separate regressor for each group in the input data.
Its spec is the same as [`GroupedPredictor`][sklego.meta.grouped_predictor.GroupedPredictor] but it is available
Expand Down Expand Up @@ -439,7 +449,7 @@ def fit(self, X, y):
return super().fit(X, y)


class GroupedClassifier(GroupedPredictor, ClassifierMixin):
class GroupedClassifier(ClassifierMixin, GroupedPredictor):
"""`GroupedClassifier` is a meta-estimator that fits a separate classifier for each group in the input data.
Its equivalent to [`GroupedPredictor`][sklego.meta.grouped_predictor.GroupedPredictor] with `shrinkage=None`
Expand Down
Loading

0 comments on commit e7baf9a

Please sign in to comment.