WIP

koaning · Nov 19, 2024 · e7baf9a · e7baf9a
1 parent 1c66894
commit e7baf9a
Show file tree

Hide file tree

Showing 27 changed files with 233 additions and 61 deletions.
diff --git a/sklego/__init__.py b/sklego/__init__.py
@@ -1,9 +1,13 @@
+import re
 import sys
 
 if sys.version_info >= (3, 8):
     from importlib import metadata
 else:
     import importlib_metadata as metadata
 
+
 __title__ = "sklego"
 __version__ = metadata.version("scikit-lego")
+
+# Release numbers of the installed scikit-learn as a tuple of ints, e.g. "1.6.0rc1" -> (1, 6, 0).
+# Only the leading digits of each dot-separated component are used, so pre-release or local suffixes
+# ("0rc1", "0+cu118") are not concatenated into a bogus number, and a component without leading digits
+# is skipped instead of raising on `int("")`.
+SKLEARN_VERSION = tuple(
+    int(match.group())
+    for match in (re.match(r"\d+", part) for part in metadata.version("scikit-learn").split("."))
+    if match
+)
diff --git a/sklego/common.py b/sklego/common.py
@@ -4,11 +4,13 @@
 
 import numpy as np
 import pandas as pd
-from sklearn.base import TransformerMixin
+from sklearn.base import BaseEstimator, TransformerMixin
 from sklearn.utils.validation import check_array, check_is_fitted, check_X_y
 
+from sklego import SKLEARN_VERSION
 
-class TrainOnlyTransformerMixin(TransformerMixin):
+
+class TrainOnlyTransformerMixin(TransformerMixin, BaseEstimator):
     """Mixin class for transformers that can handle training and test data differently.
 
     This mixin allows using a separate function for transforming training and test data.
@@ -79,9 +81,9 @@ def fit(self, X, y=None):
             The fitted transformer.
         """
         if y is None:
-            check_array(X, estimator=self)
+            validate_data(self, X)
         else:
-            check_X_y(X, y, estimator=self, multi_output=True)
+            validate_data(self, X, y, multi_output=True)
         self.X_hash_ = self._hash(X)
         self.n_features_in_ = X.shape[1]
         return self
@@ -145,7 +147,7 @@ def transform(self, X, y=None):
             If the input dimension does not match the training dimension.
         """
         check_is_fitted(self, ["X_hash_", "n_features_in_"])
-        check_array(X, estimator=self)
+        X = validate_data(self, X, reset=False)
 
         if X.shape[1] != self.n_features_in_:
             raise ValueError(f"Unexpected input dimension {X.shape[1]}, expected {self.n_features_in_}")
@@ -339,3 +341,32 @@ def sliding_window(sequence, window_size, step_size):
     ```
     """
     return (sequence[pos : pos + window_size] for pos in range(0, len(sequence), step_size))
+
+
+def validate_data(
+    estimator,
+    X="no_validation",
+    y="no_validation",
+    reset=True,
+    validate_separately=False,
+    skip_check_array=False,
+    **check_params,
+):
+    """Validate input data in a way that works across scikit-learn versions.
+
+    On scikit-learn >= 1.6 this delegates to `sklearn.utils.validation.validate_data`. On older versions it falls
+    back to `check_array` (only `X` is validated) or `check_X_y` (both `X` and `y` are validated).
+
+    Parameters
+    ----------
+    estimator : estimator instance
+        The estimator on whose behalf the data is validated.
+    X : array-like or "no_validation", default="no_validation"
+        The input samples.
+    y : array-like, None or "no_validation", default="no_validation"
+        The targets. In the fallback, `None` and `"no_validation"` both mean that only `X` is validated.
+    reset : bool, default=True
+        Forwarded unchanged to scikit-learn's `validate_data`; not used by the fallback.
+    validate_separately : default=False
+        Forwarded unchanged to scikit-learn's `validate_data`; not used by the fallback.
+    skip_check_array : bool, default=False
+        Forwarded unchanged to scikit-learn's `validate_data`; not used by the fallback.
+    **check_params : dict
+        Extra keyword arguments forwarded to the underlying validation function.
+
+    Returns
+    -------
+    The result of the underlying validation function: the validated `X`, or `(X, y)` when `y` is validated too.
+    """
+    if SKLEARN_VERSION >= (1, 6):
+        # Imported lazily because the function only exists from scikit-learn 1.6 onwards; aliased so that it is
+        # not confused with this wrapper.
+        from sklearn.utils.validation import validate_data as _sklearn_validate_data
+
+        return _sklearn_validate_data(
+            estimator,
+            X=X,
+            y=y,
+            reset=reset,
+            validate_separately=validate_separately,
+            skip_check_array=skip_check_array,
+            **check_params,
+        )
+
+    # `y` may be an array, for which `y == "no_validation"` is an element-wise comparison with an ambiguous truth
+    # value, so the sentinel is tested explicitly.
+    if y is None or (isinstance(y, str) and y == "no_validation"):
+        # `check_array` names its first parameter `array`; pass it positionally.
+        return check_array(X, estimator=estimator, **check_params)
+    return check_X_y(X, y, estimator=estimator, **check_params)
diff --git a/sklego/decomposition/pca_reconstruction.py b/sklego/decomposition/pca_reconstruction.py
@@ -1,10 +1,12 @@
 import numpy as np
 from sklearn.base import BaseEstimator, OutlierMixin
 from sklearn.decomposition import PCA
-from sklearn.utils.validation import FLOAT_DTYPES, check_array, check_is_fitted
+from sklearn.utils.validation import FLOAT_DTYPES, check_is_fitted
 
+from sklego.common import validate_data
 
-class PCAOutlierDetection(BaseEstimator, OutlierMixin):
+
+class PCAOutlierDetection(OutlierMixin, BaseEstimator):
     """`PCAOutlierDetection` is an outlier detector based on the reconstruction error from PCA.
 
     If the difference between original and reconstructed data is larger than the `threshold`, the point is
@@ -94,7 +96,7 @@ def fit(self, X, y=None):
         ValueError
             If `threshold` is `None`.
         """
-        X = check_array(X, estimator=self, dtype=FLOAT_DTYPES)
+        X = validate_data(self, X, dtype=FLOAT_DTYPES)
         if not self.threshold:
             raise ValueError("The `threshold` value cannot be `None`.")
 
@@ -157,7 +159,7 @@ def predict(self, X):
         array-like of shape (n_samples,)
             The predicted data. 1 for inliers, -1 for outliers.
         """
-        X = check_array(X, estimator=self, dtype=FLOAT_DTYPES)
+        X = validate_data(self, X, dtype=FLOAT_DTYPES)
         check_is_fitted(self, ["pca_", "offset_"])
         result = np.ones(X.shape[0])
         result[self.difference(X) > self.threshold] = -1

diff --git a/sklego/decomposition/umap_reconstruction.py b/sklego/decomposition/umap_reconstruction.py
@@ -8,10 +8,12 @@
 
 import numpy as np
 from sklearn.base import BaseEstimator, OutlierMixin
-from sklearn.utils.validation import FLOAT_DTYPES, check_array, check_is_fitted
+from sklearn.utils.validation import FLOAT_DTYPES, check_is_fitted
 
+from sklego.common import validate_data
 
-class UMAPOutlierDetection(BaseEstimator, OutlierMixin):
+
+class UMAPOutlierDetection(OutlierMixin, BaseEstimator):
     """`UMAPOutlierDetection` is an outlier detector based on the reconstruction error from UMAP.
 
     If the difference between original and reconstructed data is larger than the `threshold`, the point is
@@ -100,9 +102,9 @@ def fit(self, X, y=None):
             - If `n_components` is less than 2.
             - If `threshold` is `None`.
         """
-        X = check_array(X, estimator=self, dtype=FLOAT_DTYPES)
+        X = validate_data(self, X, dtype=FLOAT_DTYPES)
         if y is not None:
-            y = check_array(y, estimator=self, ensure_2d=False)
+            y = validate_data(self, y, ensure_2d=False)
 
         if not self.threshold:
             raise ValueError("The `threshold` value cannot be `None`.")
@@ -133,6 +135,7 @@ def difference(self, X):
             The calculated difference.
         """
         check_is_fitted(self, ["umap_", "offset_"])
+
         reduced = self.umap_.transform(X)
         diff = np.sum(np.abs(self.umap_.inverse_transform(reduced) - X), axis=1)
         if self.variant == "relative":
@@ -155,7 +158,7 @@ def predict(self, X):
         array-like of shape (n_samples,)
             The predicted data. 1 for inliers, -1 for outliers.
         """
-        X = check_array(X, estimator=self, dtype=FLOAT_DTYPES)
+        X = validate_data(self, X, dtype=FLOAT_DTYPES)
         check_is_fitted(self, ["umap_", "offset_"])
         result = np.ones(X.shape[0])
         result[self.difference(X) > self.threshold] = -1
@@ -172,3 +175,13 @@ def score_samples(self, X):
 
     def _more_tags(self):
         return {"non_deterministic": True}
+
+    def __sklearn_tags__(self):
+        """Return the parent tags with `non_deterministic` switched on (scikit-learn >= 1.6 only).
+
+        On older scikit-learn versions nothing is computed and `None` is returned.
+        """
+        from sklego import SKLEARN_VERSION
+
+        if SKLEARN_VERSION < (1, 6):
+            return None
+
+        estimator_tags = super().__sklearn_tags__()
+        estimator_tags.non_deterministic = True
+        return estimator_tags
diff --git a/sklego/dummy.py b/sklego/dummy.py
@@ -11,7 +11,7 @@
 )
 
 
-class RandomRegressor(BaseEstimator, RegressorMixin):
+class RandomRegressor(RegressorMixin, BaseEstimator):
     """A `RandomRegressor` makes random predictions only based on the `y` value that is seen.
 
     The goal is that such a regressor can be used for benchmarking. It _should be_ easily beatable.
@@ -101,7 +101,7 @@ def predict(self, X):
 
         X = check_array(X, estimator=self, dtype=FLOAT_DTYPES)
         if X.shape[1] != self.n_features_in_:
-            raise ValueError(f"Unexpected input dimension {X.shape[1]}, expected {self.dim_}")
+            raise ValueError(f"Unexpected input dimension {X.shape[1]}, expected {self.n_features_in_}")
 
         if self.strategy == "normal":
             return rs.normal(self.mu_, self.sigma_, X.shape[0])
@@ -127,3 +127,14 @@ def allowed_strategies(self):
 
     def _more_tags(self):
         return {"poor_score": True, "non_deterministic": True}
+
+    def __sklearn_tags__(self):
+        """Return the parent tags flagged as non-deterministic and poorly scoring (scikit-learn >= 1.6 only).
+
+        On older scikit-learn versions nothing is computed and `None` is returned.
+        """
+        from sklego import SKLEARN_VERSION
+
+        if SKLEARN_VERSION < (1, 6):
+            return None
+
+        estimator_tags = super().__sklearn_tags__()
+        estimator_tags.non_deterministic = True
+        estimator_tags.regressor_tags.poor_score = True
+        return estimator_tags
diff --git a/sklego/feature_selection/mrmr.py b/sklego/feature_selection/mrmr.py
@@ -4,7 +4,9 @@
 from sklearn.base import BaseEstimator
 from sklearn.feature_selection import f_classif, f_regression
 from sklearn.feature_selection._base import SelectorMixin
-from sklearn.utils.validation import check_is_fitted, check_X_y
+from sklearn.utils.validation import check_is_fitted
+
+from sklego.common import validate_data
 
 
 def _redundancy_pearson(X, selected, left):
@@ -201,7 +203,8 @@ def fit(self, X, y):
 
                 k parameter is not integer type or is < n_features_in (X.shape[1]) or < 1
         """
-        X, y = check_X_y(X, y, dtype="numeric", y_numeric=True)
+        X, y = validate_data(self, X, y, dtype="numeric", y_numeric=True)
+
         self._y_dtype = y.dtype
 
         relevance = self._get_relevance

diff --git a/sklego/linear_model.py b/sklego/linear_model.py
@@ -27,7 +27,7 @@
 )
 
 
-class LowessRegression(BaseEstimator, RegressorMixin):
+class LowessRegression(RegressorMixin, BaseEstimator):
     """`LowessRegression` estimator: LOWESS (Locally Weighted Scatterplot Smoothing) is a type of
     [local regression](https://en.wikipedia.org/wiki/Local_regression).
 
@@ -145,7 +145,7 @@ def predict(self, X):
         return results
 
 
-class ProbWeightRegression(BaseEstimator, RegressorMixin):
+class ProbWeightRegression(RegressorMixin, BaseEstimator):
     """`ProbWeightRegression` assumes that all input signals in `X` need to be reweighted with weights that sum up to
     one in order to predict `y`.
 
@@ -266,7 +266,7 @@ def coefs_(self):
         return self.coef_
 
 
-class DeadZoneRegressor(BaseEstimator, RegressorMixin):
+class DeadZoneRegressor(RegressorMixin, BaseEstimator):
     r"""The `DeadZoneRegressor` estimator implements a regression model that incorporates a _dead zone effect_ for
     improving the robustness of regression predictions.
 
@@ -470,7 +470,7 @@ def allowed_effects(self):
         return self._ALLOWED_EFFECTS
 
 
-class _FairClassifier(BaseEstimator, LinearClassifierMixin):
+class _FairClassifier(LinearClassifierMixin, BaseEstimator):
     """Base class for fair classifiers that address sensitive attribute fairness.
 
     This base class provides a foundation for fair classifiers that aim to mitigate bias and discrimination by taking
@@ -671,8 +671,18 @@ def decision_function(self, X):
     def _more_tags(self):
         return {"poor_score": True}
 
+    def __sklearn_tags__(self):
+        """Return the parent tags with `classifier_tags.poor_score` set (scikit-learn >= 1.6 only)."""
+        from sklego import SKLEARN_VERSION

-class DemographicParityClassifier(BaseEstimator, LinearClassifierMixin):
+        if SKLEARN_VERSION >= (1, 6):
+            tags = super().__sklearn_tags__()
+            tags.classifier_tags.poor_score = True
+            return tags
+        else:
+            # Nothing is computed for older scikit-learn versions; the method implicitly returns `None`.
+            pass
+
+
+class DemographicParityClassifier(LinearClassifierMixin, BaseEstimator):
     r"""`DemographicParityClassifier` is a logistic regression classifier which can be constrained on demographic
     parity (p% score).
 
@@ -790,7 +800,7 @@ def constraints(self, y_hat, y_true, sensitive, n_obs):
             return []
 
 
-class EqualOpportunityClassifier(BaseEstimator, LinearClassifierMixin):
+class EqualOpportunityClassifier(LinearClassifierMixin, BaseEstimator):
     r"""`EqualOpportunityClassifier` is a logistic regression classifier which can be constrained on equal opportunity
     score.
 
@@ -904,7 +914,7 @@ def constraints(self, y_hat, y_true, sensitive, n_obs):
             return []
 
 
-class BaseScipyMinimizeRegressor(BaseEstimator, RegressorMixin, ABC):
+class BaseScipyMinimizeRegressor(RegressorMixin, BaseEstimator, ABC):
     """Abstract base class for regressors relying on Scipy's
     [minimize method](https://docs.scipy.org/doc/scipy/reference/generated/scipy.optimize.minimize.html) to minimize a
     (custom) loss function.
@@ -960,8 +970,6 @@ def __init__(
         self.fit_intercept = fit_intercept
         self.copy_X = copy_X
         self.positive = positive
-        if method not in ("SLSQP", "TNC", "L-BFGS-B"):
-            raise ValueError(f'method should be one of "SLSQP", "TNC", "L-BFGS-B", ' f"got {method} instead")
         self.method = method
 
     @abstractmethod
@@ -1011,6 +1019,9 @@ def fit(self, X, y, sample_weight=None):
         self : BaseScipyMinimizeRegressor
             Fitted linear model.
         """
+        if self.method not in {"SLSQP", "TNC", "L-BFGS-B"}:
+            msg = f"method should be one of 'SLSQP', 'TNC', 'L-BFGS-B', got {self.method} instead"
+            raise ValueError(msg)
         X_, grad_loss, loss = self._prepare_inputs(X, sample_weight, y)
 
         d = X_.shape[1] - self.n_features_in_  # This is either zero or one.

diff --git a/sklego/meta/confusion_balancer.py b/sklego/meta/confusion_balancer.py
@@ -7,7 +7,7 @@
 from sklego.base import ProbabilisticClassifier
 
 
-class ConfusionBalancer(BaseEstimator, MetaEstimatorMixin, ClassifierMixin):
+class ConfusionBalancer(ClassifierMixin, MetaEstimatorMixin, BaseEstimator):
     r"""The `ConfusionBalancer` estimator attempts to give it's child estimator a more balanced output by learning from
     the confusion matrix during training.
 

diff --git a/sklego/meta/decay_estimator.py b/sklego/meta/decay_estimator.py
@@ -5,7 +5,7 @@
 from sklego.meta._decay_utils import exponential_decay, linear_decay, sigmoid_decay, stepwise_decay
 
 
-class DecayEstimator(BaseEstimator, MetaEstimatorMixin):
+class DecayEstimator(MetaEstimatorMixin, BaseEstimator):
     """Morphs an estimator such that the training weights can be adapted to ensure that points that are far away have
     less weight.
 
@@ -97,10 +97,16 @@ def _is_classifier(self):
         """Checks if the wrapped estimator is a classifier."""
         return any(["ClassifierMixin" in p.__name__ for p in type(self.model).__bases__])
 
+    def _is_regressor(self):
+        """Tell whether a direct base class of the wrapped model has "RegressorMixin" in its name."""
+        direct_bases = type(self.model).__bases__
+        return any("RegressorMixin" in base.__name__ for base in direct_bases)
+
     @property
     def _estimator_type(self):
         """Computes `_estimator_type` dynamically from the wrapped model."""
-        return self.model._estimator_type
+        from sklego import SKLEARN_VERSION
+
+        # On scikit-learn >= 1.6 the type is read from the wrapped model's tags, otherwise from its
+        # `_estimator_type` attribute.
+        return self.model.__sklearn_tags__().estimator_type if SKLEARN_VERSION >= (1, 6) else self.model._estimator_type
 
     def fit(self, X, y):
         """Fit the underlying estimator on the training data `X` and `y` using the calculated sample weights.
@@ -165,3 +171,6 @@ def predict(self, X):
     def score(self, X, y):
         """Alias for `.score()` method of the underlying estimator."""
         return self.estimator_.score(X, y)
+
+    def __sklearn_tags__(self):
+        """Expose the tags of the wrapped `model` as the tags of this meta-estimator."""
+        wrapped_tags = self.model.__sklearn_tags__()
+        return wrapped_tags
diff --git a/sklego/meta/grouped_predictor.py b/sklego/meta/grouped_predictor.py
@@ -401,8 +401,18 @@ def _estimator_type(self):
     def _more_tags(self):
         return {"allow_nan": True}
 
+    def __sklearn_tags__(self):
+        """Return the parent tags with `input_tags.allow_nan` set (scikit-learn >= 1.6 only)."""
+        from sklego import SKLEARN_VERSION

-class GroupedRegressor(GroupedPredictor, RegressorMixin):
+        if SKLEARN_VERSION >= (1, 6):
+            tags = super().__sklearn_tags__()
+            tags.input_tags.allow_nan = True
+            return tags
+        else:
+            # Nothing is computed for older scikit-learn versions; the method implicitly returns `None`.
+            pass
+
+
+class GroupedRegressor(RegressorMixin, GroupedPredictor):
     """`GroupedRegressor` is a meta-estimator that fits a separate regressor for each group in the input data.
 
     Its spec is the same as [`GroupedPredictor`][sklego.meta.grouped_predictor.GroupedPredictor] but it is available
@@ -439,7 +449,7 @@ def fit(self, X, y):
         return super().fit(X, y)
 
 
-class GroupedClassifier(GroupedPredictor, ClassifierMixin):
+class GroupedClassifier(ClassifierMixin, GroupedPredictor):
     """`GroupedClassifier` is a meta-estimator that fits a separate classifier for each group in the input data.
 
     Its equivalent to [`GroupedPredictor`][sklego.meta.grouped_predictor.GroupedPredictor] with `shrinkage=None`