Introducing FormulaicTransformer and deprecating PatsyTransformer (…

…#593) * relax patsy dependency * deprecating patsy, adding formulaic * formulaic tests * formulaic pickling test * patsy deprecation version change
koaning · Oct 27, 2023 · ecc11aa · ecc11aa
1 parent 02e8154
commit ecc11aa
Show file tree

Hide file tree

Showing 6 changed files with 273 additions and 7 deletions.
diff --git a/setup.py b/setup.py
@@ -6,12 +6,13 @@
 base_packages = [
     "scikit-learn>=1.0",
     "pandas>=1.1.5",
-    "patsy>=0.5.1",
     "Deprecated>=1.2.6",
     "umap-learn>=0.4.6"
 ]
 cvxpy_packages = ["cvxpy>=1.1.8"]
-all_packages = cvxpy_packages
+patsy_packages = ["patsy>=0.5.1"]
+formulaic_packages = ["formulaic>=0.6.0"]
+all_packages = cvxpy_packages + patsy_packages + formulaic_packages
 
 docs_packages = [
     "sphinx==4.5.0",
@@ -55,6 +56,8 @@ def read(fname):
     extras_require={
         "base": base_packages,
         "cvxpy": cvxpy_packages,
+        "patsy": patsy_packages,
+        "formulaic": formulaic_packages,
         "all": all_packages,
         "docs": docs_packages,
         "dev": dev_packages,

diff --git a/sklego/notinstalled.py b/sklego/notinstalled.py
@@ -1,4 +1,8 @@
-KNOWN_PACKAGES = {"cvxpy": {"version": ">=1.0.24", "extra_name": "cvxpy"}}
+KNOWN_PACKAGES = {
+    "cvxpy": {"version": ">=1.0.24", "extra_name": "cvxpy"},
+    "formulaic": {"version": ">=0.6.0", "extra_name": "formulaic"},
+    "patsy": {"version": ">=0.5.1", "extra_name": "patsy"},
+}
 
 
 class NotInstalledPackage:

diff --git a/sklego/preprocessing/__init__.py b/sklego/preprocessing/__init__.py
@@ -12,6 +12,7 @@
     "IdentityTransformer",
     "OutlierRemover",
     "DictMapper",
+    "FormulaicTransformer",
 ]
 
 from .intervalencoder import IntervalEncoder
@@ -24,3 +25,4 @@
 from .identitytransformer import IdentityTransformer
 from .outlier_remover import OutlierRemover
 from .dictmapper import DictMapper
+from .formulaictransformer import FormulaicTransformer
diff --git a/sklego/preprocessing/formulaictransformer.py b/sklego/preprocessing/formulaictransformer.py
@@ -0,0 +1,104 @@
+try:
+    import formulaic
+except ImportError:
+    from sklego.notinstalled import NotInstalledPackage
+
+    formulaic = NotInstalledPackage("formulaic")
+
+from sklearn.base import BaseEstimator, TransformerMixin
+from sklearn.utils.validation import check_is_fitted
+
+
+class FormulaicTransformer(TransformerMixin, BaseEstimator):
+    """The `FormulaicTransformer` offers a method to select the right columns from a dataframe as well as a DSL for
+    transformations.
+
+    It is inspired by R formulas. This can be useful as a first step in the pipeline.
+
+    Parameters
+    ----------
+    formula : str
+        A formulaic-compatible formula.
+        Refer to the [formulaic documentation](https://matthewwardrop.github.io/formulaic/guides/grammar/) for more details.
+    return_type : Literal["pandas", "numpy", "sparse"], default="numpy"
+        The type of the returned matrix.
+        Refer to the [formulaic documentation](https://matthewwardrop.github.io/formulaic/guides/model_specs/) for more details.
+
+    Attributes
+    ----------
+    formula_ : formulaic.Formula
+        The parsed formula specification.
+    model_spec_ : formulaic.ModelSpec
+        The parsed model specification.
+    n_features_in_ : int
+        Number of features seen during `fit`.
+    """
+
+    def __init__(self, formula, return_type="numpy"):
+        self.formula = formula
+        self.return_type = return_type
+
+    def fit(self, X, y=None):
+        """Fit the `FormulaicTransformer` to the data by compiling the formula specification into a model spec.
+
+        Parameters
+        ----------
+        X : pd.DataFrame of shape (n_samples, n_features)
+            The data used to compile model spec.
+        y : array-like of shape (n_samples,), default=None
+            Ignored, present for compatibility.
+
+        Returns
+        -------
+        self : FormulaicTransformer
+            The fitted transformer.
+
+        Raises
+        ------
+        ValueError
+            If `formula` is not supported.
+        """
+        self.formula_ = formulaic.Formula.from_spec(self.formula)
+
+        if self.formula_._has_structure:
+            raise ValueError(
+                f"Formula specification {repr(self.formula_)} results in a structured formula, which is not supported."
+            )
+
+        self.model_spec_ = self.formula_.get_model_matrix(
+            X, output=self.return_type
+        ).model_spec
+        self.n_features_in_ = X.shape[1]
+        return self
+
+    def transform(self, X, y=None):
+        """Transform `X` by generating a model matrix from it based on the fit model spec.
+
+        Parameters
+        ----------
+        X : pd.DataFrame of shape (n_samples, n_features)
+            The data to which the transformation will be applied.
+        y : array-like of shape (n_samples,), default=None
+            Ignored, present for compatibility.
+
+        Returns
+        -------
+        X : array-like of shape (n_samples, n_features), and type `return_type`
+            Transformed data.
+
+        Raises
+        ------
+        ValueError
+            If the number of columns from `X` differs from the number of columns when fitting.
+        """
+
+        check_is_fitted(self, ["formula_", "model_spec_", "n_features_in_"])
+
+        if X.shape[1] != self.n_features_in_:
+            raise ValueError(
+                "`X` must have the same number of columns in fit and transform. "
+                f"Expected {self.n_features_in_}, found {X.shape[1]}."
+            )
+
+        X_ = self.model_spec_.get_model_matrix(X)
+        return X_
diff --git a/sklego/preprocessing/patsytransformer.py b/sklego/preprocessing/patsytransformer.py
@@ -1,9 +1,21 @@
+try:
+    import patsy
+except ImportError:
+    from sklego.notinstalled import NotInstalledPackage
+
+    patsy = NotInstalledPackage("patsy")
+
 import numpy as np
-from patsy import dmatrix, build_design_matrices, PatsyError
+from deprecated import deprecated
 from sklearn.base import BaseEstimator, TransformerMixin
 from sklearn.utils.validation import check_is_fitted
 
 
+@deprecated(
+    version="0.6.17",
+    reason="Please use `sklego.preprocessing.FormulaicTransformer` instead. "
+    "This object will be removed from the preprocessing submodule in version 0.8.0.",
+)
 class PatsyTransformer(TransformerMixin, BaseEstimator):
     """
     The patsy transformer offers a method to select the right columns
@@ -20,7 +32,9 @@ def __init__(self, formula, return_type="matrix"):
 
     def fit(self, X, y=None):
         """Fits the estimator"""
-        X_ = dmatrix(self.formula, X, NA_action="raise", return_type=self.return_type)
+        X_ = patsy.dmatrix(
+            self.formula, X, NA_action="raise", return_type=self.return_type
+        )
 
         # check the number of observations hasn't changed. This ought not to
         # be necessary given NA_action='raise' above but just to be safe
@@ -43,8 +57,8 @@ def transform(self, X):
         """
         check_is_fitted(self, "design_info_")
         try:
-            return build_design_matrices(
+            return patsy.build_design_matrices(
                 [self.design_info_], X, return_type=self.return_type
             )[0]
-        except PatsyError as e:
+        except patsy.PatsyError as e:
             raise RuntimeError from e
diff --git a/tests/test_preprocessing/test_formulaic_transformer.py b/tests/test_preprocessing/test_formulaic_transformer.py
@@ -0,0 +1,139 @@
+import pytest
+import joblib
+import numpy as np
+import pandas as pd
+from scipy.sparse import spmatrix
+from sklearn.preprocessing import StandardScaler
+from sklearn.pipeline import Pipeline
+from sklearn.linear_model import LogisticRegression
+
+from sklego.preprocessing import FormulaicTransformer
+
+
+@pytest.fixture()
+def df():
+    return pd.DataFrame(
+        {
+            "a": [1, 2, 3, 4, 5, 6],
+            "b": np.log([10, 9, 8, 7, 6, 5]),
+            "c": ["a", "b", "a", "b", "c", "c"],
+            "d": ["b", "a", "a", "b", "a", "b"],
+            "e": [0, 1, 0, 1, 0, 1],
+        }
+    )
+
+@pytest.mark.parametrize(
+        "return_type, expected_type",
+        [
+            ("numpy", np.ndarray),
+            ("pandas", pd.DataFrame),
+            ("sparse", spmatrix),
+        ],
+)
+def test_return_type(df, return_type, expected_type):
+    X, y = df[["a", "b", "c", "d"]], df[["e"]]
+    tf = FormulaicTransformer("a + b - 1", return_type=return_type)
+    df_fit_transformed = tf.fit(X, y).transform(X)
+    assert isinstance(df_fit_transformed, expected_type)
+
+
+
+@pytest.mark.parametrize(
+        "formula, expected_shape",
+        [
+            ("a + b - 1", (6, 2)),
+            ("a + np.log(a) + b - 1", (6, 3)),
+            ("a*b - 1", (6, 3)),
+            ("a + b + d", (6,4)),
+            ("a + b + c + d", (6,6)),
+        ],
+)
+def test_formula_output(df, formula, expected_shape):
+    X, y = df[["a", "b", "c", "d"]], df[["e"]]
+    tf = FormulaicTransformer(formula=formula)
+
+    assert tf.fit(X, y).transform(X).shape == expected_shape
+
+
+
+def test_pipeline(df):
+    X, y = df[["a", "b", "c", "d"]], df[["e"]].values.ravel()
+
+    pipe = Pipeline(
+        [
+            ("design", FormulaicTransformer("a + np.log(a) + b - 1")),
+            ("scale", StandardScaler()),
+            ("model", LogisticRegression(solver="lbfgs")),
+        ]
+    )
+    assert pipe.fit(X, y).predict(X).shape[0] == X.shape[0]
+
+
+def test_unseen_categories(df):
+    df_train, df_test = df[:4], df[4:]
+
+    X_train, y_train = df_train[["a", "b", "c", "d"]], df_train[["e"]].values.ravel()
+    X_test = df_test[["a", "b", "c", "d"]]
+
+    trf = FormulaicTransformer("a + np.log(a) + b + c + d - 1")
+    _ = trf.fit(X_train, y_train)
+
+    assert trf.transform(X_test).shape[1] == trf.transform(X_train).shape[1]
+
+    pipe = Pipeline(
+        [
+            ("design", FormulaicTransformer("a + np.log(a) + b + c + d - 1")),
+            ("scale", StandardScaler()),
+            ("model", LogisticRegression(solver="lbfgs")),
+        ]
+    )
+
+    _ = pipe.fit(X_train, y_train)
+    assert pipe.predict(X_test).shape[0] == X_test.shape[0]
+
+def test_misshape(df):
+    df_train, df_test = df[:4], df[4:]
+
+    X_train, y_train = df_train[["a", "b", "c", "d"]], df_train[["e"]].values.ravel()
+    X_test = df_test[["a", "b", "c"]]
+
+    trf = FormulaicTransformer("a + np.log(a) + b + c + d - 1")
+    _ = trf.fit(X_train, y_train)
+
+    with pytest.raises(ValueError):
+        trf.transform(X_test)
+
+
+@pytest.mark.parametrize(
+    "return_type", ("numpy", "pandas")
+)
+@pytest.mark.parametrize(
+    "formula", (
+        "a + b - 1",
+        "a + np.log(a) + b - 1",
+        "a*b - 1",
+        "a + b + d",
+        "a + b + c + d",
+    )
+)
+def test_pickling(tmp_path, df, return_type, formula):
+
+    df_train, df_test = df[:4], df[4:]
+
+    X_train, y_train = df_train[["a", "b", "c", "d"]], df_train[["e"]].values.ravel()
+    X_test = df_test[["a", "b", "c", "d"]]
+
+    pipe = Pipeline(
+        [
+            ("design", FormulaicTransformer(formula=formula, return_type=return_type)),
+            ("scale", StandardScaler()),
+            ("model", LogisticRegression(solver="lbfgs")),
+        ]
+    )
+
+    _ = pipe.fit(X_train, y_train)
+
+    joblib.dump(pipe, tmp_path/"pipeline.pkl")
+    loaded_pipe = joblib.load(tmp_path/"pipeline.pkl")
+
+    assert loaded_pipe.predict(X_test).shape[0] == X_test.shape[0]