Skip to content

Commit

Permalink
Introducing FormulaicTransformer and deprecating PatsyTransformer (
Browse files Browse the repository at this point in the history
…#593)

* relax patsy dependency

* deprecating patsy, adding formulaic

* formulaic tests

* formulaic pickling test

* patsy deprecation version change
  • Loading branch information
FBruzzesi authored Oct 27, 2023
1 parent 02e8154 commit ecc11aa
Show file tree
Hide file tree
Showing 6 changed files with 273 additions and 7 deletions.
7 changes: 5 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,13 @@
base_packages = [
"scikit-learn>=1.0",
"pandas>=1.1.5",
"patsy>=0.5.1",
"Deprecated>=1.2.6",
"umap-learn>=0.4.6"
]
cvxpy_packages = ["cvxpy>=1.1.8"]
all_packages = cvxpy_packages
patsy_packages = ["patsy>=0.5.1"]
formulaic_packages = ["formulaic>=0.6.0"]
all_packages = cvxpy_packages + patsy_packages + formulaic_packages

docs_packages = [
"sphinx==4.5.0",
Expand Down Expand Up @@ -55,6 +56,8 @@ def read(fname):
extras_require={
"base": base_packages,
"cvxpy": cvxpy_packages,
"patsy": patsy_packages,
"formulaic": formulaic_packages,
"all": all_packages,
"docs": docs_packages,
"dev": dev_packages,
Expand Down
6 changes: 5 additions & 1 deletion sklego/notinstalled.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,8 @@
KNOWN_PACKAGES = {"cvxpy": {"version": ">=1.0.24", "extra_name": "cvxpy"}}
KNOWN_PACKAGES = {
"cvxpy": {"version": ">=1.0.24", "extra_name": "cvxpy"},
"formulaic": {"version": ">=0.6.0", "extra_name": "formulaic"},
"patsy": {"version": ">=0.5.1", "extra_name": "patsy"},
}


class NotInstalledPackage:
Expand Down
2 changes: 2 additions & 0 deletions sklego/preprocessing/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
"IdentityTransformer",
"OutlierRemover",
"DictMapper",
"FormulaicTransformer",
]

from .intervalencoder import IntervalEncoder
Expand All @@ -24,3 +25,4 @@
from .identitytransformer import IdentityTransformer
from .outlier_remover import OutlierRemover
from .dictmapper import DictMapper
from .formulaictransformer import FormulaicTransformer
104 changes: 104 additions & 0 deletions sklego/preprocessing/formulaictransformer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
try:
import formulaic
except ImportError:
from sklego.notinstalled import NotInstalledPackage

formulaic = NotInstalledPackage("formulaic")

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_is_fitted


class FormulaicTransformer(TransformerMixin, BaseEstimator):
    """Build a model matrix from a dataframe using a formulaic formula.

    The formula acts both as a column selector and as a small DSL for
    transformations, in the spirit of R formulas. This can be useful as a
    first step in a pipeline.

    Parameters
    ----------
    formula : str
        A formulaic-compatible formula.
        Refer to the [formulaic documentation](https://matthewwardrop.github.io/formulaic/guides/grammar/) for more details.
    return_type : Literal["pandas", "numpy", "sparse"], default="numpy"
        The type of the returned matrix; forwarded to formulaic as the `output` argument during `fit`.
        Refer to the [formulaic documentation](https://matthewwardrop.github.io/formulaic/guides/model_specs/) for more details.

    Attributes
    ----------
    formula_ : formulaic.Formula
        The formula obtained by parsing `formula` with `formulaic.Formula.from_spec`.
    model_spec_ : formulaic.ModelSpec
        The model spec taken from the model matrix generated during `fit`.
    n_features_in_ : int
        Number of columns of `X` seen during `fit`.
    """

    def __init__(self, formula, return_type="numpy"):
        self.formula = formula
        self.return_type = return_type

    def fit(self, X, y=None):
        """Parse the formula and derive the model spec from `X`.

        Parameters
        ----------
        X : pd.DataFrame of shape (n_samples, n_features)
            The data used to compile the model spec.
        y : array-like of shape (n_samples,), default=None
            Ignored, present for compatibility.

        Returns
        -------
        self : FormulaicTransformer
            The fitted transformer.

        Raises
        ------
        ValueError
            If the parsed formula is a structured formula, which is not supported.
        """
        self.formula_ = formulaic.Formula.from_spec(self.formula)

        # Structured formulas are rejected up front, before any matrix is built.
        if self.formula_._has_structure:
            raise ValueError(
                f"Formula specification {repr(self.formula_)} results in a structured formula, which is not supported."
            )

        # Only the spec is kept; the matrix generated here is discarded.
        fitted_matrix = self.formula_.get_model_matrix(X, output=self.return_type)
        self.model_spec_ = fitted_matrix.model_spec
        self.n_features_in_ = X.shape[1]
        return self

    def transform(self, X, y=None):
        """Generate a model matrix from `X` using the model spec stored by `fit`.

        Parameters
        ----------
        X : pd.DataFrame of shape (n_samples, n_features)
            The data to which the transformation will be applied.
        y : array-like of shape (n_samples,), default=None
            Ignored, present for compatibility.

        Returns
        -------
        X : array-like
            Transformed data, as produced by the fitted model spec.

        Raises
        ------
        ValueError
            If the number of columns of `X` differs from the number of columns seen when fitting.
        """
        check_is_fitted(self, ["formula_", "model_spec_", "n_features_in_"])

        # Happy path first: matching column count goes straight to the model spec.
        if X.shape[1] == self.n_features_in_:
            return self.model_spec_.get_model_matrix(X)

        raise ValueError(
            "`X` must have the same number of columns in fit and transform. "
            f"Expected {self.n_features_in_}, found {X.shape[1]}."
        )
22 changes: 18 additions & 4 deletions sklego/preprocessing/patsytransformer.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,21 @@
try:
import patsy
except ImportError:
from sklego.notinstalled import NotInstalledPackage

patsy = NotInstalledPackage("patsy")

import numpy as np
from patsy import dmatrix, build_design_matrices, PatsyError
from deprecated import deprecated
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_is_fitted


@deprecated(
version="0.6.17",
reason="Please use `sklego.preprocessing.FormulaicTransformer` instead. "
"This object will be removed from the preprocessing submodule in version 0.8.0.",
)
class PatsyTransformer(TransformerMixin, BaseEstimator):
"""
The patsy transformer offers a method to select the right columns
Expand All @@ -20,7 +32,9 @@ def __init__(self, formula, return_type="matrix"):

def fit(self, X, y=None):
"""Fits the estimator"""
X_ = dmatrix(self.formula, X, NA_action="raise", return_type=self.return_type)
X_ = patsy.dmatrix(
self.formula, X, NA_action="raise", return_type=self.return_type
)

# check the number of observations hasn't changed. This ought not to
# be necessary given NA_action='raise' above but just to be safe
Expand All @@ -43,8 +57,8 @@ def transform(self, X):
"""
check_is_fitted(self, "design_info_")
try:
return build_design_matrices(
return patsy.build_design_matrices(
[self.design_info_], X, return_type=self.return_type
)[0]
except PatsyError as e:
except patsy.PatsyError as e:
raise RuntimeError from e
139 changes: 139 additions & 0 deletions tests/test_preprocessing/test_formulaic_transformer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
import pytest
import joblib
import numpy as np
import pandas as pd
from scipy.sparse import spmatrix
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

from sklego.preprocessing import FormulaicTransformer


@pytest.fixture()
def df():
    """Return a six-row frame with numeric columns (a, b, e) and string columns (c, d)."""
    columns = {
        "a": [1, 2, 3, 4, 5, 6],
        "b": np.log([10, 9, 8, 7, 6, 5]),
        "c": ["a", "b", "a", "b", "c", "c"],
        "d": ["b", "a", "a", "b", "a", "b"],
        "e": [0, 1, 0, 1, 0, 1],
    }
    return pd.DataFrame(columns)

@pytest.mark.parametrize(
    "return_type, expected_type",
    [
        ("numpy", np.ndarray),
        ("pandas", pd.DataFrame),
        ("sparse", spmatrix),
    ],
)
def test_return_type(df, return_type, expected_type):
    """Each `return_type` must produce a transformed matrix of the matching type."""
    features = df[["a", "b", "c", "d"]]
    target = df[["e"]]

    transformer = FormulaicTransformer("a + b - 1", return_type=return_type)
    fitted = transformer.fit(features, target)
    transformed = fitted.transform(features)

    assert isinstance(transformed, expected_type)



@pytest.mark.parametrize(
    "formula, expected_shape",
    [
        ("a + b - 1", (6, 2)),
        ("a + np.log(a) + b - 1", (6, 3)),
        ("a*b - 1", (6, 3)),
        ("a + b + d", (6, 4)),
        ("a + b + c + d", (6, 6)),
    ],
)
def test_formula_output(df, formula, expected_shape):
    """The transformed matrix must have the shape expected for each formula."""
    features = df[["a", "b", "c", "d"]]
    target = df[["e"]]

    fitted = FormulaicTransformer(formula=formula).fit(features, target)
    design = fitted.transform(features)

    assert design.shape == expected_shape



def test_pipeline(df):
    """The transformer must work as the first step of a scikit-learn pipeline."""
    features = df[["a", "b", "c", "d"]]
    labels = df[["e"]].values.ravel()

    steps = [
        ("design", FormulaicTransformer("a + np.log(a) + b - 1")),
        ("scale", StandardScaler()),
        ("model", LogisticRegression(solver="lbfgs")),
    ]
    predictions = Pipeline(steps).fit(features, labels).predict(features)

    # One prediction per input row.
    assert predictions.shape[0] == features.shape[0]


def test_unseen_categories(df):
    """Transforming rows whose `c` category was absent at fit time keeps the column count.

    The first four rows only contain "a"/"b" in column `c`, while the last
    two rows contain "c".
    """
    feature_columns = ["a", "b", "c", "d"]
    formula = "a + np.log(a) + b + c + d - 1"

    train_rows, test_rows = df[:4], df[4:]
    X_train = train_rows[feature_columns]
    y_train = train_rows[["e"]].values.ravel()
    X_test = test_rows[feature_columns]

    # Standalone transformer: same number of output columns on both splits.
    transformer = FormulaicTransformer(formula)
    transformer.fit(X_train, y_train)
    n_test_columns = transformer.transform(X_test).shape[1]
    n_train_columns = transformer.transform(X_train).shape[1]
    assert n_test_columns == n_train_columns

    # Inside a pipeline: prediction on the held-out rows must still succeed.
    steps = [
        ("design", FormulaicTransformer(formula)),
        ("scale", StandardScaler()),
        ("model", LogisticRegression(solver="lbfgs")),
    ]
    pipeline = Pipeline(steps)
    pipeline.fit(X_train, y_train)

    assert pipeline.predict(X_test).shape[0] == X_test.shape[0]

def test_misshape(df):
    """Transforming data with fewer columns than seen during fit must raise `ValueError`."""
    train_rows, test_rows = df[:4], df[4:]

    X_train = train_rows[["a", "b", "c", "d"]]
    y_train = train_rows[["e"]].values.ravel()
    # Column "d" is deliberately dropped from the data passed to transform.
    X_test = test_rows[["a", "b", "c"]]

    transformer = FormulaicTransformer("a + np.log(a) + b + c + d - 1")
    transformer.fit(X_train, y_train)

    with pytest.raises(ValueError):
        transformer.transform(X_test)


@pytest.mark.parametrize(
    "return_type", ("numpy", "pandas")
)
@pytest.mark.parametrize(
    "formula", (
        "a + b - 1",
        "a + np.log(a) + b - 1",
        "a*b - 1",
        "a + b + d",
        "a + b + c + d",
    )
)
def test_pickling(tmp_path, df, return_type, formula):
    """A fitted pipeline containing the transformer must survive a joblib dump/load round trip."""
    feature_columns = ["a", "b", "c", "d"]
    train_rows, test_rows = df[:4], df[4:]

    X_train = train_rows[feature_columns]
    y_train = train_rows[["e"]].values.ravel()
    X_test = test_rows[feature_columns]

    steps = [
        ("design", FormulaicTransformer(formula=formula, return_type=return_type)),
        ("scale", StandardScaler()),
        ("model", LogisticRegression(solver="lbfgs")),
    ]
    pipeline = Pipeline(steps)
    pipeline.fit(X_train, y_train)

    # Persist to the pytest-provided temporary directory and read it back.
    pickle_path = tmp_path / "pipeline.pkl"
    joblib.dump(pipeline, pickle_path)
    restored = joblib.load(pickle_path)

    assert restored.predict(X_test).shape[0] == X_test.shape[0]

0 comments on commit ecc11aa

Please sign in to comment.