diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 9409cf61..4db40ad0 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -25,6 +25,9 @@ Changed - Refactored BaseImputer to utilise narwhals `#314 _` - Converted test dfs to flexible pandas/polars setup - Converted BaseNominalTransformer to utilise narwhals `#334 _` +- narwhalified CheckNumericMixin `#336 _` +- placeholder +- placeholder - placeholder - placeholder - placeholder diff --git a/tests/mixins/test_CheckNumericMixin.py b/tests/mixins/test_CheckNumericMixin.py new file mode 100644 index 00000000..9f3a00cb --- /dev/null +++ b/tests/mixins/test_CheckNumericMixin.py @@ -0,0 +1,64 @@ +import re + +import pytest + +from tests.test_data import ( + create_bool_and_float_df, + create_df_2, + create_df_with_none_and_nan_cols, + create_is_between_dates_df_1, +) +from tubular.mixins import CheckNumericMixin + + +class TestCheckNumericMixin: + "tests for CheckNumericMixin class" + + @pytest.mark.parametrize("library", ["pandas", "polars"]) + @pytest.mark.parametrize( + ("df_generator", "bad_cols"), + [ + (create_df_2, ["b", "c"]), # str + (create_is_between_dates_df_1, ["a"]), # datetime + (create_bool_and_float_df, ["b"]), # bool + (create_df_with_none_and_nan_cols, ["b"]), # None + ], + ) + def test_check_numeric_columns_errors(self, library, df_generator, bad_cols): + "test check_numeric_columns method raises appropriate error" + + df = df_generator(library=library) + + obj = CheckNumericMixin() + + # this object is generally wrapped in a transformer with a .columns attr, set this here + obj.columns = bad_cols + + with pytest.raises( + TypeError, + match=re.escape( + f"CheckNumericMixin: The following columns are not numeric in X; {bad_cols}", + ), + ): + obj.check_numeric_columns(df) + + @pytest.mark.parametrize("library", ["pandas", "polars"]) + @pytest.mark.parametrize( + ("df_generator", "cols"), + [ + (create_df_2, ["a"]), # int + (create_bool_and_float_df, ["a"]), # float + (create_df_with_none_and_nan_cols, 
["a"]), # nan + ], + ) + def test_check_numeric_columns_passes(self, library, df_generator, cols): + "test check_numeric_columns method passes for numeric columns" + + df = df_generator(library=library) + + obj = CheckNumericMixin() + + # this object is generally wrapped in a transformer with a .columns attr, set this here + obj.columns = cols + + obj.check_numeric_columns(df) diff --git a/tests/numeric/test_BaseNumericTransformer.py b/tests/numeric/test_BaseNumericTransformer.py index ef6c8d73..72f2c3f9 100644 --- a/tests/numeric/test_BaseNumericTransformer.py +++ b/tests/numeric/test_BaseNumericTransformer.py @@ -1,5 +1,6 @@ import re +import pandas as pd import pytest import tests.test_data as d @@ -19,18 +20,56 @@ class BaseNumericTransformerFitTests(GenericFitTests): Note this deliberately avoids starting with "Tests" so that the tests are not run on import. """ - def test_non_numeric_exception_raised(self, initialized_transformers): + @pytest.mark.parametrize( + ("df_generator", "bad_cols"), + [ + (d.create_df_2, ["b"]), # str + (d.create_is_between_dates_df_1, ["a"]), # datetime + (d.create_bool_and_float_df, ["b"]), # bool + (d.create_df_with_none_and_nan_cols, ["b"]), # None + ], + ) + def test_non_numeric_exception_raised( + self, + initialized_transformers, + df_generator, + bad_cols, + ): """Test an exception is raised if self.columns are non-numeric in X.""" - df = d.create_df_2() + df = df_generator() + # add in 'target column' for fit + df["c"] = [1] * len(df) x = initialized_transformers[self.transformer_name] + x.columns = bad_cols with pytest.raises( TypeError, - match=rf"{self.transformer_name}: The following columns are not numeric in X; \['b'\]", + match=re.escape( + f"{self.transformer_name}: The following columns are not numeric in X; {bad_cols}", + ), ): x.fit(df, df["c"]) + @pytest.mark.parametrize( + ("df_generator", "cols"), + [ + (d.create_df_2, ["a"]), # int + (d.create_bool_and_float_df, ["a"]), # float + 
(d.create_df_with_none_and_nan_cols, ["a"]), # nan + ], + ) + def test_numeric_passes(self, initialized_transformers, df_generator, cols): + """Test check passes if self.columns numeric in X.""" + df = df_generator() + # add in 'target column' for fit + df["c"] = [1] * len(df) + + x = initialized_transformers[self.transformer_name] + x.columns = cols + + x.fit(df, df["c"]) + class BaseNumericTransformerTransformTests( GenericTransformTests, @@ -40,13 +79,34 @@ class BaseNumericTransformerTransformTests( Note this deliberately avoids starting with "Tests" so that the tests are not run on import. """ - def test_non_numeric_exception_raised(self, initialized_transformers): + @pytest.mark.parametrize( + ("df_generator", "bad_cols"), + [ + (d.create_df_2, ["b"]), # str + (d.create_is_between_dates_df_1, ["a"]), # datetime + (d.create_bool_and_float_df, ["b"]), # bool + (d.create_df_with_none_and_nan_cols, ["b"]), # None + ], + ) + def test_non_numeric_exception_raised( + self, + initialized_transformers, + df_generator, + bad_cols, + ): """Test an exception is raised if self.columns are non-numeric in X.""" - df = d.create_df_2() - # make df all non-numeric - df["a"] = df["b"] + df = df_generator() + # add in 'target column' for and additional numeric column fit + df["c"] = [1] * len(df) x = initialized_transformers[self.transformer_name] + x.columns = bad_cols + + # if the transformer fits, run a working fit before transform + if x.FITS: + # create numeric df to fit on + numeric_df = pd.DataFrame({col: df["c"] for col in [*x.columns, "c"]}) + x.fit(numeric_df, numeric_df["c"]) with pytest.raises( TypeError, @@ -56,6 +116,31 @@ def test_non_numeric_exception_raised(self, initialized_transformers): ): x.transform(df) + @pytest.mark.parametrize( + ("df_generator"), + [ + d.create_df_2, # int + d.create_bool_and_float_df, # float + d.create_df_with_none_and_nan_cols, # nan + ], + ) + def test_numeric_passes(self, initialized_transformers, df_generator): + """Test check 
passes if self.columns numeric in X.""" + df = df_generator() + # add in 'target column' for and additional numeric column fit + df["c"] = [1] * len(df) + df["b"] = [1] * len(df) + + x = initialized_transformers[self.transformer_name] + x.columns = ["a", "b"] + + if x.FITS: + # create numeric df to fit on + numeric_df = pd.DataFrame({col: df["c"] for col in [*x.columns, "c"]}) + x.fit(numeric_df, numeric_df["c"]) + + x.transform(df) + class TestInit(BaseNumericTransformerInitTests): """Tests for BaseNumericTransformer.init()""" diff --git a/tests/numeric/test_CutTransformer.py b/tests/numeric/test_CutTransformer.py index fca7d126..5c2c5855 100644 --- a/tests/numeric/test_CutTransformer.py +++ b/tests/numeric/test_CutTransformer.py @@ -65,6 +65,29 @@ def expected_df_1(): return df + @pytest.mark.parametrize( + ("df_generator"), + [ + d.create_df_2, # int + d.create_bool_and_float_df, # float + ], + ) + def test_numeric_passes(self, initialized_transformers, df_generator): + """Test check passes if self.columns numeric in X - this transformer does not work on all null column + so overload this test""" + df = df_generator() + # add in 'target column' for and additional numeric column fit + df["c"] = [1] * len(df) + df["b"] = [1] * len(df) + + x = initialized_transformers[self.transformer_name] + x.columns = ["a", "b"] + + numeric_df = pd.DataFrame({col: df["c"] for col in [*x.columns, "c"]}) + x.fit(numeric_df, numeric_df["c"]) + + x.transform(df) + def test_output_from_cut_assigned_to_column(self, mocker): """Test that the output from pd.cut is assigned to column with name new_column_name.""" df = d.create_df_9() diff --git a/tests/numeric/test_TwoColumnOperatorTransformer.py b/tests/numeric/test_TwoColumnOperatorTransformer.py index c731941d..afec10a6 100644 --- a/tests/numeric/test_TwoColumnOperatorTransformer.py +++ b/tests/numeric/test_TwoColumnOperatorTransformer.py @@ -3,11 +3,13 @@ import tests.test_data as d from tests.base_tests import ( - 
GenericTransformTests, NewColumnNameInitMixintests, OtherBaseBehaviourTests, TwoColumnListInitTests, ) +from tests.numeric.test_BaseNumericTransformer import ( + BaseNumericTransformerTransformTests, +) from tubular.numeric import TwoColumnOperatorTransformer @@ -40,7 +42,7 @@ def test_axis_not_valid_error(self): ) -class TestTransform(GenericTransformTests): +class TestTransform(BaseNumericTransformerTransformTests): """Tests for transformer.transform.""" @classmethod @@ -74,19 +76,6 @@ def test_expected_output(self, pd_method_name, output): msg_tag="TwoColumnMethod transformer does not produce the expected output", ) - def test_non_numeric_error(self): - x = TwoColumnOperatorTransformer( - "mul", - ["a", "b"], - "c", - ) - - with pytest.raises( - TypeError, - match="TwoColumnOperatorTransformer: input columns in X must contain only numeric values", - ): - x.transform(d.create_df_8()) - class TestOtherBaseBehaviour(OtherBaseBehaviourTests): """ diff --git a/tests/test_data.py b/tests/test_data.py index 2c314dc7..2ac13e8c 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -3,6 +3,7 @@ import datetime import narwhals as nw +import numpy as np import pandas as pd import polars as pl @@ -211,6 +212,28 @@ def create_df_11(library="pandas"): return u.dataframe_init_dispatch(df_dict, library=library) +def create_bool_and_float_df(library="pandas"): + """Create simple DataFrame to use in other tests.""" + + df_dict = { + "a": [1.0, 2.0, np.nan], + "b": [True, False, None], + } + + return u.dataframe_init_dispatch(df_dict, library=library) + + +def create_df_with_none_and_nan_cols(library="pandas"): + """Create simple DataFrame to use in other tests.""" + + df_dict = { + "a": [np.nan, np.nan, np.nan], + "b": [None, None, None], + } + + return u.dataframe_init_dispatch(df_dict, library=library) + + def create_weighted_imputers_test_df(library="pandas"): """Create DataFrame to use imputer tests that correct values are imputed for weighted dataframes. 
diff --git a/tubular/mixins.py b/tubular/mixins.py index 4f578ddb..727a696b 100644 --- a/tubular/mixins.py +++ b/tubular/mixins.py @@ -5,31 +5,38 @@ import narwhals as nw import narwhals.selectors as ncs import numpy as np -import pandas as pd if TYPE_CHECKING: - from narwhals.typing import FrameT + import pandas as pd + from narwhals.typing import FrameT class CheckNumericMixin: - def check_numeric_columns(self, X: pd.DataFrame) -> pd.DataFrame: + """ + Mixin class with methods for numeric transformers + + """ + + def classname(self) -> str: + """Method that returns the name of the current class when called.""" + + return type(self).__name__ + + @nw.narwhalify + def check_numeric_columns(self, X: FrameT) -> FrameT: """Helper function for checking column args are numeric for numeric transformers. Args: ---- - X (pd.DataFrame): Data containing columns to check. + X: Data containing columns to check. """ - numeric_column_types = X[self.columns].apply( - pd.api.types.is_numeric_dtype, - axis=0, + non_numeric_columns = list( + set(self.columns).difference(set(X.select(ncs.numeric()).columns)), ) - - if not numeric_column_types.all(): - non_numeric_columns = list( - numeric_column_types.loc[~numeric_column_types].index, - ) - + # sort as set ordering can be inconsistent + non_numeric_columns.sort() + if len(non_numeric_columns) > 0: msg = f"{self.classname()}: The following columns are not numeric in X; {non_numeric_columns}" raise TypeError(msg) @@ -138,7 +145,7 @@ class WeightColumnMixin: class attribute, indicates whether transformer has been converted to polars/pandas agnostic narwhals framework """ - polars_compatible = False + polars_compatible = True @nw.narwhalify def check_weights_column(self, X: FrameT, weights_column: str) -> None: @@ -161,22 +168,22 @@ def check_weights_column(self, X: FrameT, weights_column: str) -> None: raise ValueError(msg) # check weight is positive - if X[weights_column].min() < 0: + if X.select(nw.col(weights_column).min()).item() < 
0: msg = f"{self.classname()}: weight column must be positive" raise ValueError(msg) # check weight non-null - if X[weights_column].is_null().sum() != 0: + if X.select(nw.col(weights_column).is_null().sum()).item() != 0: msg = f"{self.classname()}: weight column must be non-null" raise ValueError(msg) - # check weight not inf + # check weight not inf, not currently a narwhals efficient way to do this if np.isinf(X[weights_column].to_numpy()).any(): msg = f"{self.classname()}: weight column must not contain infinite values." raise ValueError(msg) # check weight not all 0 - if X[weights_column].sum() == 0: + if X.select(nw.col(weights_column).sum()).item() == 0: msg = f"{self.classname()}: total sample weights are not greater than 0" raise ValueError(msg) diff --git a/tubular/numeric.py b/tubular/numeric.py index 1713fe2d..e93f77f9 100644 --- a/tubular/numeric.py +++ b/tubular/numeric.py @@ -21,7 +21,7 @@ ) -class BaseNumericTransformer(BaseTransformer): +class BaseNumericTransformer(BaseTransformer, CheckNumericMixin): """ Extends BaseTransformer for datetime scenarios. 
@@ -40,40 +40,18 @@ class BaseNumericTransformer(BaseTransformer): polars_compatible : bool class attribute, indicates whether transformer has been converted to polars/pandas agnostic narwhals framework + FITS: bool + class attribute, indicates whether transform requires fit to be run first """ polars_compatible = False + FITS = False + def __init__(self, columns: list[str], **kwargs: dict[str, bool]) -> None: super().__init__(columns=columns, **kwargs) - def _check_numeric(self, X: pd.DataFrame) -> None: - """Raise a type error if a column to be operated on is not numeric - - Parameters - ---------- - - X: pd.DataFrame - Data to validate - - """ - - numeric_column_types = X[self.columns].apply( - pd.api.types.is_numeric_dtype, - axis=0, - ) - - if not numeric_column_types.all(): - non_numeric_columns = list( - numeric_column_types.loc[~numeric_column_types].index, - ) - - msg = f"{self.classname()}: The following columns are not numeric in X; {non_numeric_columns}" - raise TypeError(msg) - - return X - def fit( self, X: pd.DataFrame, @@ -93,7 +71,7 @@ def fit( super().fit(X, y) - self._check_numeric(X) + CheckNumericMixin.check_numeric_columns(self, X) return self @@ -114,7 +92,7 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame: X = super().transform(X) - self._check_numeric(X) + CheckNumericMixin.check_numeric_columns(self, X) return X @@ -161,11 +139,15 @@ class LogTransformer(BaseNumericTransformer, DropOriginalMixin): polars_compatible : bool class attribute, indicates whether transformer has been converted to polars/pandas agnostic narwhals framework + FITS: bool + class attribute, indicates whether transform requires fit to be run first """ polars_compatible = False + FITS = False + def __init__( self, columns: str | list[str] | None, @@ -273,11 +255,15 @@ class CutTransformer(BaseNumericTransformer): polars_compatible : bool class attribute, indicates whether transformer has been converted to polars/pandas agnostic narwhals framework + FITS: bool + 
class attribute, indicates whether transform requires fit to be run first """ polars_compatible = False + FITS = False + def __init__( self, column: str, @@ -337,6 +323,7 @@ class TwoColumnOperatorTransformer( NewColumnNameMixin, TwoColumnMixin, DataFrameMethodTransformer, + BaseNumericTransformer, ): """This transformer applies a pandas.DataFrame method to two columns (add, sub, mul, div, mod, pow). @@ -384,11 +371,15 @@ class TwoColumnOperatorTransformer( polars_compatible : bool class attribute, indicates whether transformer has been converted to polars/pandas agnostic narwhals framework + FITS: bool + class attribute, indicates whether transform requires fit to be run first """ polars_compatible = False + FITS = False + def __init__( self, pd_method_name: str, @@ -437,14 +428,9 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame: ------- pd.DataFrame: Input X with an additional column. """ - # call DataFrameMethodTransformer.transform + # call appropriate parent transforms X = super(DataFrameMethodTransformer, self).transform(X) - - is_numeric = X[self.columns].apply(pd.api.types.is_numeric_dtype, axis=0) - - if not is_numeric.all(): - msg = f"{self.classname()}: input columns in X must contain only numeric values" - raise TypeError(msg) + X = super(BaseNumericTransformer, self).transform(X) X[self.new_column_name] = getattr(X[[self.column1_name]], self.pd_method_name)( X[self.column2_name], @@ -480,11 +466,15 @@ class ScalingTransformer(BaseNumericTransformer): polars_compatible : bool class attribute, indicates whether transformer has been converted to polars/pandas agnostic narwhals framework + FITS: bool + class attribute, indicates whether transform requires fit to be run first """ polars_compatible = False + FITS = True + # Dictionary mapping scaler types to their corresponding sklearn classes scaler_options = { "min_max": MinMaxScaler, @@ -608,11 +598,15 @@ class InteractionTransformer(BaseNumericTransformer): number of total columns of transformed 
dataset, including new interaction features polars_compatible : bool class attribute, indicates whether transformer has been converted to polars/pandas agnostic narwhals framework + FITS: bool + class attribute, indicates whether transform requires fit to be run first """ polars_compatible = False + FITS = False + def __init__( self, columns: str | list[str] | None, @@ -716,7 +710,7 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame: return X -class PCATransformer(CheckNumericMixin, BaseTransformer): +class PCATransformer(BaseNumericTransformer): """Transformer that generates variables using Principal component analysis (PCA). Linear dimensionality reduction using Singular Value Decomposition of the data to project it to a lower dimensional space. @@ -783,12 +777,16 @@ class PCATransformer(CheckNumericMixin, BaseTransformer): list of feature name representing the new dimensions. polars_compatible : bool class attribute, indicates whether transformer has been converted to polars/pandas agnostic narwhals framework + FITS: bool + class attribute, indicates whether transform requires fit to be run first """ polars_compatible = False + FITS = True + def __init__( self, columns: str | list[str] | None,