Merge pull request #339 from lvgig/feature/narwhalify_numericmixin

refactored CheckNumericMixin
lvgig · Nov 11, 2024 · 10ced7a · 10ced7a
2 parents 7ce32de + f913f98
commit 10ced7a
Show file tree

Hide file tree

Showing 8 changed files with 269 additions and 77 deletions.
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -25,6 +25,9 @@ Changed
 - Refactored BaseImputer to utilise narwhals `#314 <https://github.com/lvgig/tubular/issues/314>`_
 - Converted test dfs to flexible pandas/polars setup
 - Converted BaseNominalTransformer to utilise narwhals `#334 <https://github.com/lvgig/tubular/issues/334>`_
+- Narwhalified CheckNumericMixin `#336 <https://github.com/lvgig/tubular/issues/336>`_
+- placeholder
+- placeholder
 - placeholder
 - placeholder
 - placeholder

diff --git a/tests/mixins/test_CheckNumericMixin.py b/tests/mixins/test_CheckNumericMixin.py
@@ -0,0 +1,64 @@
+import re
+
+import pytest
+
+from tests.test_data import (
+    create_bool_and_float_df,
+    create_df_2,
+    create_df_with_none_and_nan_cols,
+    create_is_between_dates_df_1,
+)
+from tubular.mixins import CheckNumericMixin
+
+
+class TestCheckNumericMixin:
+    "tests for CheckNumericMixin class"
+
+    @pytest.mark.parametrize("library", ["pandas", "polars"])
+    @pytest.mark.parametrize(
+        ("df_generator", "bad_cols"),
+        [
+            (create_df_2, ["b", "c"]),  # str
+            (create_is_between_dates_df_1, ["a"]),  # datetime
+            (create_bool_and_float_df, ["b"]),  # bool
+            (create_df_with_none_and_nan_cols, ["b"]),  # None
+        ],
+    )
+    def test_check_numeric_columns_errors(self, library, df_generator, bad_cols):
+        "test check_numeric_columns method raises appropriate error"
+
+        df = df_generator(library=library)
+
+        obj = CheckNumericMixin()
+
+        # this object is generally wrapped in a transformer with a .columns attr, set this here
+        obj.columns = bad_cols
+
+        with pytest.raises(
+            TypeError,
+            match=re.escape(
+                f"CheckNumericMixin: The following columns are not numeric in X; {bad_cols}",
+            ),
+        ):
+            obj.check_numeric_columns(df)
+
+    @pytest.mark.parametrize("library", ["pandas", "polars"])
+    @pytest.mark.parametrize(
+        ("df_generator", "cols"),
+        [
+            (create_df_2, ["a"]),  # int
+            (create_bool_and_float_df, ["a"]),  # float
+            (create_df_with_none_and_nan_cols, ["a"]),  # nan
+        ],
+    )
+    def test_check_numeric_columns_passes(self, library, df_generator, cols):
+        "test check_numeric_columns method passes for numeric columns"
+
+        df = df_generator(library=library)
+
+        obj = CheckNumericMixin()
+
+        # this object is generally wrapped in a transformer with a .columns attr, set this here
+        obj.columns = cols
+
+        obj.check_numeric_columns(df)
diff --git a/tests/numeric/test_BaseNumericTransformer.py b/tests/numeric/test_BaseNumericTransformer.py
@@ -1,5 +1,6 @@
 import re
 
+import pandas as pd
 import pytest
 
 import tests.test_data as d
@@ -19,18 +20,56 @@ class BaseNumericTransformerFitTests(GenericFitTests):
     Note this deliberately avoids starting with "Tests" so that the tests are not run on import.
     """
 
-    def test_non_numeric_exception_raised(self, initialized_transformers):
+    @pytest.mark.parametrize(
+        ("df_generator", "bad_cols"),
+        [
+            (d.create_df_2, ["b"]),  # str
+            (d.create_is_between_dates_df_1, ["a"]),  # datetime
+            (d.create_bool_and_float_df, ["b"]),  # bool
+            (d.create_df_with_none_and_nan_cols, ["b"]),  # None
+        ],
+    )
+    def test_non_numeric_exception_raised(
+        self,
+        initialized_transformers,
+        df_generator,
+        bad_cols,
+    ):
         """Test an exception is raised if self.columns are non-numeric in X."""
-        df = d.create_df_2()
+        df = df_generator()
+        # add in 'target column' for fit
+        df["c"] = [1] * len(df)
 
         x = initialized_transformers[self.transformer_name]
+        x.columns = bad_cols
 
         with pytest.raises(
             TypeError,
-            match=rf"{self.transformer_name}: The following columns are not numeric in X; \['b'\]",
+            match=re.escape(
+                f"{self.transformer_name}: The following columns are not numeric in X; {bad_cols}",
+            ),
         ):
             x.fit(df, df["c"])
 
+    @pytest.mark.parametrize(
+        ("df_generator", "cols"),
+        [
+            (d.create_df_2, ["a"]),  # int
+            (d.create_bool_and_float_df, ["a"]),  # float
+            (d.create_df_with_none_and_nan_cols, ["a"]),  # nan
+        ],
+    )
+    def test_numeric_passes(self, initialized_transformers, df_generator, cols):
+        """Test check passes if self.columns numeric in X."""
+        df = df_generator()
+        # add in 'target column' for fit
+        df["c"] = [1] * len(df)
+
+        x = initialized_transformers[self.transformer_name]
+        x.columns = cols
+
+        x.fit(df, df["c"])
+
 
 class BaseNumericTransformerTransformTests(
     GenericTransformTests,
@@ -40,13 +79,34 @@ class BaseNumericTransformerTransformTests(
     Note this deliberately avoids starting with "Tests" so that the tests are not run on import.
     """
 
-    def test_non_numeric_exception_raised(self, initialized_transformers):
+    @pytest.mark.parametrize(
+        ("df_generator", "bad_cols"),
+        [
+            (d.create_df_2, ["b"]),  # str
+            (d.create_is_between_dates_df_1, ["a"]),  # datetime
+            (d.create_bool_and_float_df, ["b"]),  # bool
+            (d.create_df_with_none_and_nan_cols, ["b"]),  # None
+        ],
+    )
+    def test_non_numeric_exception_raised(
+        self,
+        initialized_transformers,
+        df_generator,
+        bad_cols,
+    ):
         """Test an exception is raised if self.columns are non-numeric in X."""
-        df = d.create_df_2()
-        # make df all non-numeric
-        df["a"] = df["b"]
+        df = df_generator()
+        # add in numeric 'target column', used both as the fit target and to build a numeric df to fit on
+        df["c"] = [1] * len(df)
 
         x = initialized_transformers[self.transformer_name]
+        x.columns = bad_cols
+
+        # if the transformer fits, run a working fit before transform
+        if x.FITS:
+            # create numeric df to fit on
+            numeric_df = pd.DataFrame({col: df["c"] for col in [*x.columns, "c"]})
+            x.fit(numeric_df, numeric_df["c"])
 
         with pytest.raises(
             TypeError,
@@ -56,6 +116,31 @@ def test_non_numeric_exception_raised(self, initialized_transformers):
         ):
             x.transform(df)
 
+    @pytest.mark.parametrize(
+        ("df_generator"),
+        [
+            d.create_df_2,  # int
+            d.create_bool_and_float_df,  # float
+            d.create_df_with_none_and_nan_cols,  # nan
+        ],
+    )
+    def test_numeric_passes(self, initialized_transformers, df_generator):
+        """Test check passes if self.columns numeric in X."""
+        df = df_generator()
+        # add in 'target column' for fit and an additional numeric column
+        df["c"] = [1] * len(df)
+        df["b"] = [1] * len(df)
+
+        x = initialized_transformers[self.transformer_name]
+        x.columns = ["a", "b"]
+
+        if x.FITS:
+            # create numeric df to fit on
+            numeric_df = pd.DataFrame({col: df["c"] for col in [*x.columns, "c"]})
+            x.fit(numeric_df, numeric_df["c"])
+
+        x.transform(df)
+
 
 class TestInit(BaseNumericTransformerInitTests):
     """Tests for BaseNumericTransformer.init()"""

diff --git a/tests/numeric/test_CutTransformer.py b/tests/numeric/test_CutTransformer.py
@@ -65,6 +65,29 @@ def expected_df_1():
 
         return df
 
+    @pytest.mark.parametrize(
+        ("df_generator"),
+        [
+            d.create_df_2,  # int
+            d.create_bool_and_float_df,  # float
+        ],
+    )
+    def test_numeric_passes(self, initialized_transformers, df_generator):
+        """Test check passes if self.columns numeric in X - this transformer does not work on all-null columns,
+        so override this test"""
+        df = df_generator()
+        # add in 'target column' for fit and an additional numeric column
+        df["c"] = [1] * len(df)
+        df["b"] = [1] * len(df)
+
+        x = initialized_transformers[self.transformer_name]
+        x.columns = ["a", "b"]
+
+        numeric_df = pd.DataFrame({col: df["c"] for col in [*x.columns, "c"]})
+        x.fit(numeric_df, numeric_df["c"])
+
+        x.transform(df)
+
     def test_output_from_cut_assigned_to_column(self, mocker):
         """Test that the output from pd.cut is assigned to column with name new_column_name."""
         df = d.create_df_9()

diff --git a/tests/numeric/test_TwoColumnOperatorTransformer.py b/tests/numeric/test_TwoColumnOperatorTransformer.py
@@ -3,11 +3,13 @@
 
 import tests.test_data as d
 from tests.base_tests import (
-    GenericTransformTests,
     NewColumnNameInitMixintests,
     OtherBaseBehaviourTests,
     TwoColumnListInitTests,
 )
+from tests.numeric.test_BaseNumericTransformer import (
+    BaseNumericTransformerTransformTests,
+)
 from tubular.numeric import TwoColumnOperatorTransformer
 
 
@@ -40,7 +42,7 @@ def test_axis_not_valid_error(self):
             )
 
 
-class TestTransform(GenericTransformTests):
+class TestTransform(BaseNumericTransformerTransformTests):
     """Tests for transformer.transform."""
 
     @classmethod
@@ -74,19 +76,6 @@ def test_expected_output(self, pd_method_name, output):
             msg_tag="TwoColumnMethod transformer does not produce the expected output",
         )
 
-    def test_non_numeric_error(self):
-        x = TwoColumnOperatorTransformer(
-            "mul",
-            ["a", "b"],
-            "c",
-        )
-
-        with pytest.raises(
-            TypeError,
-            match="TwoColumnOperatorTransformer: input columns in X must contain only numeric values",
-        ):
-            x.transform(d.create_df_8())
-
 
 class TestOtherBaseBehaviour(OtherBaseBehaviourTests):
     """

diff --git a/tests/test_data.py b/tests/test_data.py
@@ -3,6 +3,7 @@
 import datetime
 
 import narwhals as nw
+import numpy as np
 import pandas as pd
 import polars as pl
 
@@ -211,6 +212,28 @@ def create_df_11(library="pandas"):
     return u.dataframe_init_dispatch(df_dict, library=library)
 
 
+def create_bool_and_float_df(library="pandas"):
+    """Create simple DataFrame to use in other tests."""
+
+    df_dict = {
+        "a": [1.0, 2.0, np.nan],
+        "b": [True, False, None],
+    }
+
+    return u.dataframe_init_dispatch(df_dict, library=library)
+
+
+def create_df_with_none_and_nan_cols(library="pandas"):
+    """Create simple DataFrame to use in other tests."""
+
+    df_dict = {
+        "a": [np.nan, np.nan, np.nan],
+        "b": [None, None, None],
+    }
+
+    return u.dataframe_init_dispatch(df_dict, library=library)
+
+
 def create_weighted_imputers_test_df(library="pandas"):
     """Create DataFrame to use imputer tests that correct values are imputed for weighted dataframes.