Skip to content

Commit

Permalink
Merge pull request #339 from lvgig/feature/narwhalify_numericmixin
Browse files Browse the repository at this point in the history
refactored CheckNumericMixin
  • Loading branch information
limlam96 authored Nov 11, 2024
2 parents 7ce32de + f913f98 commit 10ced7a
Show file tree
Hide file tree
Showing 8 changed files with 269 additions and 77 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,9 @@ Changed
- Refactored BaseImputer to utilise narwhals `#314 <https://github.com/lvgig/tubular/issues/314>`_
- Converted test dfs to flexible pandas/polars setup
- Converted BaseNominalTransformer to utilise narwhals `#334 <https://github.com/lvgig/tubular/issues/334>`_
- Narwhalified CheckNumericMixin `#336 <https://github.com/lvgig/tubular/issues/336>`_
- placeholder
- placeholder
- placeholder
- placeholder
- placeholder
Expand Down
64 changes: 64 additions & 0 deletions tests/mixins/test_CheckNumericMixin.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
import re

import pytest

from tests.test_data import (
create_bool_and_float_df,
create_df_2,
create_df_with_none_and_nan_cols,
create_is_between_dates_df_1,
)
from tubular.mixins import CheckNumericMixin


class TestCheckNumericMixin:
    """Tests for the CheckNumericMixin class."""

    @pytest.mark.parametrize("library", ["pandas", "polars"])
    @pytest.mark.parametrize(
        ("df_generator", "bad_cols"),
        [
            (create_df_2, ["b", "c"]),  # str
            (create_is_between_dates_df_1, ["a"]),  # datetime
            (create_bool_and_float_df, ["b"]),  # bool
            (create_df_with_none_and_nan_cols, ["b"]),  # None
        ],
    )
    def test_check_numeric_columns_errors(self, library, df_generator, bad_cols):
        """Check check_numeric_columns raises a TypeError listing the non-numeric columns."""
        frame = df_generator(library=library)

        mixin = CheckNumericMixin()
        # the mixin is normally part of a transformer that supplies .columns,
        # so the attribute is assigned by hand on this standalone instance
        mixin.columns = bad_cols

        # escape the message because the rendered column list contains regex metacharacters
        expected_message = re.escape(
            f"CheckNumericMixin: The following columns are not numeric in X; {bad_cols}",
        )

        with pytest.raises(TypeError, match=expected_message):
            mixin.check_numeric_columns(frame)

    @pytest.mark.parametrize("library", ["pandas", "polars"])
    @pytest.mark.parametrize(
        ("df_generator", "cols"),
        [
            (create_df_2, ["a"]),  # int
            (create_bool_and_float_df, ["a"]),  # float
            (create_df_with_none_and_nan_cols, ["a"]),  # nan
        ],
    )
    def test_check_numeric_columns_passes(self, library, df_generator, cols):
        """Check check_numeric_columns completes without raising for numeric columns."""
        frame = df_generator(library=library)

        mixin = CheckNumericMixin()
        # the mixin is normally part of a transformer that supplies .columns,
        # so the attribute is assigned by hand on this standalone instance
        mixin.columns = cols

        # no assertion needed: the test passes as long as nothing is raised
        mixin.check_numeric_columns(frame)
99 changes: 92 additions & 7 deletions tests/numeric/test_BaseNumericTransformer.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import re

import pandas as pd
import pytest

import tests.test_data as d
Expand All @@ -19,18 +20,56 @@ class BaseNumericTransformerFitTests(GenericFitTests):
Note this deliberately avoids starting with "Tests" so that the tests are not run on import.
"""

def test_non_numeric_exception_raised(self, initialized_transformers):
@pytest.mark.parametrize(
("df_generator", "bad_cols"),
[
(d.create_df_2, ["b"]), # str
(d.create_is_between_dates_df_1, ["a"]), # datetime
(d.create_bool_and_float_df, ["b"]), # bool
(d.create_df_with_none_and_nan_cols, ["b"]), # None
],
)
def test_non_numeric_exception_raised(
self,
initialized_transformers,
df_generator,
bad_cols,
):
"""Test an exception is raised if self.columns are non-numeric in X."""
df = d.create_df_2()
df = df_generator()
# add in 'target column' for fit
df["c"] = [1] * len(df)

x = initialized_transformers[self.transformer_name]
x.columns = bad_cols

with pytest.raises(
TypeError,
match=rf"{self.transformer_name}: The following columns are not numeric in X; \['b'\]",
match=re.escape(
f"{self.transformer_name}: The following columns are not numeric in X; {bad_cols}",
),
):
x.fit(df, df["c"])

@pytest.mark.parametrize(
    ("df_generator", "cols"),
    [
        (d.create_df_2, ["a"]),  # int
        (d.create_bool_and_float_df, ["a"]),  # float
        (d.create_df_with_none_and_nan_cols, ["a"]),  # nan
    ],
)
def test_numeric_passes(self, initialized_transformers, df_generator, cols):
    """Test fit runs without raising when self.columns are numeric in X."""
    frame = df_generator()
    # column "c" is a constant column used as the target passed to fit
    frame["c"] = [1] * len(frame)

    transformer = initialized_transformers[self.transformer_name]
    transformer.columns = cols

    # no assertion needed: the test passes as long as fit does not raise
    transformer.fit(frame, frame["c"])


class BaseNumericTransformerTransformTests(
GenericTransformTests,
Expand All @@ -40,13 +79,34 @@ class BaseNumericTransformerTransformTests(
Note this deliberately avoids starting with "Tests" so that the tests are not run on import.
"""

def test_non_numeric_exception_raised(self, initialized_transformers):
@pytest.mark.parametrize(
("df_generator", "bad_cols"),
[
(d.create_df_2, ["b"]), # str
(d.create_is_between_dates_df_1, ["a"]), # datetime
(d.create_bool_and_float_df, ["b"]), # bool
(d.create_df_with_none_and_nan_cols, ["b"]), # None
],
)
def test_non_numeric_exception_raised(
self,
initialized_transformers,
df_generator,
bad_cols,
):
"""Test an exception is raised if self.columns are non-numeric in X."""
df = d.create_df_2()
# make df all non-numeric
df["a"] = df["b"]
df = df_generator()
# add in 'target column' for fit, which also doubles as an additional numeric column
df["c"] = [1] * len(df)

x = initialized_transformers[self.transformer_name]
x.columns = bad_cols

# if the transformer fits, run a working fit before transform
if x.FITS:
# create numeric df to fit on
numeric_df = pd.DataFrame({col: df["c"] for col in [*x.columns, "c"]})
x.fit(numeric_df, numeric_df["c"])

with pytest.raises(
TypeError,
Expand All @@ -56,6 +116,31 @@ def test_non_numeric_exception_raised(self, initialized_transformers):
):
x.transform(df)

@pytest.mark.parametrize(
    "df_generator",
    [
        d.create_df_2,  # int
        d.create_bool_and_float_df,  # float
        d.create_df_with_none_and_nan_cols,  # nan
    ],
)
def test_numeric_passes(self, initialized_transformers, df_generator):
    """Test transform runs without raising when self.columns are numeric in X."""
    frame = df_generator()
    # "c" is the target column for fit; "b" is set to constants so that
    # there is a second numeric column alongside "a"
    frame["c"] = [1] * len(frame)
    frame["b"] = [1] * len(frame)

    transformer = initialized_transformers[self.transformer_name]
    transformer.columns = ["a", "b"]

    # transformers that fit need a successful fit before transform can be called
    if transformer.FITS:
        # build an all-numeric frame by reusing the constant "c" column for every column
        fit_columns = [*transformer.columns, "c"]
        numeric_df = pd.DataFrame({name: frame["c"] for name in fit_columns})
        transformer.fit(numeric_df, numeric_df["c"])

    # no assertion needed: the test passes as long as transform does not raise
    transformer.transform(frame)


class TestInit(BaseNumericTransformerInitTests):
"""Tests for BaseNumericTransformer.init()"""
Expand Down
23 changes: 23 additions & 0 deletions tests/numeric/test_CutTransformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,29 @@ def expected_df_1():

return df

@pytest.mark.parametrize(
    "df_generator",
    [
        d.create_df_2,  # int
        d.create_bool_and_float_df,  # float
    ],
)
def test_numeric_passes(self, initialized_transformers, df_generator):
    """Test transform runs without raising when self.columns are numeric in X.

    This transformer does not work on an all-null column, so the inherited
    test is overloaded here without the all-null case.
    """
    frame = df_generator()
    # "c" is the target column for fit; "b" is set to constants so that
    # there is a second numeric column alongside "a"
    frame["c"] = [1] * len(frame)
    frame["b"] = [1] * len(frame)

    transformer = initialized_transformers[self.transformer_name]
    transformer.columns = ["a", "b"]

    # build an all-numeric frame by reusing the constant "c" column for every column
    fit_columns = [*transformer.columns, "c"]
    numeric_df = pd.DataFrame({name: frame["c"] for name in fit_columns})
    transformer.fit(numeric_df, numeric_df["c"])

    # no assertion needed: the test passes as long as transform does not raise
    transformer.transform(frame)

def test_output_from_cut_assigned_to_column(self, mocker):
"""Test that the output from pd.cut is assigned to column with name new_column_name."""
df = d.create_df_9()
Expand Down
19 changes: 4 additions & 15 deletions tests/numeric/test_TwoColumnOperatorTransformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,13 @@

import tests.test_data as d
from tests.base_tests import (
GenericTransformTests,
NewColumnNameInitMixintests,
OtherBaseBehaviourTests,
TwoColumnListInitTests,
)
from tests.numeric.test_BaseNumericTransformer import (
BaseNumericTransformerTransformTests,
)
from tubular.numeric import TwoColumnOperatorTransformer


Expand Down Expand Up @@ -40,7 +42,7 @@ def test_axis_not_valid_error(self):
)


class TestTransform(GenericTransformTests):
class TestTransform(BaseNumericTransformerTransformTests):
"""Tests for transformer.transform."""

@classmethod
Expand Down Expand Up @@ -74,19 +76,6 @@ def test_expected_output(self, pd_method_name, output):
msg_tag="TwoColumnMethod transformer does not produce the expected output",
)

def test_non_numeric_error(self):
    """Test transform raises a TypeError when called on the create_df_8 frame."""
    transformer = TwoColumnOperatorTransformer("mul", ["a", "b"], "c")
    expected_message = "TwoColumnOperatorTransformer: input columns in X must contain only numeric values"

    with pytest.raises(TypeError, match=expected_message):
        transformer.transform(d.create_df_8())


class TestOtherBaseBehaviour(OtherBaseBehaviourTests):
"""
Expand Down
23 changes: 23 additions & 0 deletions tests/test_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import datetime

import narwhals as nw
import numpy as np
import pandas as pd
import polars as pl

Expand Down Expand Up @@ -211,6 +212,28 @@ def create_df_11(library="pandas"):
return u.dataframe_init_dispatch(df_dict, library=library)


def create_bool_and_float_df(library="pandas"):
    """Create a small DataFrame with one float column and one boolean column.

    Column "a" holds floats ending in NaN and column "b" holds booleans
    ending in None. The data and `library` are handed to
    `u.dataframe_init_dispatch`, whose result is returned unchanged.
    """
    return u.dataframe_init_dispatch(
        {
            "a": [1.0, 2.0, np.nan],
            "b": [True, False, None],
        },
        library=library,
    )


def create_df_with_none_and_nan_cols(library="pandas"):
    """Create a small DataFrame with an all-NaN column and an all-None column.

    Column "a" holds three NaN values and column "b" holds three None
    values. The data and `library` are handed to
    `u.dataframe_init_dispatch`, whose result is returned unchanged.
    """
    return u.dataframe_init_dispatch(
        {
            "a": [np.nan, np.nan, np.nan],
            "b": [None, None, None],
        },
        library=library,
    )


def create_weighted_imputers_test_df(library="pandas"):
"""Create DataFrame to use imputer tests that correct values are imputed for weighted dataframes.
Expand Down
Loading

0 comments on commit 10ced7a

Please sign in to comment.