From b73683fc5eb7bff9075d2038ef3833a769815e12 Mon Sep 17 00:00:00 2001 From: limlam96 <103185696+limlam96@users.noreply.github.com> Date: Tue, 17 Dec 2024 16:19:39 +0000 Subject: [PATCH 01/10] started narwhalifying basecappingtransformer --- tubular/capping.py | 183 ++++++++++++++++++++++----------------------- 1 file changed, 91 insertions(+), 92 deletions(-) diff --git a/tubular/capping.py b/tubular/capping.py index d2d2e9d6..88cbe313 100644 --- a/tubular/capping.py +++ b/tubular/capping.py @@ -4,16 +4,22 @@ import copy import warnings +from typing import TYPE_CHECKING +import narwhals as nw import numpy as np import pandas as pd -from tubular.base import BaseTransformer from tubular.mixins import WeightColumnMixin +from tubular.numeric import BaseNumericTransformer +if TYPE_CHECKING: + import pandas as pd + from narwhals.typing import FrameT -class BaseCappingTransformer(BaseTransformer, WeightColumnMixin): - polars_compatible = False + +class BaseCappingTransformer(BaseNumericTransformer, WeightColumnMixin): + polars_compatible = True def __init__( self, @@ -159,7 +165,8 @@ def check_capping_values_dict( msg = f"{self.classname()}: both values are None for key {k}" raise ValueError(msg) - def fit(self, X: pd.DataFrame, y: None = None) -> BaseCappingTransformer: + @nw.narwhalify + def fit(self, X: FrameT, y: None = None) -> BaseCappingTransformer: """Learn capping values from input data X. Calculates the quantiles to cap at given the quantiles dictionary supplied @@ -168,7 +175,7 @@ def fit(self, X: pd.DataFrame, y: None = None) -> BaseCappingTransformer: Parameters ---------- - X : pd.DataFrame + X : pd/pl.DataFrame A dataframe with required columns to be capped. y : None @@ -182,21 +189,29 @@ def fit(self, X: pd.DataFrame, y: None = None) -> BaseCappingTransformer: self.quantile_capping_values = {} + native_namespace = nw.get_native_namespace(X) + if self.quantiles is not None: for col in self.columns: if self.weights_column is None: - cap_values = self.prepare_quantiles( - X[col], - self.quantiles[col], - self.weights_column, + weights_column = "dummy_weights_column" + X = X.with_columns( + nw.new_series( + name="dummy_weights_column", + values=[1] * len(X), + native_namespace=native_namespace, + ), ) else: - cap_values = self.prepare_quantiles( - X[col], - self.quantiles[col], - X[self.weights_column], - ) + weights_column = self.weights_column + + cap_values = self.prepare_quantiles( + X, + self.quantiles[col], + values_column=col, + weights_column=weights_column, + ) self.quantile_capping_values[col] = cap_values @@ -208,11 +223,13 @@ def fit(self, X: pd.DataFrame, y: None = None) -> BaseCappingTransformer: return self + @nw.narwhalify def prepare_quantiles( self, - values: pd.Series | np.array, + X: FrameT, quantiles: list[float], - sample_weight: pd.Series | np.array | None = None, + values_column: str, + weights_column: str, ) -> list[int | float]: """Method to call the weighted_quantile method and prepare the outputs. @@ -223,15 +240,17 @@ def prepare_quantiles( Parameters ---------- - values : pd.Series or np.array - A dataframe column with values to calculate quantiles from. + X : FrameT + Dataframe with relevant columns to calculate quantiles from. quantiles : None Weighted quantiles to calculate. Must all be between 0 and 1. - sample_weight : pd.Series or np.array or None, default = None - Sample weights for each item in values, must be the same lenght as values. If - not supplied then unit weights will be used. + values_col: str + name of relevant values column in data + + weights_column: str + name of relevant weight column in data Returns ------- @@ -242,27 +261,44 @@ def prepare_quantiles( if quantiles[0] is None: quantiles = np.array([quantiles[1]]) - results_no_none = self.weighted_quantile(values, quantiles, sample_weight) + results_no_none = self.weighted_quantile( + X, + quantiles, + values_column=values_column, + weights_column=weights_column, + ) results = [None] + results_no_none elif quantiles[1] is None: quantiles = np.array([quantiles[0]]) - results_no_none = self.weighted_quantile(values, quantiles, sample_weight) + results_no_none = self.weighted_quantile( + X, + quantiles, + values_column=values_column, + weights_column=weights_column, + ) results = results_no_none + [None] else: - results = self.weighted_quantile(values, quantiles, sample_weight) + results = self.weighted_quantile( + X, + quantiles, + values_column=values_column, + weights_column=weights_column, + ) return results + @nw.narwhalify def weighted_quantile( self, - values: pd.Series | np.array, + X: FrameT, quantiles: list[float], - sample_weight: pd.Series | np.array | None = None, + values_column: str, + weights_column: str, ) -> list[int | float]: """Method to calculate weighted quantiles. @@ -280,15 +316,17 @@ def weighted_quantile( Parameters ---------- - values : pd.Series or np.array - A dataframe column with values to calculate quantiles from. + X : FrameT + Dataframe with relevant columns to calculate quantiles from. quantiles : None Weighted quantiles to calculate. Must all be between 0 and 1. - sample_weight : pd.Series or np.array or None, default = None - Sample weights for each item in values, must be the same lenght as values. If - not supplied then unit weights will be used. + values_col: str + name of relevant values column in data + + weights_column: str + name of relevant weight column in data Returns ------- @@ -320,48 +358,28 @@ def weighted_quantile( [1.0, 2.0, 5.0] """ - if sample_weight is None: - sample_weight = np.ones(len(values)) - else: - sample_weight = np.array(sample_weight) - - if np.isnan(sample_weight).sum() > 0: - msg = f"{self.classname()}: sample weights values cannot be null" - raise ValueError(msg) - - if np.isinf(sample_weight).sum() > 0: - msg = f"{self.classname()}: sample weights values cannot be inf" - raise ValueError(msg) - - if (sample_weight < 0).sum() > 0: - msg = f"{self.classname()}: sample weights values cannot be negative" - raise ValueError(msg) - - if sample_weight.sum() <= 0: - msg = f"{self.classname()}: total sample weights are not greater than 0" - raise ValueError(msg) - - values = np.array(values) quantiles = np.array(quantiles) - nan_filter = ~np.isnan(values) - values = values[nan_filter] - sample_weight = sample_weight[nan_filter] + nan_filter = X.select(~nw.col(values_column).is_null()) + X = X.filter(nan_filter) - zero_weight_filter = ~(sample_weight == 0) - values = values[zero_weight_filter] - sample_weight = sample_weight[zero_weight_filter] + zero_weight_filter = X.select(~nw.col(weights_column) == 0) + X.filter(zero_weight_filter) - sorter = np.argsort(values, kind="stable") - values = values[sorter] - sample_weight = sample_weight[sorter] + X = X.sort(by=values_column, descending=False) - weighted_quantiles = np.cumsum(sample_weight) - weighted_quantiles = weighted_quantiles / np.sum(sample_weight) + weighted_quantiles = X.select( + (nw.col(weights_column).cum_sum()) / (nw.col(weights_column).sum()), + ) + # TODO - once narwhals implements interpolate, replace this with nw + # syntax + weighted_quantiles = weighted_quantiles.to_numpy() + values = X.get_column(values_column).to_numpy() return list(np.interp(quantiles, weighted_quantiles, values)) - def transform(self, X: pd.DataFrame) -> pd.DataFrame: + @nw.narwhalify + def transform(self, X: FrameT) -> FrameT: """Apply capping to columns in X. If cap_value_max is set, any values above cap_value_max will be set to cap_value_max. If cap_value_min @@ -369,12 +387,12 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame: Parameters ---------- - X : pd.DataFrame + X : pd/pl.DataFrame Data to apply capping to. Returns ------- - X : pd.DataFrame + X : pd/pl.DataFrame Transformed input X with min and max capping applied to the specified columns. """ @@ -388,13 +406,9 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame: if self.quantiles: self.check_is_fitted(["quantile_capping_values"]) - capping_values_for_transform = self.quantile_capping_values - dict_attrs = dict_attrs + ["quantile_capping_values"] else: - capping_values_for_transform = self.capping_values - dict_attrs = dict_attrs + ["capping_values"] for attr_name in dict_attrs: @@ -402,31 +416,16 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame: msg = f"{self.classname()}: {attr_name} attribute is an empty dict - perhaps the fit method has not been run yet" raise ValueError(msg) - numeric_column_types = X[self.columns].apply( - pd.api.types.is_numeric_dtype, - axis=0, - ) - - if not numeric_column_types.all(): - non_numeric_columns = list( - numeric_column_types.loc[~numeric_column_types].index, - ) - - msg = f"{self.classname()}: The following columns are not numeric in X; {non_numeric_columns}" - raise TypeError(msg) - for col in self.columns: - cap_value_min = capping_values_for_transform[col][0] - cap_value_max = capping_values_for_transform[col][1] - replacement_min = self._replacement_values[col][0] replacement_max = self._replacement_values[col][1] - if cap_value_min is not None: - X.loc[X[col] < cap_value_min, col] = replacement_min - - if cap_value_max is not None: - X.loc[X[col] > cap_value_max, col] = replacement_max + X = X.with_columns( + nw.col(col).clip( + lower_bound=replacement_min, + upper_bound=replacement_max, + ), + ) return X From ad5461e6e1f6b31f7f82d81f46d43bb2573cd3b3 Mon Sep 17 00:00:00 2001 From: limlam96 <103185696+limlam96@users.noreply.github.com> Date: Tue, 17 Dec 2024 16:31:05 +0000 Subject: [PATCH 02/10] narwhalified BaseNumericTransformer --- CHANGELOG.rst | 2 +- tests/numeric/test_BaseNumericTransformer.py | 86 ++++++++++++++++---- tubular/numeric.py | 22 +++-- 3 files changed, 88 insertions(+), 22 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 71e267c0..b7824ab1 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -27,7 +27,7 @@ Changed - fixed issues with all null and nullable-bool column handling in dataframe_init_dispatch - added NaN error handling to WeightColumnMixin - narwhalified MeanImputer `#344 https://github.com/lvgig/tubular/issues/344_` -- placeholder +- narwhalified BaseNumericTransformer `#358 https://github.com/lvgig/tubular/issues/358` - placeholder - placeholder diff --git a/tests/numeric/test_BaseNumericTransformer.py b/tests/numeric/test_BaseNumericTransformer.py index 72f2c3f9..da58c01f 100644 --- a/tests/numeric/test_BaseNumericTransformer.py +++ b/tests/numeric/test_BaseNumericTransformer.py @@ -1,10 +1,11 @@ import re -import pandas as pd +import narwhals as nw import pytest import tests.test_data as d from tests.base_tests import GenericFitTests, GenericInitTests, GenericTransformTests +from tests.utils import dataframe_init_dispatch class BaseNumericTransformerInitTests(GenericInitTests): @@ -20,6 +21,7 @@ class BaseNumericTransformerFitTests(GenericFitTests): Note this deliberately avoids starting with "Tests" so that the tests are not run on import. """ + @pytest.mark.parametrize("library", ["pandas", "polars"]) @pytest.mark.parametrize( ("df_generator", "bad_cols"), [ @@ -34,11 +36,21 @@ def test_non_numeric_exception_raised( initialized_transformers, df_generator, bad_cols, + library, ): """Test an exception is raised if self.columns are non-numeric in X.""" - df = df_generator() + df = df_generator(library=library) + # add in 'target column' for fit - df["c"] = [1] * len(df) + df = nw.from_native(df) + native_namespace = nw.get_native_namespace(df) + df = df.with_columns( + nw.new_series( + name="c", + values=[1] * len(df), + native_namespace=native_namespace, + ), + ).to_native() x = initialized_transformers[self.transformer_name] x.columns = bad_cols @@ -51,6 +63,7 @@ def test_non_numeric_exception_raised( ): x.fit(df, df["c"]) + @pytest.mark.parametrize("library", ["pandas", "polars"]) @pytest.mark.parametrize( ("df_generator", "cols"), [ @@ -59,11 +72,25 @@ def test_non_numeric_exception_raised( (d.create_df_with_none_and_nan_cols, ["a"]), # nan ], ) - def test_numeric_passes(self, initialized_transformers, df_generator, cols): + def test_numeric_passes( + self, + initialized_transformers, + df_generator, + cols, + library, + ): """Test check passes if self.columns numeric in X.""" - df = df_generator() + df = df_generator(library=library) # add in 'target column' for fit - df["c"] = [1] * len(df) + df = nw.from_native(df) + native_namespace = nw.get_native_namespace(df) + df = df.with_columns( + nw.new_series( + name="c", + values=[1] * len(df), + native_namespace=native_namespace, + ), + ).to_native() x = initialized_transformers[self.transformer_name] x.columns = cols @@ -79,6 +106,7 @@ class BaseNumericTransformerTransformTests( Note this deliberately avoids starting with "Tests" so that the tests are not run on import. """ + @pytest.mark.parametrize("library", ["pandas", "polars"]) @pytest.mark.parametrize( ("df_generator", "bad_cols"), [ @@ -93,11 +121,20 @@ def test_non_numeric_exception_raised( initialized_transformers, df_generator, bad_cols, + library, ): """Test an exception is raised if self.columns are non-numeric in X.""" - df = df_generator() + df = df_generator(library=library) # add in 'target column' for and additional numeric column fit - df["c"] = [1] * len(df) + df = nw.from_native(df) + native_namespace = nw.get_native_namespace(df) + df = df.with_columns( + nw.new_series( + name="c", + values=[1] * len(df), + native_namespace=native_namespace, + ), + ).to_native() x = initialized_transformers[self.transformer_name] x.columns = bad_cols @@ -105,7 +142,11 @@ def test_non_numeric_exception_raised( # if the transformer fits, run a working fit before transform if x.FITS: # create numeric df to fit on - numeric_df = pd.DataFrame({col: df["c"] for col in [*x.columns, "c"]}) + df_dict = {col: df["c"] for col in [*x.columns, "c"]} + numeric_df = dataframe_init_dispatch( + dataframe_dict=df_dict, + library=library, + ) x.fit(numeric_df, numeric_df["c"]) with pytest.raises( @@ -116,6 +157,7 @@ def test_non_numeric_exception_raised( ): x.transform(df) + @pytest.mark.parametrize("library", ["pandas", "polars"]) @pytest.mark.parametrize( ("df_generator"), [ @@ -124,19 +166,35 @@ def test_non_numeric_exception_raised( d.create_df_with_none_and_nan_cols, # nan ], ) - def test_numeric_passes(self, initialized_transformers, df_generator): + def test_numeric_passes(self, initialized_transformers, df_generator, library): """Test check passes if self.columns numeric in X.""" - df = df_generator() + df = df_generator(library=library) # add in 'target column' for and additional numeric column fit - df["c"] = [1] * len(df) - df["b"] = [1] * len(df) + df = nw.from_native(df) + native_namespace = nw.get_native_namespace(df) + df = df.with_columns( + nw.new_series( + name="c", + values=[1] * len(df), + native_namespace=native_namespace, + ), + nw.new_series( + name="b", + values=[1] * len(df), + native_namespace=native_namespace, + ), + ).to_native() x = initialized_transformers[self.transformer_name] x.columns = ["a", "b"] if x.FITS: # create numeric df to fit on - numeric_df = pd.DataFrame({col: df["c"] for col in [*x.columns, "c"]}) + df_dict = {col: df["c"] for col in [*x.columns, "c"]} + numeric_df = dataframe_init_dispatch( + dataframe_dict=df_dict, + library=library, + ) x.fit(numeric_df, numeric_df["c"]) x.transform(df) diff --git a/tubular/numeric.py b/tubular/numeric.py index e93f77f9..d8c34cb9 100644 --- a/tubular/numeric.py +++ b/tubular/numeric.py @@ -2,6 +2,9 @@ from __future__ import annotations +from typing import TYPE_CHECKING + +import narwhals as nw import numpy as np import pandas as pd from sklearn.decomposition import PCA @@ -20,6 +23,9 @@ TwoColumnMixin, ) +if TYPE_CHECKING: + from narwhals.typing import FrameT + class BaseNumericTransformer(BaseTransformer, CheckNumericMixin): """ @@ -45,23 +51,24 @@ class attribute, indicates whether transform requires fit to be run first """ - polars_compatible = False + polars_compatible = True FITS = False def __init__(self, columns: list[str], **kwargs: dict[str, bool]) -> None: super().__init__(columns=columns, **kwargs) + @nw.narwhalify def fit( self, - X: pd.DataFrame, - y: pd.Series | None = None, + X: FrameT, + y: nw.Series | None = None, ) -> BaseNumericTransformer: """Base fit method. Validates data and attributes prior to the child objects fit logic. Parameters ---------- - X : pd.DataFrame + X : pd/pl.DataFrame A dataframe containing the required columns y : None @@ -75,17 +82,18 @@ def fit( return self - def transform(self, X: pd.DataFrame) -> pd.DataFrame: + @nw.narwhalify + def transform(self, X: FrameT) -> FrameT: """Base transform method. Validates data and attributes prior to the child objects tranform logic. Parameters ---------- - X : pd.DataFrame + X : pd/pl.DataFrame Data to transform. Returns ------- - X : pd.DataFrame + X : pd/pl.DataFrame Validated data """ From d879b18e8b04ec6c1946f3f02d73690e7a73707c Mon Sep 17 00:00:00 2001 From: limlam96 <103185696+limlam96@users.noreply.github.com> Date: Tue, 17 Dec 2024 17:06:01 +0000 Subject: [PATCH 03/10] continued work to narwhalify basecappingtransformer --- CHANGELOG.rst | 8 +- tests/capping/test_BaseCappingTransformer.py | 168 ++++++++++++++----- tubular/capping.py | 9 +- 3 files changed, 141 insertions(+), 44 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index b7824ab1..47ac3951 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -26,8 +26,12 @@ Changed - narwhalified ModeImputer `#321 _` - fixed issues with all null and nullable-bool column handling in dataframe_init_dispatch - added NaN error handling to WeightColumnMixin -- narwhalified MeanImputer `#344 https://github.com/lvgig/tubular/issues/344_` -- narwhalified BaseNumericTransformer `#358 https://github.com/lvgig/tubular/issues/358` +- narwhalified MeanImputer `#344 _` +- narwhalified BaseNumericTransformer `#358 _` +- narwhalified BaseCappingTransformer `#357 _` +- placeholder +- placeholder +- placeholder - placeholder - placeholder diff --git a/tests/capping/test_BaseCappingTransformer.py b/tests/capping/test_BaseCappingTransformer.py index 5156c387..137112df 100644 --- a/tests/capping/test_BaseCappingTransformer.py +++ b/tests/capping/test_BaseCappingTransformer.py @@ -1,9 +1,10 @@ import re +import narwhals as nw import numpy as np import pandas as pd +import polars as pl import pytest -import test_aide as ta import tests.test_data as d from tests.base_tests import ( @@ -13,6 +14,7 @@ WeightColumnFitMixinTests, WeightColumnInitMixinTests, ) +from tests.utils import assert_frame_equal_dispatch, dataframe_init_dispatch from tubular.capping import BaseCappingTransformer @@ -344,57 +346,71 @@ class GenericCappingTransformTests(GenericTransformTests): def setup_class(cls): cls.transformer_name = "BaseCappingTransformer" - def expected_df_2(): + def expected_df_2(library="pandas"): """Expected output from test_expected_output_max.""" - df = pd.DataFrame( - { - "a": [2, 2, 3, 4, 5, 6, 7, np.nan], - "b": ["a", "b", "c", "d", "e", "f", "g", np.nan], - "c": ["a", "b", "c", "d", "e", "f", "g", np.nan], - }, - ) - df["c"] = df["c"].astype("category") + df_dict = { + "a": [2, 2, 3, 4, 5, 6, 7, None], + "b": ["a", "b", "c", "d", "e", "f", "g", None], + "c": ["a", "b", "c", "d", "e", "f", "g", None], + } - return df + df = dataframe_init_dispatch(dataframe_dict=df_dict, library=library) - @pytest.mark.parametrize( - ("df", "expected"), - ta.pandas.adjusted_dataframe_params(d.create_df_4(), expected_df_2()), - ) + df = nw.from_native(df) + df = df.with_columns(nw.col("c").cast(nw.Categorical)) + + return df.to_native() + + @pytest.mark.parametrize("library", ["pandas", "polars"]) def test_non_cap_column_left_untouched( self, df, expected, initialized_transformers, + library, ): """Test that capping is applied only to specific columns, others remain the same.""" + df = d.create_df_4(library=library) + transformer = initialized_transformers[self.transformer_name] + # if transformer is not polars compatible, skip polars test + if not transformer.polars_compatible and isinstance(df, pl.DataFrame): + return + transformer.fit(df) df_transformed = transformer.transform(df) - non_capped_df = df_transformed.drop("a", axis=1) - non_capped_expected = expected.drop("a", axis=1) + expected = self.expected_df_2(library=library) - ta.equality.assert_frame_equal_msg( - actual=non_capped_df, - expected=non_capped_expected, - msg_tag=f"Unexpected values in {self.transformer_name}.transform, with columns meant to not be transformed", - ) + expected = nw.from_native(expected).drop("a").to_native() + df_transformed = nw.from_native(df_transformed).drop("a").to_native() + + assert_frame_equal_dispatch(df_transformed, expected) + @pytest.mark.parametrize("library", ["pandas", "polars"]) @pytest.mark.parametrize( "fit_value", ["_replacement_values", "capping_values"], ) - def test_learnt_values_not_modified(self, fit_value, initialized_transformers): + def test_learnt_values_not_modified( + self, + fit_value, + initialized_transformers, + library, + ): """Test that the replacements from fit are not changed in transform.""" transformer = initialized_transformers[self.transformer_name] - df = d.create_df_3() + df = d.create_df_3(library=library) + + # if transformer is not polars compatible, skip polars test + if not transformer.polars_compatible and isinstance(df, pl.DataFrame): + return transformer.fit(df) @@ -402,16 +418,18 @@ def test_learnt_values_not_modified(self, fit_value, initialized_transformers): transformer.transform(df) - ta.classes.test_object_attributes( - obj=transformer, - expected_attributes={fit_value: learnt_values}, - msg=f"learnt attribute {fit_value} for {self.transformer_name} changed in transform", - ) + new_learnt_values = getattr(transformer, fit_value) + assert ( + learnt_values == new_learnt_values + ), f"learnt_value {fit_value} changed by transform, expected {learnt_values} but got {new_learnt_values}" + + @pytest.mark.parametrize("library", ["pandas", "polars"]) def test_non_numeric_column_error( self, minimal_attribute_dict, uninitialized_transformers, + library, ): """Test that transform will raise an error if a column to transform is not numeric.""" @@ -420,7 +438,11 @@ def test_non_numeric_column_error( transformer = uninitialized_transformers[self.transformer_name](**args) - df = d.create_df_5() + df = d.create_df_5(library=library) + + # if transformer is not polars compatible, skip polars test + if not transformer.polars_compatible and isinstance(df, pl.DataFrame): + return transformer.fit(df) @@ -430,13 +452,15 @@ def test_non_numeric_column_error( ): transformer.transform(df) + @pytest.mark.parametrize("library", ["pandas", "polars"]) def test_quantile_capping_values_not_fit_error( self, minimal_attribute_dict, uninitialized_transformers, + library, ): """Test that transform will raise an error if capping_values attr has not fit""" - df = d.create_df_9() + df = d.create_df_9(library=library) args = minimal_attribute_dict[self.transformer_name].copy() args["quantiles"] = {"a": [0.1, 0.2]} @@ -444,25 +468,36 @@ def test_quantile_capping_values_not_fit_error( transformer = uninitialized_transformers[self.transformer_name](**args) + # if transformer is not polars compatible, skip polars test + if not transformer.polars_compatible and isinstance(df, pl.DataFrame): + return + with pytest.raises( ValueError, match=f"This {self.transformer_name} instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator", ): transformer.transform(df) + @pytest.mark.parametrize("library", ["pandas", "polars"]) def test_quantile_capping_values_empty_error( self, minimal_attribute_dict, uninitialized_transformers, + library, ): """Test that transform will raise an error if quantile_capping_values is empty dict""" - df = d.create_df_9() + df = d.create_df_9(library=library) args = minimal_attribute_dict[self.transformer_name].copy() args["quantiles"] = {"a": [0.1, 0.2]} args["capping_values"] = None transformer = uninitialized_transformers[self.transformer_name](**args) + + # if transformer is not polars compatible, skip polars test + if not transformer.polars_compatible and isinstance(df, pl.DataFrame): + return + transformer.fit(df) transformer.quantile_capping_values = {} @@ -472,18 +507,25 @@ def test_quantile_capping_values_empty_error( ): transformer.transform(df) + @pytest.mark.parametrize("library", ["pandas", "polars"]) def test_capping_values_empty_error( self, minimal_attribute_dict, uninitialized_transformers, + library, ): """Test that transform will raise an error if capping_values is empty dict""" - df = d.create_df_9() + df = d.create_df_9(library=library) args = minimal_attribute_dict[self.transformer_name].copy() args["capping_values"] = {"a": [0.1, 0.2]} transformer = uninitialized_transformers[self.transformer_name](**args) + + # if transformer is not polars compatible, skip polars test + if not transformer.polars_compatible and isinstance(df, pl.DataFrame): + return + transformer.fit(df) transformer.capping_values = {} @@ -493,13 +535,15 @@ def test_capping_values_empty_error( ): transformer.transform(df) + @pytest.mark.parametrize("library", ["pandas", "polars"]) def test_replacement_values_not_fit_error( self, minimal_attribute_dict, uninitialized_transformers, + library, ): """Test that transform will raise an error if replacement values attr has not fit""" - df = d.create_df_9() + df = d.create_df_9(library=library) args = minimal_attribute_dict[self.transformer_name].copy() args["quantiles"] = {"a": [0.1, 0.2]} @@ -507,19 +551,25 @@ def test_replacement_values_not_fit_error( transformer = uninitialized_transformers[self.transformer_name](**args) + # if transformer is not polars compatible, skip polars test + if not transformer.polars_compatible and isinstance(df, pl.DataFrame): + return + with pytest.raises( ValueError, match=f"This {self.transformer_name} instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator", ): transformer.transform(df) + @pytest.mark.parametrize("library", ["pandas", "polars"]) def test_replacement_values_dict_empty_error( self, minimal_attribute_dict, uninitialized_transformers, + library, ): """Test that transform will raise an error if _replacement_values is an empty dict.""" - df = d.create_df_9() + df = d.create_df_9(library=library) args = minimal_attribute_dict[self.transformer_name].copy() args["quantiles"] = {"a": [0.1, 0.2]} @@ -527,6 +577,10 @@ def test_replacement_values_dict_empty_error( transformer = uninitialized_transformers[self.transformer_name](**args) + # if transformer is not polars compatible, skip polars test + if not transformer.polars_compatible and isinstance(df, pl.DataFrame): + return + # manually set attribute to get past the capping_values attribute is an empty dict exception transformer.quantile_capping_values = {"a": [1, 4]} transformer._replacement_values = {} @@ -537,13 +591,15 @@ def test_replacement_values_dict_empty_error( ): transformer.transform(df) + @pytest.mark.parametrize("library", ["pandas", "polars"]) def test_fixed_attributes_unchanged_from_transform( self, minimal_attribute_dict, uninitialized_transformers, + library, ): """Test that attributes are unchanged after transform is run.""" - df = d.create_df_9() + df = d.create_df_9(library=library) args = minimal_attribute_dict[self.transformer_name].copy() args["quantiles"] = {"a": [0.2, 1], "b": [0, 1]} @@ -551,6 +607,10 @@ def test_fixed_attributes_unchanged_from_transform( transformer = uninitialized_transformers[self.transformer_name](**args) + # if transformer is not polars compatible, skip polars test + if not transformer.polars_compatible and isinstance(df, pl.DataFrame): + return + transformer.fit(df) transformer2 = uninitialized_transformers[self.transformer_name](**args) @@ -567,9 +627,28 @@ def test_fixed_attributes_unchanged_from_transform( ), "quantiles attribute modified in transform" +class TestBaseCappingTransformerInit(GenericCappingInitTests): + @classmethod + def setup_class(cls): + cls.transformer_name = "BaseCappingTransformer" + + +class TestBaseCappingTransformerFit(GenericCappingFitTests): + @classmethod + def setup_class(cls): + cls.transformer_name = "BaseCappingTransformer" + + +class TestBaseCappingTransformerTransform(GenericCappingTransformTests): + @classmethod + def setup_class(cls): + cls.transformer_name = "BaseCappingTransformer" + + class TestWeightedQuantile: """Tests for the BaseCappingTransformer.weighted_quantile method.""" + @pytest.mark.parametrize("library", ["pandas", "polars"]) @pytest.mark.parametrize( ("values", "sample_weight", "quantiles", "expected_quantiles"), [ @@ -606,13 +685,26 @@ def test_expected_output( sample_weight, quantiles, expected_quantiles, + library, ): """Test that weighted_quantile gives the expected outputs.""" x = BaseCappingTransformer(capping_values={"a": [2, 10]}) - values = pd.Series(values) + values_col = "values" + weights_col = "weight" + df_dict = { + values_col: values, + weights_col: sample_weight, + } - actual = x.weighted_quantile(values, quantiles, sample_weight) + df = dataframe_init_dispatch(dataframe_dict=df_dict, library=library) + + actual = x.weighted_quantile( + df, + quantiles, + values_column=values_col, + weights_column=weights_col, + ) # round to 1dp to avoid mismatches due to numerical precision actual_rounded_1_dp = list(np.round(actual, 1)) diff --git a/tubular/capping.py b/tubular/capping.py index 88cbe313..c750f2c6 100644 --- a/tubular/capping.py +++ b/tubular/capping.py @@ -360,11 +360,11 @@ def weighted_quantile( """ quantiles = np.array(quantiles) - nan_filter = X.select(~nw.col(values_column).is_null()) + nan_filter = ~(nw.col(values_column).is_null()) X = X.filter(nan_filter) - zero_weight_filter = X.select(~nw.col(weights_column) == 0) - X.filter(zero_weight_filter) + zero_weight_filter = ~(nw.col(weights_column) == 0) + X = X.filter(zero_weight_filter) X = X.sort(by=values_column, descending=False) @@ -374,8 +374,9 @@ def weighted_quantile( # TODO - once narwhals implements interpolate, replace this with nw # syntax - weighted_quantiles = weighted_quantiles.to_numpy() + weighted_quantiles = weighted_quantiles.get_column(weights_column).to_numpy() values = X.get_column(values_column).to_numpy() + return list(np.interp(quantiles, weighted_quantiles, values)) @nw.narwhalify From ca6eab8d893fb9ef556f42e0ee620617103c4498 Mon Sep 17 00:00:00 2001 From: limlam96 <103185696+limlam96@users.noreply.github.com> Date: Wed, 18 Dec 2024 09:50:43 +0000 Subject: [PATCH 04/10] continued work on BaseCappingTransformer --- tests/capping/test_BaseCappingTransformer.py | 49 ++++++++++++++------ tubular/capping.py | 13 +++--- tubular/numeric.py | 4 +- 3 files changed, 44 insertions(+), 22 deletions(-) diff --git a/tests/capping/test_BaseCappingTransformer.py b/tests/capping/test_BaseCappingTransformer.py index 137112df..d372d0a3 100644 --- a/tests/capping/test_BaseCappingTransformer.py +++ b/tests/capping/test_BaseCappingTransformer.py @@ -2,7 +2,6 @@ import narwhals as nw import numpy as np -import pandas as pd import polars as pl import pytest @@ -238,10 +237,12 @@ class GenericCappingFitTests(WeightColumnFitMixinTests, GenericFitTests): def setup_class(cls): cls.transformer_name = "BaseCappingTransformer" + @pytest.mark.parametrize("library", ["pandas", "polars"]) def test_quantiles_none_error( self, minimal_attribute_dict, uninitialized_transformers, + library, ): """Test that a warning is raised if quantiles is None when fit is run.""" @@ -250,14 +251,19 @@ def test_quantiles_none_error( transformer = uninitialized_transformers[self.transformer_name](**args) + df = d.create_df_3(library=library) + + # if transformer is not polars compatible, skip polars test + if not transformer.polars_compatible and isinstance(df, pl.DataFrame): + return + with pytest.warns( UserWarning, match=f"{self.transformer_name}: quantiles not set so no fitting done", ): - df = d.create_df_3() - transformer.fit(df) + @pytest.mark.parametrize("library", ["pandas", "polars"]) @pytest.mark.parametrize( ("values", "sample_weight", "quantiles", "expected_quantiles"), # quantiles use linear interpolation, which is manually replicated here where needed @@ -299,6 +305,7 @@ def test_fit_values( expected_quantiles, minimal_attribute_dict, uninitialized_transformers, + library, ): """Test that weighted_quantile gives the expected outputs.""" @@ -309,15 +316,20 @@ def test_fit_values( transformer = uninitialized_transformers[self.transformer_name](**args) - if not sample_weight: - sample_weight = [1] * len(values) + df_dict = { + "a": values, + } + if sample_weight: + df_dict["w"] = sample_weight - df = pd.DataFrame( - { - "a": values, - "w": sample_weight, - }, - ) + else: + transformer.weights_column = None + + df = dataframe_init_dispatch(dataframe_dict=df_dict, library=library) + + # if transformer is not polars compatible, skip polars test + if not transformer.polars_compatible and isinstance(df, pl.DataFrame): + return transformer.fit(df) @@ -434,7 +446,7 @@ def test_non_numeric_column_error( """Test that transform will raise an error if a column to transform is not numeric.""" args = minimal_attribute_dict[self.transformer_name].copy() - args["capping_values"] = {"c": [1, 2]} + args["capping_values"] = {"a": [1, 2]} transformer = uninitialized_transformers[self.transformer_name](**args) @@ -446,9 +458,20 @@ def test_non_numeric_column_error( transformer.fit(df) + # convert column to non-numeric + df = nw.from_native(df) + native_namespace = nw.get_native_namespace(df) + df = df.with_columns( + nw.new_series( + name="a", + values=["a"] * len(df), + native_namespace=native_namespace, + ), + ) + with pytest.raises( TypeError, - match=rf"{self.transformer_name}: The following columns are not numeric in X; \['c'\]", + match=rf"{self.transformer_name}: The following columns are not numeric in X; \['a'\]", ): transformer.transform(df) diff --git a/tubular/capping.py b/tubular/capping.py index c750f2c6..8208435c 100644 --- a/tubular/capping.py +++ b/tubular/capping.py @@ -110,6 +110,9 @@ class attribute, indicates whether transformer has been converted to polars/pand self.capping_values = capping_values WeightColumnMixin.check_and_set_weight(self, weights_column) + if capping_values: + self._replacement_values = copy.deepcopy(self.capping_values) + def check_capping_values_dict( self, capping_values_dict: dict[str, list[int | float | None]], @@ -215,6 +218,8 @@ def fit(self, X: FrameT, y: None = None) -> BaseCappingTransformer: self.quantile_capping_values[col] = cap_values + self._replacement_values = copy.deepcopy(self.quantile_capping_values) + else: warnings.warn( f"{self.classname()}: quantiles not set so no fitting done in CappingTransformer", @@ -398,7 +403,7 @@ def transform(self, X: FrameT) -> FrameT: """ - X = super().transform(X) + X = nw.from_native(super().transform(X)) self.check_is_fitted(["_replacement_values"]) @@ -498,9 +503,6 @@ def __init__( ) -> None: super().__init__(capping_values, quantiles, weights_column, **kwargs) - if capping_values: - self._replacement_values = copy.deepcopy(self.capping_values) - def fit(self, X: pd.DataFrame, y: None = None) -> CappingTransformer: """Learn capping values from input data X. @@ -519,9 +521,6 @@ def fit(self, X: pd.DataFrame, y: None = None) -> CappingTransformer: """ super().fit(X, y) - if self.quantiles is not None: - self._replacement_values = copy.deepcopy(self.quantile_capping_values) - return self diff --git a/tubular/numeric.py b/tubular/numeric.py index d8c34cb9..5ab8013e 100644 --- a/tubular/numeric.py +++ b/tubular/numeric.py @@ -78,7 +78,7 @@ def fit( super().fit(X, y) - CheckNumericMixin.check_numeric_columns(self, X) + CheckNumericMixin.check_numeric_columns(self, X[self.columns]) return self @@ -100,7 +100,7 @@ def transform(self, X: FrameT) -> FrameT: X = super().transform(X) - CheckNumericMixin.check_numeric_columns(self, X) + CheckNumericMixin.check_numeric_columns(self, X[self.columns]) return X From e813b97d80e8f11793e6488c6957a15a5d9db655 Mon Sep 17 00:00:00 2001 From: limlam96 <103185696+limlam96@users.noreply.github.com> Date: Wed, 18 Dec 2024 10:18:35 +0000 Subject: [PATCH 05/10] continued work on BaseCappingTransformer --- tests/capping/test_BaseCappingTransformer.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/capping/test_BaseCappingTransformer.py b/tests/capping/test_BaseCappingTransformer.py index d372d0a3..56ec28e5 100644 --- a/tests/capping/test_BaseCappingTransformer.py +++ b/tests/capping/test_BaseCappingTransformer.py @@ -358,7 +358,7 @@ class GenericCappingTransformTests(GenericTransformTests): def setup_class(cls): cls.transformer_name = "BaseCappingTransformer" - def expected_df_2(library="pandas"): + def expected_df_2(self, library="pandas"): """Expected output from test_expected_output_max.""" df_dict = { @@ -377,8 +377,6 @@ def expected_df_2(library="pandas"): @pytest.mark.parametrize("library", ["pandas", "polars"]) def test_non_cap_column_left_untouched( self, - df, - expected, initialized_transformers, library, ): From 4687e38c690daf322b45f34cb7ae6e322e464be9 Mon Sep 17 00:00:00 2001 From: limlam96 <103185696+limlam96@users.noreply.github.com> Date: Wed, 18 Dec 2024 10:21:38 +0000 Subject: [PATCH 06/10] made polars tests optional for BaseNumericTransformer --- tests/numeric/test_BaseNumericTransformer.py | 44 ++++++++++++++------ 1 file changed, 32 insertions(+), 12 deletions(-) diff --git a/tests/numeric/test_BaseNumericTransformer.py b/tests/numeric/test_BaseNumericTransformer.py index da58c01f..7a3d48c1 100644 --- a/tests/numeric/test_BaseNumericTransformer.py +++ b/tests/numeric/test_BaseNumericTransformer.py @@ -1,6 +1,7 @@ import re import narwhals as nw +import polars as pl import pytest import tests.test_data as d @@ -41,6 +42,13 @@ def test_non_numeric_exception_raised( """Test an exception is raised if self.columns are non-numeric in X.""" df = df_generator(library=library) + x = initialized_transformers[self.transformer_name] + x.columns = bad_cols + + # if transformer is not polars compatible, skip polars test + if not x.polars_compatible and isinstance(df, pl.DataFrame): + return + # add in 'target column' for fit df = nw.from_native(df) native_namespace = nw.get_native_namespace(df) @@ -52,9 +60,6 @@ def test_non_numeric_exception_raised( ), ).to_native() - x = initialized_transformers[self.transformer_name] - x.columns = bad_cols - with pytest.raises( TypeError, match=re.escape( @@ -81,6 +86,14 @@ def test_numeric_passes( ): """Test check passes if self.columns numeric in X.""" df = df_generator(library=library) + + x = initialized_transformers[self.transformer_name] + x.columns = cols + + # if transformer is not polars compatible, skip polars test + if not x.polars_compatible and isinstance(df, pl.DataFrame): + return + # add in 'target column' for fit df = nw.from_native(df) native_namespace = nw.get_native_namespace(df) @@ -92,9 +105,6 @@ def test_numeric_passes( ), ).to_native() - x = initialized_transformers[self.transformer_name] - x.columns = cols - x.fit(df, df["c"]) @@ -125,6 +135,14 @@ def test_non_numeric_exception_raised( ): """Test an exception is raised if self.columns are non-numeric in X.""" df = df_generator(library=library) + + x = initialized_transformers[self.transformer_name] + x.columns = bad_cols + + # if transformer is not polars compatible, skip polars test + if not x.polars_compatible and isinstance(df, pl.DataFrame): + return + # add in 'target column' for and additional numeric column fit df = nw.from_native(df) native_namespace = nw.get_native_namespace(df) @@ -136,9 +154,6 @@ def test_non_numeric_exception_raised( ), ).to_native() - x = initialized_transformers[self.transformer_name] - x.columns = bad_cols - # if the transformer fits, run a working fit before transform if x.FITS: # create numeric df to fit on @@ -169,6 +184,14 @@ def test_non_numeric_exception_raised( def test_numeric_passes(self, initialized_transformers, df_generator, library): """Test check passes if self.columns numeric in X.""" df = df_generator(library=library) + + x = initialized_transformers[self.transformer_name] + x.columns = ["a", "b"] + + # if transformer is not polars compatible, skip polars test + if not x.polars_compatible and isinstance(df, pl.DataFrame): + return + # add in 'target column' for and additional numeric column fit df = nw.from_native(df) native_namespace = nw.get_native_namespace(df) @@ -185,9 +208,6 @@ def test_numeric_passes(self, initialized_transformers, df_generator, library): ), ).to_native() - x = initialized_transformers[self.transformer_name] - x.columns = ["a", "b"] - if x.FITS: # create numeric df to fit on df_dict = {col: df["c"] for col in [*x.columns, "c"]} From 0dcad9a2a7f6ea78b314c2d0d60fb4931363731e Mon Sep 17 00:00:00 2001 From: limlam96 <103185696+limlam96@users.noreply.github.com> Date: Wed, 18 Dec 2024 11:41:31 +0000 Subject: [PATCH 07/10] converted BaseCappingTransformer to narwhals --- tests/capping/test_BaseCappingTransformer.py | 38 ++++++++++++-- tests/capping/test_CappingTransformer.py | 37 +++++++------- tubular/capping.py | 53 +++++++++++++++++--- 3 files changed, 100 insertions(+), 28 deletions(-) diff --git a/tests/capping/test_BaseCappingTransformer.py b/tests/capping/test_BaseCappingTransformer.py index 56ec28e5..ad5fdf17 100644 --- a/tests/capping/test_BaseCappingTransformer.py +++ b/tests/capping/test_BaseCappingTransformer.py @@ -384,6 +384,8 @@ def test_non_cap_column_left_untouched( df = d.create_df_4(library=library) + expected = self.expected_df_2(library=library) + transformer = initialized_transformers[self.transformer_name] # if transformer is not polars compatible, skip polars test @@ -394,12 +396,31 @@ def test_non_cap_column_left_untouched( df_transformed = transformer.transform(df) - expected = self.expected_df_2(library=library) + # exclude transformed columns for this test + # outcomes on transformed columns are currently tested in the child classes + # CappingTransformer and OutOfRangeNullTransformer + # TODO - open question as to whether we want to try moving some of those tests into + # this file + columns_to_test = [ + col for col in df_transformed.columns if col not in transformer.columns + ] + + assert_frame_equal_dispatch( + df_transformed[columns_to_test], + expected[columns_to_test], + ) - expected = nw.from_native(expected).drop("a").to_native() - df_transformed = nw.from_native(df_transformed).drop("a").to_native() + # Check outcomes for single rows + df = nw.from_native(df) + expected = nw.from_native(expected) + for i in range(len(df)): + df_transformed_row = transformer.transform(df[[i]].to_native()) + df_expected_row = expected[[i]].to_native() - assert_frame_equal_dispatch(df_transformed, expected) + assert_frame_equal_dispatch( + df_transformed_row[columns_to_test], + df_expected_row[columns_to_test], + ) @pytest.mark.parametrize("library", ["pandas", "polars"]) @pytest.mark.parametrize( @@ -434,6 +455,15 @@ def test_learnt_values_not_modified( learnt_values == new_learnt_values ), f"learnt_value {fit_value} changed by transform, expected {learnt_values} but got {new_learnt_values}" + # Check outcomes for single rows + df = nw.from_native(df) + for i in range(len(df)): + transformer.transform(df[[i]].to_native()) + + assert ( + learnt_values == new_learnt_values + ), f"learnt_value {fit_value} changed by transform, expected {learnt_values} but got {new_learnt_values}" + @pytest.mark.parametrize("library", ["pandas", "polars"]) def test_non_numeric_column_error( self, diff --git a/tests/capping/test_CappingTransformer.py b/tests/capping/test_CappingTransformer.py index c087046b..9293c6df 100644 --- a/tests/capping/test_CappingTransformer.py +++ b/tests/capping/test_CappingTransformer.py @@ -1,7 +1,5 @@ -import numpy as np import pandas as pd import pytest -import test_aide as ta import tests.test_data as d from tests.base_tests import OtherBaseBehaviourTests @@ -10,6 +8,7 @@ GenericCappingInitTests, GenericCappingTransformTests, ) +from tests.utils import assert_frame_equal_dispatch class TestInit(GenericCappingInitTests): @@ -79,29 +78,27 @@ class TestTransform(GenericCappingTransformTests): def setup_class(cls): cls.transformer_name = "CappingTransformer" - def expected_df_1(): + def expected_df_1(self): """Expected output from test_expected_output_min_and_max.""" return pd.DataFrame( { - "a": [2, 2, 3, 4, 5, 5, np.nan], - "b": [1, 2, 3, np.nan, 7, 7, 7], - "c": [np.nan, 1, 2, 3, 0, 0, 0], + "a": [2, 2, 3, 4, 5, 5, None], + "b": [1, 2, 3, None, 7, 7, 7], + "c": [None, 1, 2, 3, 0, 0, 0], }, ) - @pytest.mark.parametrize( - ("df", "expected"), - ta.pandas.adjusted_dataframe_params(d.create_df_3(), expected_df_1()), - ) def test_expected_output_min_and_max_combinations( self, - df, - expected, minimal_attribute_dict, uninitialized_transformers, ): """Test that capping is applied correctly in transform.""" + df = d.create_df_3() + print(df) + expected = self.expected_df_1() + args = minimal_attribute_dict[self.transformer_name].copy() args["capping_values"] = {"a": [2, 5], "b": [None, 7], "c": [0, None]} @@ -109,11 +106,17 @@ def test_expected_output_min_and_max_combinations( df_transformed = transformer.transform(df) - ta.equality.assert_frame_equal_msg( - actual=df_transformed, - expected=expected, - msg_tag=f"Unexpected values in {self.transformer_name}.transform", - ) + assert_frame_equal_dispatch(df_transformed, expected) + + # Check outcomes for single rows + for i in range(len(df)): + df_transformed_row = transformer.transform(df.iloc[[i]]) + df_expected_row = expected.iloc[[i]] + + assert_frame_equal_dispatch( + df_transformed_row, + df_expected_row, + ) class TestOtherBaseBehaviour(OtherBaseBehaviourTests): diff --git a/tubular/capping.py b/tubular/capping.py index 8208435c..a4c7b4c1 100644 --- a/tubular/capping.py +++ b/tubular/capping.py @@ -412,9 +412,13 @@ def transform(self, X: FrameT) -> FrameT: if self.quantiles: self.check_is_fitted(["quantile_capping_values"]) + capping_values_for_transform = self.quantile_capping_values + dict_attrs = dict_attrs + ["quantile_capping_values"] else: + capping_values_for_transform = self.capping_values + dict_attrs = dict_attrs + ["capping_values"] for attr_name in dict_attrs: @@ -423,15 +427,37 @@ def transform(self, X: FrameT) -> FrameT: raise ValueError(msg) for col in self.columns: + cap_value_min = capping_values_for_transform[col][0] + cap_value_max = capping_values_for_transform[col][1] + replacement_min = self._replacement_values[col][0] replacement_max = self._replacement_values[col][1] - X = X.with_columns( - nw.col(col).clip( - lower_bound=replacement_min, - upper_bound=replacement_max, - ), - ) + for cap_value, replacement_value, condition in zip( + [cap_value_min, cap_value_max], + [replacement_min, replacement_max], + [nw.col(col) < cap_value_min, nw.col(col) > cap_value_max], + ): + if cap_value is not None: + X = X.with_columns( + nw.when( + condition, + ) + .then( + replacement_value, + ) + .otherwise( + nw.col(col), + ) + # make sure type is preserved for single row, + # e.g. mapping single row to int could convert + # from float to int + # TODO - look into better ways to achieve this + .cast( + X.get_column(col).dtype, + ) + .alias(col), + ) return X @@ -638,11 +664,24 @@ def fit(self, X: pd.DataFrame, y: None = None) -> OutOfRangeNullTransformer: Required for pipeline. """ - super().fit(X=X, y=y) + # be careful to only run BaseCappingTransformer fit for quantiles case + # or we will overwrite our _replacement_values from init + BaseNumericTransformer.fit(self, X=X, y=y) + + if self.weights_column: + WeightColumnMixin.check_weights_column(self, X, self.weights_column) if self.quantiles: + BaseCappingTransformer.fit(self, X=X, y=y) + self._replacement_values = OutOfRangeNullTransformer.set_replacement_values( self.quantile_capping_values, ) + else: + warnings.warn( + f"{self.classname()}: quantiles not set so no fitting done in OutOfRangeNullTransformer", + stacklevel=2, + ) + return self From a2e8d2d07004c9c56eb9cb5294e0b4ba75705cbe Mon Sep 17 00:00:00 2001 From: limlam96 <103185696+limlam96@users.noreply.github.com> Date: Mon, 6 Jan 2025 13:34:18 +0000 Subject: [PATCH 08/10] reverted capping transformer tests changes --- tests/capping/test_CappingTransformer.py | 37 +++++++++++------------- 1 file changed, 17 insertions(+), 20 deletions(-) diff --git a/tests/capping/test_CappingTransformer.py b/tests/capping/test_CappingTransformer.py index 9293c6df..c087046b 100644 --- a/tests/capping/test_CappingTransformer.py +++ b/tests/capping/test_CappingTransformer.py @@ -1,5 +1,7 @@ +import numpy as np import pandas as pd import pytest +import test_aide as ta import tests.test_data as d from tests.base_tests import OtherBaseBehaviourTests @@ -8,7 +10,6 @@ GenericCappingInitTests, GenericCappingTransformTests, ) -from tests.utils import assert_frame_equal_dispatch class TestInit(GenericCappingInitTests): @@ -78,27 +79,29 @@ class TestTransform(GenericCappingTransformTests): def setup_class(cls): cls.transformer_name = "CappingTransformer" - def expected_df_1(self): + def expected_df_1(): """Expected output from test_expected_output_min_and_max.""" return pd.DataFrame( { - "a": [2, 2, 3, 4, 5, 5, None], - "b": [1, 2, 3, None, 7, 7, 7], - "c": [None, 1, 2, 3, 0, 0, 0], + "a": [2, 2, 3, 4, 5, 5, np.nan], + "b": [1, 2, 3, np.nan, 7, 7, 7], + "c": [np.nan, 1, 2, 3, 0, 0, 0], }, ) + @pytest.mark.parametrize( + ("df", "expected"), + ta.pandas.adjusted_dataframe_params(d.create_df_3(), expected_df_1()), + ) def test_expected_output_min_and_max_combinations( self, + df, + expected, minimal_attribute_dict, uninitialized_transformers, ): """Test that capping is applied correctly in transform.""" - df = d.create_df_3() - print(df) - expected = self.expected_df_1() - args = minimal_attribute_dict[self.transformer_name].copy() args["capping_values"] = {"a": [2, 5], "b": [None, 7], "c": [0, None]} @@ -106,17 +109,11 @@ def test_expected_output_min_and_max_combinations( df_transformed = transformer.transform(df) - assert_frame_equal_dispatch(df_transformed, expected) - - # Check outcomes for single rows - for i in range(len(df)): - df_transformed_row = transformer.transform(df.iloc[[i]]) - df_expected_row = expected.iloc[[i]] - - assert_frame_equal_dispatch( - df_transformed_row, - df_expected_row, - ) + ta.equality.assert_frame_equal_msg( + actual=df_transformed, + expected=expected, + msg_tag=f"Unexpected values in {self.transformer_name}.transform", + ) class TestOtherBaseBehaviour(OtherBaseBehaviourTests): From ca2b5ca046b257463e89359f4a1be0933494c174 Mon Sep 17 00:00:00 2001 From: limlam96 <103185696+limlam96@users.noreply.github.com> Date: Mon, 6 Jan 2025 13:35:26 +0000 Subject: [PATCH 09/10] updated capping docstring --- tubular/capping.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tubular/capping.py b/tubular/capping.py index a4c7b4c1..a5ef3e3b 100644 --- a/tubular/capping.py +++ b/tubular/capping.py @@ -248,7 +248,7 @@ def prepare_quantiles( X : FrameT Dataframe with relevant columns to calculate quantiles from. - quantiles : None + quantiles : list[float] Weighted quantiles to calculate. Must all be between 0 and 1. values_col: str From 45fcbc198ae645df1c30eaca5bc01c48ec7306da Mon Sep 17 00:00:00 2001 From: limlam96 <103185696+limlam96@users.noreply.github.com> Date: Mon, 6 Jan 2025 13:49:42 +0000 Subject: [PATCH 10/10] edited super.fit handling in OutOfRangeNullTr --- tubular/capping.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tubular/capping.py b/tubular/capping.py index a5ef3e3b..f69600fa 100644 --- a/tubular/capping.py +++ b/tubular/capping.py @@ -664,9 +664,7 @@ def fit(self, X: pd.DataFrame, y: None = None) -> OutOfRangeNullTransformer: Required for pipeline. """ - # be careful to only run BaseCappingTransformer fit for quantiles case - # or we will overwrite our _replacement_values from init - BaseNumericTransformer.fit(self, X=X, y=y) + super().fit(X=X, y=y) if self.weights_column: WeightColumnMixin.check_weights_column(self, X, self.weights_column)