From ebb1cfe17ee78414e48342ed0ae7787244ed9a16 Mon Sep 17 00:00:00 2001 From: Julien PETOT <48205834+jpetot@users.noreply.github.com> Date: Thu, 18 Jan 2024 11:56:35 +0100 Subject: [PATCH 1/3] fix(inverse_transform): make inverse transform work when there is the separator in the variable name --- saiph/conftest.py | 4 ++-- saiph/inverse_transform.py | 8 ++++---- saiph/inverse_transform_test.py | 24 ++++++++++++++++++++++++ saiph/projection.py | 6 +++--- saiph/reduction/__init__.py | 2 +- saiph/reduction/famd.py | 10 +++++----- saiph/reduction/famd_sparse.py | 6 +++--- saiph/reduction/famd_sparse_test.py | 18 +++++++++--------- saiph/reduction/famd_test.py | 20 ++++++++++---------- saiph/reduction/mca.py | 10 +++++----- saiph/reduction/mca_test.py | 12 ++++++------ saiph/reduction/utils/common.py | 4 ++-- saiph/reduction/utils/common_test.py | 4 ++-- 13 files changed, 76 insertions(+), 52 deletions(-) diff --git a/saiph/conftest.py b/saiph/conftest.py index eecda8a..527338b 100644 --- a/saiph/conftest.py +++ b/saiph/conftest.py @@ -5,7 +5,7 @@ import pytest from saiph.models import Model -from saiph.reduction import DUMMIES_PREFIX_SEP +from saiph.reduction import DUMMIES_SEPARATOR _iris_csv = pd.read_csv("tests/fixtures/iris.csv") _wbcd_csv = pd.read_csv("tests/fixtures/breast_cancer_wisconsin.csv") @@ -127,7 +127,7 @@ def wbcd_supplemental_coord_mixed() -> pd.DataFrame: @pytest.fixture def mapping() -> Dict[str, List[str]]: - sep = DUMMIES_PREFIX_SEP + sep = DUMMIES_SEPARATOR return { "tool": [f"tool{sep}hammer", f"tool{sep}wrench"], "fruit": [f"fruit{sep}apple", f"fruit{sep}orange"], diff --git a/saiph/inverse_transform.py b/saiph/inverse_transform.py index 385dd06..5c5e94b 100644 --- a/saiph/inverse_transform.py +++ b/saiph/inverse_transform.py @@ -8,7 +8,7 @@ from saiph.exception import InvalidParameterException from saiph.models import Model -from saiph.reduction import DUMMIES_PREFIX_SEP +from saiph.reduction import DUMMIES_SEPARATOR from saiph.reduction.utils.common import get_dummies_mapping @@ -138,8 +138,8 @@ def undummify( """ inverse_quali = pd.DataFrame() - def get_suffix(string: str) -> str: - return string.split(DUMMIES_PREFIX_SEP)[1] + def get_suffix(string: str, original_column: str) -> str: + return string.removeprefix(original_column + DUMMIES_SEPARATOR) for original_column, dummy_columns in dummies_mapping.items(): # Handle a single category with all the possible modalities @@ -149,7 +149,7 @@ def get_suffix(string: str) -> str: chosen_modalities = single_category.idxmax(axis="columns") else: chosen_modalities = get_random_weighted_columns(single_category, random_gen) - inverse_quali[original_column] = list(map(get_suffix, chosen_modalities)) + inverse_quali[original_column] = list(map(lambda x: get_suffix(x, original_column), chosen_modalities)) return inverse_quali diff --git a/saiph/inverse_transform_test.py b/saiph/inverse_transform_test.py index 2aead62..e60cab5 100644 --- a/saiph/inverse_transform_test.py +++ b/saiph/inverse_transform_test.py @@ -13,6 +13,7 @@ undummify, ) from saiph.projection import fit, fit_transform +from saiph.reduction import DUMMIES_SEPARATOR @pytest.mark.parametrize( @@ -78,6 +79,29 @@ def test_undummify( assert_frame_equal(df, expected) +def test_undummify_when_dummies_prefix_is_in_variable_name() -> None: + column_name = f"tool{DUMMIES_SEPARATOR}" + + dummy_df = pd.DataFrame( + [[0.3, 0.7], [0.51, 0.49]], + columns=[f"{column_name}{DUMMIES_SEPARATOR}hammer", f"{column_name}{DUMMIES_SEPARATOR}wrench"], + ) + mapping = { + column_name: [f"{column_name}{DUMMIES_SEPARATOR}hammer", f"{column_name}{DUMMIES_SEPARATOR}wrench"], + } + + df = undummify( + dummy_df, + mapping, + use_max_modalities=True, + ) + + expected = pd.DataFrame( + [["wrench"], ["hammer"]], columns=[f"tool{DUMMIES_SEPARATOR}"] + ) + + assert_frame_equal(df, expected) + # wider than len df def test_inverse_transform_raises_value_error_when_wider_than_df() -> None: diff --git a/saiph/projection.py b/saiph/projection.py index dda2c0a..07d37fc 100644 --- a/saiph/projection.py +++ b/saiph/projection.py @@ -7,7 +7,7 @@ from saiph.exception import InvalidParameterException from saiph.models import Model -from saiph.reduction import DUMMIES_PREFIX_SEP, famd, famd_sparse, mca, pca +from saiph.reduction import DUMMIES_SEPARATOR, famd, famd_sparse, mca, pca from saiph.reduction.utils.common import get_projected_column_names @@ -48,7 +48,7 @@ def fit( f"got {unknown_variables} instead." ) - _nf = nf if nf else min(pd.get_dummies(df, prefix_sep=DUMMIES_PREFIX_SEP).shape) + _nf = nf if nf else min(pd.get_dummies(df, prefix_sep=DUMMIES_SEPARATOR).shape) # If seed is None or int, we fit a Generator, else we use the one provided. random_gen = ( seed if isinstance(seed, np.random.Generator) else np.random.default_rng(seed) @@ -248,7 +248,7 @@ def get_variable_correlation( if has_some_quali: df_quali = pd.get_dummies( df[model.original_categorical].astype("category"), - prefix_sep=DUMMIES_PREFIX_SEP, + prefix_sep=DUMMIES_SEPARATOR, ) bind = pd.concat([df_quanti, df_quali], axis=1) else: diff --git a/saiph/reduction/__init__.py b/saiph/reduction/__init__.py index 8dc41ab..bc46dae 100644 --- a/saiph/reduction/__init__.py +++ b/saiph/reduction/__init__.py @@ -1 +1 @@ -DUMMIES_PREFIX_SEP = "___" +DUMMIES_SEPARATOR = "___" diff --git a/saiph/reduction/famd.py b/saiph/reduction/famd.py index 7ea51f8..7292477 100644 --- a/saiph/reduction/famd.py +++ b/saiph/reduction/famd.py @@ -9,7 +9,7 @@ from numpy.typing import NDArray from saiph.models import Model -from saiph.reduction import DUMMIES_PREFIX_SEP +from saiph.reduction import DUMMIES_SEPARATOR from saiph.reduction.utils.common import ( column_multiplication, get_dummies_mapping, @@ -57,7 +57,7 @@ def center( # scale the categorical data df_quali = pd.get_dummies( - df[quali].astype("category"), prefix_sep=DUMMIES_PREFIX_SEP + df[quali].astype("category"), prefix_sep=DUMMIES_SEPARATOR ) # .mean() is the same as counting 0s and 1s # This will only work if we stick with pd.get_dummies to encode the modalities @@ -110,7 +110,7 @@ def fit( quanti = df.select_dtypes(include=["int", "float", "number"]).columns.to_list() quali = df.select_dtypes(exclude=["int", "float", "number"]).columns.to_list() dummy_categorical = pd.get_dummies( - df[quali].astype("category"), prefix_sep=DUMMIES_PREFIX_SEP + df[quali].astype("category"), prefix_sep=DUMMIES_SEPARATOR ).columns.to_list() modalities_types = get_modalities_types(df[quali]) @@ -239,7 +239,7 @@ def scaler(model: Model, df: pd.DataFrame) -> pd.DataFrame: # scale df_quali = pd.get_dummies( - df[model.original_categorical].astype("category"), prefix_sep=DUMMIES_PREFIX_SEP + df[model.original_categorical].astype("category"), prefix_sep=DUMMIES_SEPARATOR ) # Here we add a column with 0 if the modality is not present in the dataset but # was used to train the saiph model @@ -389,7 +389,7 @@ def compute_categorical_cos2( mapping = get_dummies_mapping(model.original_categorical, model.dummy_categorical) dummy = pd.get_dummies( - df[model.original_categorical].astype("category"), prefix_sep=DUMMIES_PREFIX_SEP + df[model.original_categorical].astype("category"), prefix_sep=DUMMIES_SEPARATOR ) # Compute the categorical cos2 for each original column all_category_cos = {} diff --git a/saiph/reduction/famd_sparse.py b/saiph/reduction/famd_sparse.py index 350ac53..d4ab09b 100644 --- a/saiph/reduction/famd_sparse.py +++ b/saiph/reduction/famd_sparse.py @@ -9,7 +9,7 @@ from scipy.sparse import csr_matrix from saiph.models import Model -from saiph.reduction import DUMMIES_PREFIX_SEP +from saiph.reduction import DUMMIES_SEPARATOR from saiph.reduction.famd import fit as fit_famd from saiph.reduction.famd import transform as transform_famd @@ -102,7 +102,7 @@ def center_sparse( # scale the categorical data df_quali = pd.get_dummies( - df[quali].astype("category"), prefix_sep=DUMMIES_PREFIX_SEP + df[quali].astype("category"), prefix_sep=DUMMIES_SEPARATOR ) _modalities = df_quali.columns df_quali = csr_matrix(df_quali) @@ -130,7 +130,7 @@ def scaler_sparse(model: Model, df: pd.DataFrame) -> pd.DataFrame: # scale df_quali = pd.get_dummies( - df[model.original_categorical].astype("category"), prefix_sep=DUMMIES_PREFIX_SEP + df[model.original_categorical].astype("category"), prefix_sep=DUMMIES_SEPARATOR ) if model._modalities is not None: for mod in model._modalities: diff --git a/saiph/reduction/famd_sparse_test.py b/saiph/reduction/famd_sparse_test.py index 355f4f6..d40b11c 100644 --- a/saiph/reduction/famd_sparse_test.py +++ b/saiph/reduction/famd_sparse_test.py @@ -7,7 +7,7 @@ from pandas._testing.asserters import assert_series_equal from pandas.testing import assert_frame_equal -from saiph.reduction import DUMMIES_PREFIX_SEP +from saiph.reduction import DUMMIES_SEPARATOR from saiph.reduction.famd_sparse import ( center_sparse, fit, @@ -56,10 +56,10 @@ def test_fit_mix(mixed_df2: pd.DataFrame) -> None: assert np.array_equal( model._modalities, # type: ignore [ - f"tool{DUMMIES_PREFIX_SEP}hammer", - f"tool{DUMMIES_PREFIX_SEP}toaster", - f"score{DUMMIES_PREFIX_SEP}aa", - f"score{DUMMIES_PREFIX_SEP}ab", + f"tool{DUMMIES_SEPARATOR}hammer", + f"tool{DUMMIES_SEPARATOR}toaster", + f"score{DUMMIES_SEPARATOR}aa", + f"score{DUMMIES_SEPARATOR}ab", ], ) assert model.D_c is None @@ -75,10 +75,10 @@ def test_fit_mix(mixed_df2: pd.DataFrame) -> None: assert np.array_equal( model._modalities, # type: ignore [ - f"tool{DUMMIES_PREFIX_SEP}hammer", - f"tool{DUMMIES_PREFIX_SEP}toaster", - f"score{DUMMIES_PREFIX_SEP}aa", - f"score{DUMMIES_PREFIX_SEP}ab", + f"tool{DUMMIES_SEPARATOR}hammer", + f"tool{DUMMIES_SEPARATOR}toaster", + f"score{DUMMIES_SEPARATOR}aa", + f"score{DUMMIES_SEPARATOR}ab", ], ) diff --git a/saiph/reduction/famd_test.py b/saiph/reduction/famd_test.py index 4573f06..70c09a6 100644 --- a/saiph/reduction/famd_test.py +++ b/saiph/reduction/famd_test.py @@ -9,7 +9,7 @@ from pandas._testing.asserters import assert_series_equal from pandas.testing import assert_frame_equal -from saiph.reduction import DUMMIES_PREFIX_SEP +from saiph.reduction import DUMMIES_SEPARATOR from saiph.reduction.famd import ( center, fit, @@ -59,10 +59,10 @@ def test_fit_mix(mixed_df2: pd.DataFrame) -> None: assert np.array_equal( model._modalities, # type: ignore [ - f"tool{DUMMIES_PREFIX_SEP}hammer", - f"tool{DUMMIES_PREFIX_SEP}toaster", - f"score{DUMMIES_PREFIX_SEP}aa", - f"score{DUMMIES_PREFIX_SEP}ab", + f"tool{DUMMIES_SEPARATOR}hammer", + f"tool{DUMMIES_SEPARATOR}toaster", + f"score{DUMMIES_SEPARATOR}aa", + f"score{DUMMIES_SEPARATOR}ab", ], ) # Pertinent ? @@ -81,10 +81,10 @@ def test_fit_mix(mixed_df2: pd.DataFrame) -> None: assert np.array_equal( model._modalities, # type: ignore [ - f"tool{DUMMIES_PREFIX_SEP}hammer", - f"tool{DUMMIES_PREFIX_SEP}toaster", - f"score{DUMMIES_PREFIX_SEP}aa", - f"score{DUMMIES_PREFIX_SEP}ab", + f"tool{DUMMIES_SEPARATOR}hammer", + f"tool{DUMMIES_SEPARATOR}toaster", + f"score{DUMMIES_SEPARATOR}aa", + f"score{DUMMIES_SEPARATOR}ab", ], ) @@ -226,7 +226,7 @@ def test_get_variable_contributions_exploded_parameter( contributions_not_exploded, _ = get_variable_contributions(model, df, explode=False) dummies = filter( - lambda name: f"{variable}{DUMMIES_PREFIX_SEP}" in name, + lambda name: f"{variable}{DUMMIES_SEPARATOR}" in name, contributions_exploded.index, ) assert_series_equal( diff --git a/saiph/reduction/mca.py b/saiph/reduction/mca.py index cd7aece..7c0f5d0 100644 --- a/saiph/reduction/mca.py +++ b/saiph/reduction/mca.py @@ -7,7 +7,7 @@ from numpy.typing import NDArray from saiph.models import Model -from saiph.reduction import DUMMIES_PREFIX_SEP +from saiph.reduction import DUMMIES_SEPARATOR from saiph.reduction.utils.common import ( column_multiplication, diag, @@ -68,7 +68,7 @@ def fit( df_scale, T, D_c = _diag_compute(df_scale, r, c) # Get the array gathering proportion of each modality among individual (N/n) - df_dummies = pd.get_dummies(df.astype("category"), prefix_sep=DUMMIES_PREFIX_SEP) + df_dummies = pd.get_dummies(df.astype("category"), prefix_sep=DUMMIES_SEPARATOR) dummies_col_prop = (len(df_dummies) / df_dummies.sum(axis=0)).to_numpy() # Apply the weights and compute the svd @@ -156,7 +156,7 @@ def center( row_sum: Sums line by line column_sum: Sums column by column """ - df_scale = pd.get_dummies(df.astype("category"), prefix_sep=DUMMIES_PREFIX_SEP) + df_scale = pd.get_dummies(df.astype("category"), prefix_sep=DUMMIES_SEPARATOR) _modalities = df_scale.columns.values # scale data @@ -177,7 +177,7 @@ def scaler(model: Model, df: pd.DataFrame) -> pd.DataFrame: Returns: df_scaled: The scaled DataFrame. """ - df_scaled = pd.get_dummies(df.astype("category"), prefix_sep=DUMMIES_PREFIX_SEP) + df_scaled = pd.get_dummies(df.astype("category"), prefix_sep=DUMMIES_SEPARATOR) if model._modalities is not None: for mod in model._modalities: if mod not in df_scaled: @@ -239,7 +239,7 @@ def get_variable_contributions( raise ValueError( "Model has not been fitted. Call fit() to create a Model instance." ) - df = pd.get_dummies(df.astype("category"), prefix_sep=DUMMIES_PREFIX_SEP) + df = pd.get_dummies(df.astype("category"), prefix_sep=DUMMIES_SEPARATOR) centered_df = df / df.sum().sum() diff --git a/saiph/reduction/mca_test.py b/saiph/reduction/mca_test.py index 01dc502..d7ba301 100644 --- a/saiph/reduction/mca_test.py +++ b/saiph/reduction/mca_test.py @@ -6,7 +6,7 @@ from numpy.typing import NDArray from pandas.testing import assert_frame_equal, assert_series_equal -from saiph.reduction import DUMMIES_PREFIX_SEP +from saiph.reduction import DUMMIES_SEPARATOR from saiph.reduction.mca import ( fit, fit_transform, @@ -47,9 +47,9 @@ def test_fit() -> None: assert np.array_equal( model._modalities, # type: ignore [ - f"tool{DUMMIES_PREFIX_SEP}hammer", - f"tool{DUMMIES_PREFIX_SEP}toaster", - f"score{DUMMIES_PREFIX_SEP}aa", + f"tool{DUMMIES_SEPARATOR}hammer", + f"tool{DUMMIES_SEPARATOR}toaster", + f"score{DUMMIES_SEPARATOR}aa", ], ) assert_allclose( @@ -87,7 +87,7 @@ def test_fit_zero() -> None: assert pd.isna(model.explained_var_ratio).all() assert np.array_equal( model._modalities, # type: ignore - [f"tool{DUMMIES_PREFIX_SEP}toaster", f"score{DUMMIES_PREFIX_SEP}aa"], + [f"tool{DUMMIES_SEPARATOR}toaster", f"score{DUMMIES_SEPARATOR}aa"], ) assert_allclose( model.D_c, @@ -211,7 +211,7 @@ def test_get_variable_contributions_exploded_parameter(mixed_df: pd.DataFrame) - contributions_exploded = get_variable_contributions(model, df, explode=True) contributions_not_exploded = get_variable_contributions(model, df, explode=False) dummies = filter( - lambda name: f"{variable}{DUMMIES_PREFIX_SEP}" in name, + lambda name: f"{variable}{DUMMIES_SEPARATOR}" in name, contributions_exploded.index, ) assert_series_equal( diff --git a/saiph/reduction/utils/common.py b/saiph/reduction/utils/common.py index 8c87fe8..ff71832 100644 --- a/saiph/reduction/utils/common.py +++ b/saiph/reduction/utils/common.py @@ -7,7 +7,7 @@ from numpy.typing import NDArray from toolz import concat -from saiph.reduction import DUMMIES_PREFIX_SEP +from saiph.reduction import DUMMIES_SEPARATOR def get_projected_column_names(n: int) -> List[str]: @@ -73,7 +73,7 @@ def get_dummies_mapping( { col: list( filter( - lambda c: c.startswith(f"{col}{DUMMIES_PREFIX_SEP}"), dummy_columns + lambda c: c.startswith(f"{col}{DUMMIES_SEPARATOR}"), dummy_columns ) ) for col in columns diff --git a/saiph/reduction/utils/common_test.py b/saiph/reduction/utils/common_test.py index 6a5ebc4..ce9070d 100644 --- a/saiph/reduction/utils/common_test.py +++ b/saiph/reduction/utils/common_test.py @@ -6,7 +6,7 @@ from pandas.testing import assert_frame_equal from toolz import concat -from saiph.reduction import DUMMIES_PREFIX_SEP +from saiph.reduction import DUMMIES_SEPARATOR from saiph.reduction.utils.common import ( column_multiplication, get_dummies_mapping, @@ -48,7 +48,7 @@ def test_row_division(df: pd.DataFrame) -> None: def test_get_dummies_mapping( quali_df: pd.DataFrame, mapping: Dict[str, List[str]] ) -> None: - dummy_columns = pd.get_dummies(quali_df, prefix_sep=DUMMIES_PREFIX_SEP).columns + dummy_columns = pd.get_dummies(quali_df, prefix_sep=DUMMIES_SEPARATOR).columns result = get_dummies_mapping(quali_df.columns, dummy_columns) assert result == mapping From 14842c208d14e57842031c45cc49f3e489755c17 Mon Sep 17 00:00:00 2001 From: Julien PETOT <48205834+jpetot@users.noreply.github.com> Date: Thu, 18 Jan 2024 12:06:06 +0100 Subject: [PATCH 2/3] ci(api): apply lci --- saiph/inverse_transform.py | 4 +++- saiph/inverse_transform_test.py | 13 ++++++++++--- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/saiph/inverse_transform.py b/saiph/inverse_transform.py index 5c5e94b..325ac4a 100644 --- a/saiph/inverse_transform.py +++ b/saiph/inverse_transform.py @@ -149,7 +149,9 @@ def get_suffix(string: str, original_column: str) -> str: chosen_modalities = single_category.idxmax(axis="columns") else: chosen_modalities = get_random_weighted_columns(single_category, random_gen) - inverse_quali[original_column] = list(map(lambda x: get_suffix(x, original_column), chosen_modalities)) + inverse_quali[original_column] = list( + map(lambda x: get_suffix(x, original_column), chosen_modalities) + ) return inverse_quali diff --git a/saiph/inverse_transform_test.py b/saiph/inverse_transform_test.py index e60cab5..daeaeed 100644 --- a/saiph/inverse_transform_test.py +++ b/saiph/inverse_transform_test.py @@ -79,15 +79,22 @@ def test_undummify( assert_frame_equal(df, expected) + def test_undummify_when_dummies_prefix_is_in_variable_name() -> None: column_name = f"tool{DUMMIES_SEPARATOR}" - + dummy_df = pd.DataFrame( [[0.3, 0.7], [0.51, 0.49]], - columns=[f"{column_name}{DUMMIES_SEPARATOR}hammer", f"{column_name}{DUMMIES_SEPARATOR}wrench"], + columns=[ + f"{column_name}{DUMMIES_SEPARATOR}hammer", + f"{column_name}{DUMMIES_SEPARATOR}wrench", + ], ) mapping = { - column_name: [f"{column_name}{DUMMIES_SEPARATOR}hammer", f"{column_name}{DUMMIES_SEPARATOR}wrench"], + column_name: [ + f"{column_name}{DUMMIES_SEPARATOR}hammer", + f"{column_name}{DUMMIES_SEPARATOR}wrench", + ], } df = undummify( From e69783579e5df0cd89e0c36f19e533caeb5595af Mon Sep 17 00:00:00 2001 From: Julien PETOT <48205834+jpetot@users.noreply.github.com> Date: Thu, 18 Jan 2024 16:09:12 +0100 Subject: [PATCH 3/3] refactor: change suffix function name --- saiph/inverse_transform.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/saiph/inverse_transform.py b/saiph/inverse_transform.py index 325ac4a..264503c 100644 --- a/saiph/inverse_transform.py +++ b/saiph/inverse_transform.py @@ -138,7 +138,7 @@ def undummify( """ inverse_quali = pd.DataFrame() - def get_suffix(string: str, original_column: str) -> str: + def get_modality_from_dummy_variable(string: str, original_column: str) -> str: return string.removeprefix(original_column + DUMMIES_SEPARATOR) for original_column, dummy_columns in dummies_mapping.items(): @@ -150,7 +150,10 @@ def get_suffix(string: str, original_column: str) -> str: else: chosen_modalities = get_random_weighted_columns(single_category, random_gen) inverse_quali[original_column] = list( - map(lambda x: get_suffix(x, original_column), chosen_modalities) + map( + lambda x: get_modality_from_dummy_variable(x, original_column), + chosen_modalities, + ) ) return inverse_quali