fix(inverse_transform): make undummify work when there is the separator in the variable name #111

Merged · 3 commits · Jan 18, 2024
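The bug in a nutshell: `undummify` recovered each modality by splitting the dummy column name on `DUMMIES_SEPARATOR` and keeping the second piece, which returns the wrong value whenever the original variable name itself contains the separator. The fix strips exactly the `<column><separator>` prefix instead. A minimal standalone sketch of the two behaviours (assumes Python 3.9+ for `str.removeprefix`; the toy names mirror the diff below):

```python
DUMMIES_SEPARATOR = "___"

def get_suffix_old(string: str) -> str:
    # Old behaviour: split on the separator and keep the second piece.
    return string.split(DUMMIES_SEPARATOR)[1]

def get_suffix_new(string: str, original_column: str) -> str:
    # New behaviour: remove exactly the "<column><separator>" prefix.
    return string.removeprefix(original_column + DUMMIES_SEPARATOR)

# A variable whose own name ends with the separator:
column = f"tool{DUMMIES_SEPARATOR}"            # "tool___"
dummy = f"{column}{DUMMIES_SEPARATOR}hammer"   # "tool______hammer"

print(get_suffix_old(dummy))          # "" -- the split eats too much
print(get_suffix_new(dummy, column))  # "hammer"
```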
4 changes: 2 additions & 2 deletions saiph/conftest.py
@@ -5,7 +5,7 @@
import pytest

from saiph.models import Model
-from saiph.reduction import DUMMIES_PREFIX_SEP
+from saiph.reduction import DUMMIES_SEPARATOR

_iris_csv = pd.read_csv("tests/fixtures/iris.csv")
_wbcd_csv = pd.read_csv("tests/fixtures/breast_cancer_wisconsin.csv")
@@ -127,7 +127,7 @@ def wbcd_supplemental_coord_mixed() -> pd.DataFrame:

@pytest.fixture
def mapping() -> Dict[str, List[str]]:
-    sep = DUMMIES_PREFIX_SEP
+    sep = DUMMIES_SEPARATOR
    return {
        "tool": [f"tool{sep}hammer", f"tool{sep}wrench"],
        "fruit": [f"fruit{sep}apple", f"fruit{sep}orange"],
10 changes: 6 additions & 4 deletions saiph/inverse_transform.py
@@ -8,7 +8,7 @@

from saiph.exception import InvalidParameterException
from saiph.models import Model
-from saiph.reduction import DUMMIES_PREFIX_SEP
+from saiph.reduction import DUMMIES_SEPARATOR
from saiph.reduction.utils.common import get_dummies_mapping


@@ -138,8 +138,8 @@ def undummify(
"""
inverse_quali = pd.DataFrame()

def get_suffix(string: str) -> str:
return string.split(DUMMIES_PREFIX_SEP)[1]
def get_suffix(string: str, original_column: str) -> str:
return string.removeprefix(original_column + DUMMIES_SEPARATOR)

    for original_column, dummy_columns in dummies_mapping.items():
        # Handle a single category with all the possible modalities
@@ -149,7 +149,9 @@ def get_suffix(string: str) -> str:
            chosen_modalities = single_category.idxmax(axis="columns")
        else:
            chosen_modalities = get_random_weighted_columns(single_category, random_gen)
-        inverse_quali[original_column] = list(map(get_suffix, chosen_modalities))
+        inverse_quali[original_column] = list(
+            map(lambda x: get_suffix(x, original_column), chosen_modalities)
+        )

    return inverse_quali

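A note on the loop body above: the lambda's rebinding of `original_column` is evaluated immediately by `list(map(...))` within the same iteration, so there is no late-binding pitfall here. If one preferred to avoid the lambda, an equivalent formulation (a sketch, not part of this diff) could use `functools.partial`:

```python
from functools import partial

# Hypothetical alternative to the lambda used in the diff above:
inverse_quali[original_column] = list(
    map(partial(get_suffix, original_column=original_column), chosen_modalities)
)
```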
31 changes: 31 additions & 0 deletions saiph/inverse_transform_test.py
@@ -13,6 +13,7 @@
undummify,
)
from saiph.projection import fit, fit_transform
+from saiph.reduction import DUMMIES_SEPARATOR


@pytest.mark.parametrize(
@@ -79,6 +80,36 @@ def test_undummify(
assert_frame_equal(df, expected)


+def test_undummify_when_dummies_prefix_is_in_variable_name() -> None:
+    column_name = f"tool{DUMMIES_SEPARATOR}"
+
+    dummy_df = pd.DataFrame(
+        [[0.3, 0.7], [0.51, 0.49]],
+        columns=[
+            f"{column_name}{DUMMIES_SEPARATOR}hammer",
+            f"{column_name}{DUMMIES_SEPARATOR}wrench",
+        ],
+    )
+    mapping = {
+        column_name: [
+            f"{column_name}{DUMMIES_SEPARATOR}hammer",
+            f"{column_name}{DUMMIES_SEPARATOR}wrench",
+        ],
+    }
+
+    df = undummify(
+        dummy_df,
+        mapping,
+        use_max_modalities=True,
+    )
+
+    expected = pd.DataFrame(
+        [["wrench"], ["hammer"]], columns=[f"tool{DUMMIES_SEPARATOR}"]
+    )
+
+    assert_frame_equal(df, expected)
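Reading the expected frame: with `use_max_modalities=True`, `undummify` takes the row-wise `idxmax`, so `[0.3, 0.7]` resolves to `wrench` and `[0.51, 0.49]` to `hammer`, and the recovered modalities come back clean even though the variable name `tool___` itself ends with the separator.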


# wider than len df
def test_inverse_transform_raises_value_error_when_wider_than_df() -> None:
wider_df = pd.DataFrame(
6 changes: 3 additions & 3 deletions saiph/projection.py
@@ -7,7 +7,7 @@

from saiph.exception import InvalidParameterException
from saiph.models import Model
-from saiph.reduction import DUMMIES_PREFIX_SEP, famd, famd_sparse, mca, pca
+from saiph.reduction import DUMMIES_SEPARATOR, famd, famd_sparse, mca, pca
from saiph.reduction.utils.common import get_projected_column_names


@@ -48,7 +48,7 @@ def fit(
f"got {unknown_variables} instead."
)

-    _nf = nf if nf else min(pd.get_dummies(df, prefix_sep=DUMMIES_PREFIX_SEP).shape)
+    _nf = nf if nf else min(pd.get_dummies(df, prefix_sep=DUMMIES_SEPARATOR).shape)
# If seed is None or int, we fit a Generator, else we use the one provided.
random_gen = (
seed if isinstance(seed, np.random.Generator) else np.random.default_rng(seed)
@@ -248,7 +248,7 @@ def get_variable_correlation(
if has_some_quali:
df_quali = pd.get_dummies(
df[model.original_categorical].astype("category"),
-            prefix_sep=DUMMIES_PREFIX_SEP,
+            prefix_sep=DUMMIES_SEPARATOR,
)
bind = pd.concat([df_quanti, df_quali], axis=1)
else:
2 changes: 1 addition & 1 deletion saiph/reduction/__init__.py
@@ -1 +1 @@
-DUMMIES_PREFIX_SEP = "___"
+DUMMIES_SEPARATOR = "___"
10 changes: 5 additions & 5 deletions saiph/reduction/famd.py
@@ -9,7 +9,7 @@
from numpy.typing import NDArray

from saiph.models import Model
-from saiph.reduction import DUMMIES_PREFIX_SEP
+from saiph.reduction import DUMMIES_SEPARATOR
from saiph.reduction.utils.common import (
column_multiplication,
get_dummies_mapping,
@@ -57,7 +57,7 @@ def center(

# scale the categorical data
df_quali = pd.get_dummies(
-        df[quali].astype("category"), prefix_sep=DUMMIES_PREFIX_SEP
+        df[quali].astype("category"), prefix_sep=DUMMIES_SEPARATOR
)
# .mean() is the same as counting 0s and 1s
# This will only work if we stick with pd.get_dummies to encode the modalities
@@ -110,7 +110,7 @@ def fit(
quanti = df.select_dtypes(include=["int", "float", "number"]).columns.to_list()
quali = df.select_dtypes(exclude=["int", "float", "number"]).columns.to_list()
dummy_categorical = pd.get_dummies(
-        df[quali].astype("category"), prefix_sep=DUMMIES_PREFIX_SEP
+        df[quali].astype("category"), prefix_sep=DUMMIES_SEPARATOR
).columns.to_list()
modalities_types = get_modalities_types(df[quali])

@@ -239,7 +239,7 @@ def scaler(model: Model, df: pd.DataFrame) -> pd.DataFrame:

# scale
df_quali = pd.get_dummies(
-        df[model.original_categorical].astype("category"), prefix_sep=DUMMIES_PREFIX_SEP
+        df[model.original_categorical].astype("category"), prefix_sep=DUMMIES_SEPARATOR
)
# Here we add a column with 0 if the modality is not present in the dataset but
# was used to train the saiph model
@@ -389,7 +389,7 @@ def compute_categorical_cos2(

mapping = get_dummies_mapping(model.original_categorical, model.dummy_categorical)
dummy = pd.get_dummies(
-        df[model.original_categorical].astype("category"), prefix_sep=DUMMIES_PREFIX_SEP
+        df[model.original_categorical].astype("category"), prefix_sep=DUMMIES_SEPARATOR
)
# Compute the categorical cos2 for each original column
all_category_cos = {}
6 changes: 3 additions & 3 deletions saiph/reduction/famd_sparse.py
@@ -9,7 +9,7 @@
from scipy.sparse import csr_matrix

from saiph.models import Model
-from saiph.reduction import DUMMIES_PREFIX_SEP
+from saiph.reduction import DUMMIES_SEPARATOR
from saiph.reduction.famd import fit as fit_famd
from saiph.reduction.famd import transform as transform_famd

@@ -102,7 +102,7 @@ def center_sparse(

# scale the categorical data
df_quali = pd.get_dummies(
-        df[quali].astype("category"), prefix_sep=DUMMIES_PREFIX_SEP
+        df[quali].astype("category"), prefix_sep=DUMMIES_SEPARATOR
)
_modalities = df_quali.columns
df_quali = csr_matrix(df_quali)
@@ -130,7 +130,7 @@ def scaler_sparse(model: Model, df: pd.DataFrame) -> pd.DataFrame:

# scale
df_quali = pd.get_dummies(
-        df[model.original_categorical].astype("category"), prefix_sep=DUMMIES_PREFIX_SEP
+        df[model.original_categorical].astype("category"), prefix_sep=DUMMIES_SEPARATOR
)
if model._modalities is not None:
for mod in model._modalities:
18 changes: 9 additions & 9 deletions saiph/reduction/famd_sparse_test.py
@@ -7,7 +7,7 @@
from pandas._testing.asserters import assert_series_equal
from pandas.testing import assert_frame_equal

-from saiph.reduction import DUMMIES_PREFIX_SEP
+from saiph.reduction import DUMMIES_SEPARATOR
from saiph.reduction.famd_sparse import (
center_sparse,
fit,
@@ -56,10 +56,10 @@ def test_fit_mix(mixed_df2: pd.DataFrame) -> None:
assert np.array_equal(
model._modalities, # type: ignore
[
f"tool{DUMMIES_PREFIX_SEP}hammer",
f"tool{DUMMIES_PREFIX_SEP}toaster",
f"score{DUMMIES_PREFIX_SEP}aa",
f"score{DUMMIES_PREFIX_SEP}ab",
f"tool{DUMMIES_SEPARATOR}hammer",
f"tool{DUMMIES_SEPARATOR}toaster",
f"score{DUMMIES_SEPARATOR}aa",
f"score{DUMMIES_SEPARATOR}ab",
],
)
assert model.D_c is None
@@ -75,10 +75,10 @@ def test_fit_mix(mixed_df2: pd.DataFrame) -> None:
assert np.array_equal(
model._modalities, # type: ignore
[
f"tool{DUMMIES_PREFIX_SEP}hammer",
f"tool{DUMMIES_PREFIX_SEP}toaster",
f"score{DUMMIES_PREFIX_SEP}aa",
f"score{DUMMIES_PREFIX_SEP}ab",
f"tool{DUMMIES_SEPARATOR}hammer",
f"tool{DUMMIES_SEPARATOR}toaster",
f"score{DUMMIES_SEPARATOR}aa",
f"score{DUMMIES_SEPARATOR}ab",
],
)

20 changes: 10 additions & 10 deletions saiph/reduction/famd_test.py
@@ -9,7 +9,7 @@
from pandas._testing.asserters import assert_series_equal
from pandas.testing import assert_frame_equal

-from saiph.reduction import DUMMIES_PREFIX_SEP
+from saiph.reduction import DUMMIES_SEPARATOR
from saiph.reduction.famd import (
center,
fit,
@@ -59,10 +59,10 @@ def test_fit_mix(mixed_df2: pd.DataFrame) -> None:
assert np.array_equal(
model._modalities, # type: ignore
[
f"tool{DUMMIES_PREFIX_SEP}hammer",
f"tool{DUMMIES_PREFIX_SEP}toaster",
f"score{DUMMIES_PREFIX_SEP}aa",
f"score{DUMMIES_PREFIX_SEP}ab",
f"tool{DUMMIES_SEPARATOR}hammer",
f"tool{DUMMIES_SEPARATOR}toaster",
f"score{DUMMIES_SEPARATOR}aa",
f"score{DUMMIES_SEPARATOR}ab",
],
)
# Pertinent ?
@@ -81,10 +81,10 @@ def test_fit_mix(mixed_df2: pd.DataFrame) -> None:
assert np.array_equal(
model._modalities, # type: ignore
[
f"tool{DUMMIES_PREFIX_SEP}hammer",
f"tool{DUMMIES_PREFIX_SEP}toaster",
f"score{DUMMIES_PREFIX_SEP}aa",
f"score{DUMMIES_PREFIX_SEP}ab",
f"tool{DUMMIES_SEPARATOR}hammer",
f"tool{DUMMIES_SEPARATOR}toaster",
f"score{DUMMIES_SEPARATOR}aa",
f"score{DUMMIES_SEPARATOR}ab",
],
)

@@ -226,7 +226,7 @@ def test_get_variable_contributions_exploded_parameter(
contributions_not_exploded, _ = get_variable_contributions(model, df, explode=False)

dummies = filter(
-        lambda name: f"{variable}{DUMMIES_PREFIX_SEP}" in name,
+        lambda name: f"{variable}{DUMMIES_SEPARATOR}" in name,
contributions_exploded.index,
)
assert_series_equal(
10 changes: 5 additions & 5 deletions saiph/reduction/mca.py
@@ -7,7 +7,7 @@
from numpy.typing import NDArray

from saiph.models import Model
-from saiph.reduction import DUMMIES_PREFIX_SEP
+from saiph.reduction import DUMMIES_SEPARATOR
from saiph.reduction.utils.common import (
column_multiplication,
diag,
@@ -68,7 +68,7 @@ def fit(
df_scale, T, D_c = _diag_compute(df_scale, r, c)

# Get the array gathering proportion of each modality among individual (N/n)
-    df_dummies = pd.get_dummies(df.astype("category"), prefix_sep=DUMMIES_PREFIX_SEP)
+    df_dummies = pd.get_dummies(df.astype("category"), prefix_sep=DUMMIES_SEPARATOR)
dummies_col_prop = (len(df_dummies) / df_dummies.sum(axis=0)).to_numpy()

# Apply the weights and compute the svd
@@ -156,7 +156,7 @@ def center(
row_sum: Sums line by line
column_sum: Sums column by column
"""
-    df_scale = pd.get_dummies(df.astype("category"), prefix_sep=DUMMIES_PREFIX_SEP)
+    df_scale = pd.get_dummies(df.astype("category"), prefix_sep=DUMMIES_SEPARATOR)
_modalities = df_scale.columns.values

# scale data
@@ -177,7 +177,7 @@ def scaler(model: Model, df: pd.DataFrame) -> pd.DataFrame:
Returns:
df_scaled: The scaled DataFrame.
"""
-    df_scaled = pd.get_dummies(df.astype("category"), prefix_sep=DUMMIES_PREFIX_SEP)
+    df_scaled = pd.get_dummies(df.astype("category"), prefix_sep=DUMMIES_SEPARATOR)
if model._modalities is not None:
for mod in model._modalities:
if mod not in df_scaled:
@@ -239,7 +239,7 @@ def get_variable_contributions(
raise ValueError(
"Model has not been fitted. Call fit() to create a Model instance."
)
-    df = pd.get_dummies(df.astype("category"), prefix_sep=DUMMIES_PREFIX_SEP)
+    df = pd.get_dummies(df.astype("category"), prefix_sep=DUMMIES_SEPARATOR)

centered_df = df / df.sum().sum()

12 changes: 6 additions & 6 deletions saiph/reduction/mca_test.py
@@ -6,7 +6,7 @@
from numpy.typing import NDArray
from pandas.testing import assert_frame_equal, assert_series_equal

-from saiph.reduction import DUMMIES_PREFIX_SEP
+from saiph.reduction import DUMMIES_SEPARATOR
from saiph.reduction.mca import (
fit,
fit_transform,
@@ -47,9 +47,9 @@ def test_fit() -> None:
assert np.array_equal(
model._modalities, # type: ignore
[
f"tool{DUMMIES_PREFIX_SEP}hammer",
f"tool{DUMMIES_PREFIX_SEP}toaster",
f"score{DUMMIES_PREFIX_SEP}aa",
f"tool{DUMMIES_SEPARATOR}hammer",
f"tool{DUMMIES_SEPARATOR}toaster",
f"score{DUMMIES_SEPARATOR}aa",
],
)
assert_allclose(
@@ -87,7 +87,7 @@ def test_fit_zero() -> None:
assert pd.isna(model.explained_var_ratio).all()
assert np.array_equal(
model._modalities, # type: ignore
[f"tool{DUMMIES_PREFIX_SEP}toaster", f"score{DUMMIES_PREFIX_SEP}aa"],
[f"tool{DUMMIES_SEPARATOR}toaster", f"score{DUMMIES_SEPARATOR}aa"],
)
assert_allclose(
model.D_c,
@@ -211,7 +211,7 @@ def test_get_variable_contributions_exploded_parameter(mixed_df: pd.DataFrame) -
contributions_exploded = get_variable_contributions(model, df, explode=True)
contributions_not_exploded = get_variable_contributions(model, df, explode=False)
dummies = filter(
-        lambda name: f"{variable}{DUMMIES_PREFIX_SEP}" in name,
+        lambda name: f"{variable}{DUMMIES_SEPARATOR}" in name,
contributions_exploded.index,
)
assert_series_equal(
4 changes: 2 additions & 2 deletions saiph/reduction/utils/common.py
@@ -7,7 +7,7 @@
from numpy.typing import NDArray
from toolz import concat

-from saiph.reduction import DUMMIES_PREFIX_SEP
+from saiph.reduction import DUMMIES_SEPARATOR


def get_projected_column_names(n: int) -> List[str]:
@@ -73,7 +73,7 @@ def get_dummies_mapping(
{
col: list(
filter(
-                lambda c: c.startswith(f"{col}{DUMMIES_PREFIX_SEP}"), dummy_columns
+                lambda c: c.startswith(f"{col}{DUMMIES_SEPARATOR}"), dummy_columns
)
)
for col in columns
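For context, `get_dummies_mapping` (touched above) groups the dummy columns produced by `pd.get_dummies` back under their original variables using that same separator. A quick standalone sketch of the round trip (the toy frame mirrors the tool/fruit fixture in conftest.py):

```python
import pandas as pd

DUMMIES_SEPARATOR = "___"

df = pd.DataFrame({"tool": ["hammer", "wrench"], "fruit": ["apple", "orange"]})
dummies = pd.get_dummies(df.astype("category"), prefix_sep=DUMMIES_SEPARATOR)
print(list(dummies.columns))
# ['tool___hammer', 'tool___wrench', 'fruit___apple', 'fruit___orange']

# get_dummies_mapping(df.columns, dummies.columns) would then group them as:
# {'tool': ['tool___hammer', 'tool___wrench'],
#  'fruit': ['fruit___apple', 'fruit___orange']}
```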