fix(inverse_transform): make undummify work when there is the separator in the variable name #111

Merged · 3 commits · Jan 18, 2024
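The bug in a nutshell: `undummify` recovered each modality by splitting the dummy column name on `DUMMIES_SEPARATOR` and keeping the second piece, which returns the wrong value whenever the original variable name itself contains the separator. The fix strips exactly the `<column><separator>` prefix instead. A minimal standalone sketch of the two behaviours (assumes Python 3.9+ for `str.removeprefix`; the toy names mirror the diff below):

```python
DUMMIES_SEPARATOR = "___"

def get_suffix_old(string: str) -> str:
    # Old behaviour: split on the separator and keep the second piece.
    return string.split(DUMMIES_SEPARATOR)[1]

def get_suffix_new(string: str, original_column: str) -> str:
    # New behaviour: remove exactly the "<column><separator>" prefix.
    return string.removeprefix(original_column + DUMMIES_SEPARATOR)

# A variable whose own name ends with the separator:
column = f"tool{DUMMIES_SEPARATOR}"            # "tool___"
dummy = f"{column}{DUMMIES_SEPARATOR}hammer"   # "tool______hammer"

print(get_suffix_old(dummy))          # "" -- the split eats too much
print(get_suffix_new(dummy, column))  # "hammer"
```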
4 changes: 2 additions & 2 deletions saiph/conftest.py
@@ -5,7 +5,7 @@
import pytest

from saiph.models import Model
-from saiph.reduction import DUMMIES_PREFIX_SEP
+from saiph.reduction import DUMMIES_SEPARATOR

_iris_csv = pd.read_csv("tests/fixtures/iris.csv")
_wbcd_csv = pd.read_csv("tests/fixtures/breast_cancer_wisconsin.csv")
@@ -127,7 +127,7 @@ def wbcd_supplemental_coord_mixed() -> pd.DataFrame:

@pytest.fixture
def mapping() -> Dict[str, List[str]]:
-    sep = DUMMIES_PREFIX_SEP
+    sep = DUMMIES_SEPARATOR
    return {
        "tool": [f"tool{sep}hammer", f"tool{sep}wrench"],
        "fruit": [f"fruit{sep}apple", f"fruit{sep}orange"],
10 changes: 6 additions & 4 deletions saiph/inverse_transform.py
@@ -8,7 +8,7 @@

from saiph.exception import InvalidParameterException
from saiph.models import Model
-from saiph.reduction import DUMMIES_PREFIX_SEP
+from saiph.reduction import DUMMIES_SEPARATOR
from saiph.reduction.utils.common import get_dummies_mapping


@@ -138,8 +138,8 @@ def undummify(
"""
inverse_quali = pd.DataFrame()

def get_suffix(string: str) -> str:
return string.split(DUMMIES_PREFIX_SEP)[1]
def get_suffix(string: str, original_column: str) -> str:
return string.removeprefix(original_column + DUMMIES_SEPARATOR)

    for original_column, dummy_columns in dummies_mapping.items():
        # Handle a single category with all the possible modalities
@@ -149,7 +149,9 @@ def get_suffix(string: str) -> str:
            chosen_modalities = single_category.idxmax(axis="columns")
        else:
            chosen_modalities = get_random_weighted_columns(single_category, random_gen)
-        inverse_quali[original_column] = list(map(get_suffix, chosen_modalities))
+        inverse_quali[original_column] = list(
+            map(lambda x: get_suffix(x, original_column), chosen_modalities)
+        )

    return inverse_quali

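A note on the loop body above: the lambda's rebinding of `original_column` is evaluated immediately by `list(map(...))` within the same iteration, so there is no late-binding pitfall here. If one preferred to avoid the lambda, an equivalent formulation (a sketch, not part of this diff) could use `functools.partial`:

```python
from functools import partial

# Hypothetical alternative to the lambda used in the diff above:
inverse_quali[original_column] = list(
    map(partial(get_suffix, original_column=original_column), chosen_modalities)
)
```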
31 changes: 31 additions & 0 deletions saiph/inverse_transform_test.py
@@ -13,6 +13,7 @@
undummify,
)
from saiph.projection import fit, fit_transform
+from saiph.reduction import DUMMIES_SEPARATOR


@pytest.mark.parametrize(
@@ -79,6 +80,36 @@ def test_undummify(
assert_frame_equal(df, expected)


+def test_undummify_when_dummies_prefix_is_in_variable_name() -> None:
+    column_name = f"tool{DUMMIES_SEPARATOR}"
+
+    dummy_df = pd.DataFrame(
+        [[0.3, 0.7], [0.51, 0.49]],
+        columns=[
+            f"{column_name}{DUMMIES_SEPARATOR}hammer",
+            f"{column_name}{DUMMIES_SEPARATOR}wrench",
+        ],
+    )
+    mapping = {
+        column_name: [
+            f"{column_name}{DUMMIES_SEPARATOR}hammer",
+            f"{column_name}{DUMMIES_SEPARATOR}wrench",
+        ],
+    }
+
+    df = undummify(
+        dummy_df,
+        mapping,
+        use_max_modalities=True,
+    )
+
+    expected = pd.DataFrame(
+        [["wrench"], ["hammer"]], columns=[f"tool{DUMMIES_SEPARATOR}"]
+    )
+
+    assert_frame_equal(df, expected)
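Reading the expected frame: with `use_max_modalities=True`, `undummify` takes the row-wise `idxmax`, so `[0.3, 0.7]` resolves to `wrench` and `[0.51, 0.49]` to `hammer`, and the recovered modalities come back clean even though the variable name `tool___` itself ends with the separator.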


# wider than len df
def test_inverse_transform_raises_value_error_when_wider_than_df() -> None:
wider_df = pd.DataFrame(
6 changes: 3 additions & 3 deletions saiph/projection.py
@@ -7,7 +7,7 @@

from saiph.exception import InvalidParameterException
from saiph.models import Model
-from saiph.reduction import DUMMIES_PREFIX_SEP, famd, famd_sparse, mca, pca
+from saiph.reduction import DUMMIES_SEPARATOR, famd, famd_sparse, mca, pca
from saiph.reduction.utils.common import get_projected_column_names


@@ -48,7 +48,7 @@ def fit(
f"got {unknown_variables} instead."
)

-    _nf = nf if nf else min(pd.get_dummies(df, prefix_sep=DUMMIES_PREFIX_SEP).shape)
+    _nf = nf if nf else min(pd.get_dummies(df, prefix_sep=DUMMIES_SEPARATOR).shape)
# If seed is None or int, we fit a Generator, else we use the one provided.
random_gen = (
seed if isinstance(seed, np.random.Generator) else np.random.default_rng(seed)
@@ -248,7 +248,7 @@ def get_variable_correlation(
if has_some_quali:
df_quali = pd.get_dummies(
df[model.original_categorical].astype("category"),
-            prefix_sep=DUMMIES_PREFIX_SEP,
+            prefix_sep=DUMMIES_SEPARATOR,
)
bind = pd.concat([df_quanti, df_quali], axis=1)
else:
2 changes: 1 addition & 1 deletion saiph/reduction/__init__.py
@@ -1 +1 @@
-DUMMIES_PREFIX_SEP = "___"
+DUMMIES_SEPARATOR = "___"
10 changes: 5 additions & 5 deletions saiph/reduction/famd.py
@@ -9,7 +9,7 @@
from numpy.typing import NDArray

from saiph.models import Model
-from saiph.reduction import DUMMIES_PREFIX_SEP
+from saiph.reduction import DUMMIES_SEPARATOR
from saiph.reduction.utils.common import (
column_multiplication,
get_dummies_mapping,
@@ -57,7 +57,7 @@ def center(

# scale the categorical data
df_quali = pd.get_dummies(
-        df[quali].astype("category"), prefix_sep=DUMMIES_PREFIX_SEP
+        df[quali].astype("category"), prefix_sep=DUMMIES_SEPARATOR
)
# .mean() is the same as counting 0s and 1s
# This will only work if we stick with pd.get_dummies to encode the modalities
@@ -110,7 +110,7 @@ def fit(
quanti = df.select_dtypes(include=["int", "float", "number"]).columns.to_list()
quali = df.select_dtypes(exclude=["int", "float", "number"]).columns.to_list()
dummy_categorical = pd.get_dummies(
-        df[quali].astype("category"), prefix_sep=DUMMIES_PREFIX_SEP
+        df[quali].astype("category"), prefix_sep=DUMMIES_SEPARATOR
).columns.to_list()
modalities_types = get_modalities_types(df[quali])

@@ -239,7 +239,7 @@ def scaler(model: Model, df: pd.DataFrame) -> pd.DataFrame:

# scale
df_quali = pd.get_dummies(
-        df[model.original_categorical].astype("category"), prefix_sep=DUMMIES_PREFIX_SEP
+        df[model.original_categorical].astype("category"), prefix_sep=DUMMIES_SEPARATOR
)
# Here we add a column with 0 if the modality is not present in the dataset but
# was used to train the saiph model
@@ -389,7 +389,7 @@ def compute_categorical_cos2(

mapping = get_dummies_mapping(model.original_categorical, model.dummy_categorical)
dummy = pd.get_dummies(
-        df[model.original_categorical].astype("category"), prefix_sep=DUMMIES_PREFIX_SEP
+        df[model.original_categorical].astype("category"), prefix_sep=DUMMIES_SEPARATOR
)
# Compute the categorical cos2 for each original column
all_category_cos = {}
6 changes: 3 additions & 3 deletions saiph/reduction/famd_sparse.py
@@ -9,7 +9,7 @@
from scipy.sparse import csr_matrix

from saiph.models import Model
-from saiph.reduction import DUMMIES_PREFIX_SEP
+from saiph.reduction import DUMMIES_SEPARATOR
from saiph.reduction.famd import fit as fit_famd
from saiph.reduction.famd import transform as transform_famd

@@ -102,7 +102,7 @@ def center_sparse(

# scale the categorical data
df_quali = pd.get_dummies(
-        df[quali].astype("category"), prefix_sep=DUMMIES_PREFIX_SEP
+        df[quali].astype("category"), prefix_sep=DUMMIES_SEPARATOR
)
_modalities = df_quali.columns
df_quali = csr_matrix(df_quali)
@@ -130,7 +130,7 @@ def scaler_sparse(model: Model, df: pd.DataFrame) -> pd.DataFrame:

# scale
df_quali = pd.get_dummies(
-        df[model.original_categorical].astype("category"), prefix_sep=DUMMIES_PREFIX_SEP
+        df[model.original_categorical].astype("category"), prefix_sep=DUMMIES_SEPARATOR
)
if model._modalities is not None:
for mod in model._modalities:
18 changes: 9 additions & 9 deletions saiph/reduction/famd_sparse_test.py
@@ -7,7 +7,7 @@
from pandas._testing.asserters import assert_series_equal
from pandas.testing import assert_frame_equal

-from saiph.reduction import DUMMIES_PREFIX_SEP
+from saiph.reduction import DUMMIES_SEPARATOR
from saiph.reduction.famd_sparse import (
center_sparse,
fit,
@@ -56,10 +56,10 @@ def test_fit_mix(mixed_df2: pd.DataFrame) -> None:
assert np.array_equal(
model._modalities, # type: ignore
[
f"tool{DUMMIES_PREFIX_SEP}hammer",
f"tool{DUMMIES_PREFIX_SEP}toaster",
f"score{DUMMIES_PREFIX_SEP}aa",
f"score{DUMMIES_PREFIX_SEP}ab",
f"tool{DUMMIES_SEPARATOR}hammer",
f"tool{DUMMIES_SEPARATOR}toaster",
f"score{DUMMIES_SEPARATOR}aa",
f"score{DUMMIES_SEPARATOR}ab",
],
)
assert model.D_c is None
@@ -75,10 +75,10 @@ def test_fit_mix(mixed_df2: pd.DataFrame) -> None:
assert np.array_equal(
model._modalities, # type: ignore
[
f"tool{DUMMIES_PREFIX_SEP}hammer",
f"tool{DUMMIES_PREFIX_SEP}toaster",
f"score{DUMMIES_PREFIX_SEP}aa",
f"score{DUMMIES_PREFIX_SEP}ab",
f"tool{DUMMIES_SEPARATOR}hammer",
f"tool{DUMMIES_SEPARATOR}toaster",
f"score{DUMMIES_SEPARATOR}aa",
f"score{DUMMIES_SEPARATOR}ab",
],
)

20 changes: 10 additions & 10 deletions saiph/reduction/famd_test.py
@@ -9,7 +9,7 @@
from pandas._testing.asserters import assert_series_equal
from pandas.testing import assert_frame_equal

-from saiph.reduction import DUMMIES_PREFIX_SEP
+from saiph.reduction import DUMMIES_SEPARATOR
from saiph.reduction.famd import (
center,
fit,
@@ -59,10 +59,10 @@ def test_fit_mix(mixed_df2: pd.DataFrame) -> None:
assert np.array_equal(
model._modalities, # type: ignore
[
f"tool{DUMMIES_PREFIX_SEP}hammer",
f"tool{DUMMIES_PREFIX_SEP}toaster",
f"score{DUMMIES_PREFIX_SEP}aa",
f"score{DUMMIES_PREFIX_SEP}ab",
f"tool{DUMMIES_SEPARATOR}hammer",
f"tool{DUMMIES_SEPARATOR}toaster",
f"score{DUMMIES_SEPARATOR}aa",
f"score{DUMMIES_SEPARATOR}ab",
],
)
# Pertinent ?
@@ -81,10 +81,10 @@ def test_fit_mix(mixed_df2: pd.DataFrame) -> None:
assert np.array_equal(
model._modalities, # type: ignore
[
f"tool{DUMMIES_PREFIX_SEP}hammer",
f"tool{DUMMIES_PREFIX_SEP}toaster",
f"score{DUMMIES_PREFIX_SEP}aa",
f"score{DUMMIES_PREFIX_SEP}ab",
f"tool{DUMMIES_SEPARATOR}hammer",
f"tool{DUMMIES_SEPARATOR}toaster",
f"score{DUMMIES_SEPARATOR}aa",
f"score{DUMMIES_SEPARATOR}ab",
],
)

@@ -226,7 +226,7 @@ def test_get_variable_contributions_exploded_parameter(
contributions_not_exploded, _ = get_variable_contributions(model, df, explode=False)

dummies = filter(
-        lambda name: f"{variable}{DUMMIES_PREFIX_SEP}" in name,
+        lambda name: f"{variable}{DUMMIES_SEPARATOR}" in name,
contributions_exploded.index,
)
assert_series_equal(
10 changes: 5 additions & 5 deletions saiph/reduction/mca.py
@@ -7,7 +7,7 @@
from numpy.typing import NDArray

from saiph.models import Model
-from saiph.reduction import DUMMIES_PREFIX_SEP
+from saiph.reduction import DUMMIES_SEPARATOR
from saiph.reduction.utils.common import (
column_multiplication,
diag,
@@ -68,7 +68,7 @@ def fit(
df_scale, T, D_c = _diag_compute(df_scale, r, c)

# Get the array gathering proportion of each modality among individual (N/n)
-    df_dummies = pd.get_dummies(df.astype("category"), prefix_sep=DUMMIES_PREFIX_SEP)
+    df_dummies = pd.get_dummies(df.astype("category"), prefix_sep=DUMMIES_SEPARATOR)
dummies_col_prop = (len(df_dummies) / df_dummies.sum(axis=0)).to_numpy()

# Apply the weights and compute the svd
@@ -156,7 +156,7 @@ def center(
row_sum: Sums line by line
column_sum: Sums column by column
"""
-    df_scale = pd.get_dummies(df.astype("category"), prefix_sep=DUMMIES_PREFIX_SEP)
+    df_scale = pd.get_dummies(df.astype("category"), prefix_sep=DUMMIES_SEPARATOR)
_modalities = df_scale.columns.values

# scale data
@@ -177,7 +177,7 @@ def scaler(model: Model, df: pd.DataFrame) -> pd.DataFrame:
Returns:
df_scaled: The scaled DataFrame.
"""
-    df_scaled = pd.get_dummies(df.astype("category"), prefix_sep=DUMMIES_PREFIX_SEP)
+    df_scaled = pd.get_dummies(df.astype("category"), prefix_sep=DUMMIES_SEPARATOR)
if model._modalities is not None:
for mod in model._modalities:
if mod not in df_scaled:
@@ -239,7 +239,7 @@ def get_variable_contributions(
raise ValueError(
"Model has not been fitted. Call fit() to create a Model instance."
)
-    df = pd.get_dummies(df.astype("category"), prefix_sep=DUMMIES_PREFIX_SEP)
+    df = pd.get_dummies(df.astype("category"), prefix_sep=DUMMIES_SEPARATOR)

centered_df = df / df.sum().sum()

12 changes: 6 additions & 6 deletions saiph/reduction/mca_test.py
@@ -6,7 +6,7 @@
from numpy.typing import NDArray
from pandas.testing import assert_frame_equal, assert_series_equal

-from saiph.reduction import DUMMIES_PREFIX_SEP
+from saiph.reduction import DUMMIES_SEPARATOR
from saiph.reduction.mca import (
fit,
fit_transform,
@@ -47,9 +47,9 @@ def test_fit() -> None:
assert np.array_equal(
model._modalities, # type: ignore
[
f"tool{DUMMIES_PREFIX_SEP}hammer",
f"tool{DUMMIES_PREFIX_SEP}toaster",
f"score{DUMMIES_PREFIX_SEP}aa",
f"tool{DUMMIES_SEPARATOR}hammer",
f"tool{DUMMIES_SEPARATOR}toaster",
f"score{DUMMIES_SEPARATOR}aa",
],
)
assert_allclose(
@@ -87,7 +87,7 @@ def test_fit_zero() -> None:
assert pd.isna(model.explained_var_ratio).all()
assert np.array_equal(
model._modalities, # type: ignore
[f"tool{DUMMIES_PREFIX_SEP}toaster", f"score{DUMMIES_PREFIX_SEP}aa"],
[f"tool{DUMMIES_SEPARATOR}toaster", f"score{DUMMIES_SEPARATOR}aa"],
)
assert_allclose(
model.D_c,
@@ -211,7 +211,7 @@ def test_get_variable_contributions_exploded_parameter(mixed_df: pd.DataFrame) -
contributions_exploded = get_variable_contributions(model, df, explode=True)
contributions_not_exploded = get_variable_contributions(model, df, explode=False)
dummies = filter(
-        lambda name: f"{variable}{DUMMIES_PREFIX_SEP}" in name,
+        lambda name: f"{variable}{DUMMIES_SEPARATOR}" in name,
contributions_exploded.index,
)
assert_series_equal(
4 changes: 2 additions & 2 deletions saiph/reduction/utils/common.py
@@ -7,7 +7,7 @@
from numpy.typing import NDArray
from toolz import concat

-from saiph.reduction import DUMMIES_PREFIX_SEP
+from saiph.reduction import DUMMIES_SEPARATOR


def get_projected_column_names(n: int) -> List[str]:
@@ -73,7 +73,7 @@ def get_dummies_mapping(
{
col: list(
filter(
-                lambda c: c.startswith(f"{col}{DUMMIES_PREFIX_SEP}"), dummy_columns
+                lambda c: c.startswith(f"{col}{DUMMIES_SEPARATOR}"), dummy_columns
)
)
for col in columns
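For context, `get_dummies_mapping` (touched above) groups the dummy columns produced by `pd.get_dummies` back under their original variables using that same separator. A quick standalone sketch of the round trip (the toy frame mirrors the tool/fruit fixture in conftest.py):

```python
import pandas as pd

DUMMIES_SEPARATOR = "___"

df = pd.DataFrame({"tool": ["hammer", "wrench"], "fruit": ["apple", "orange"]})
dummies = pd.get_dummies(df.astype("category"), prefix_sep=DUMMIES_SEPARATOR)
print(list(dummies.columns))
# ['tool___hammer', 'tool___wrench', 'fruit___apple', 'fruit___orange']

# get_dummies_mapping(df.columns, dummies.columns) would then group them as:
# {'tool': ['tool___hammer', 'tool___wrench'],
#  'fruit': ['fruit___apple', 'fruit___orange']}
```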