Skip to content

Commit

Permalink
feat(inverse_transform): enable fit and transform with horizontal_mat…
Browse files Browse the repository at this point in the history
…rix (#139)

* feat(inverse_transform): enable fit and transform with horizontal_matrix

* feat(inverse_transform): add docstring and feedbacks
  • Loading branch information
raimbaultL authored Oct 16, 2024
1 parent fa56976 commit 844e95e
Show file tree
Hide file tree
Showing 6 changed files with 41 additions and 39 deletions.
13 changes: 0 additions & 13 deletions saiph/inverse_transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
import pandas as pd
from numpy.typing import NDArray

from saiph.exception import InvalidParameterException
from saiph.models import Model
from saiph.reduction import DUMMIES_SEPARATOR
from saiph.reduction.utils.common import get_dummies_mapping
Expand All @@ -16,16 +15,13 @@ def inverse_transform(
coord: pd.DataFrame,
model: Model,
*,
use_approximate_inverse: bool = False,
use_max_modalities: bool = True,
) -> pd.DataFrame:
"""Return original format dataframe from coordinates.
Parameters:
coord: coord of individuals to reverse transform
model: model used for projection
use_approximate_inverse: matrix is not invertible when n_individuals < n_dimensions
an approximation with bias can be done by setting to ``True``. default: ``False``
use_max_modalities: for each variable, it assigns to the individual
the modality with the highest proportion (True)
or a random modality weighted by their proportion (False). default: True
Expand All @@ -36,16 +32,7 @@ def inverse_transform(
"""
random_gen = np.random.default_rng(model.seed)

# Check dimension size regarding N
n_dimensions = len(model.dummy_categorical) + len(model.original_continuous)
n_records = len(coord)

if not use_approximate_inverse and n_records < n_dimensions:
raise InvalidParameterException(
f"n_dimensions ({n_dimensions}) is greater than n_records ({n_records})."
)
# Get back scaled_values from coord with inverse matrix operation
# If n_records < n_dimensions, there will be an approximation of the inverse of V.T
scaled_values = pd.DataFrame(coord @ np.linalg.pinv(model.V.T))
# get number of continuous variables
nb_quanti = len(model.original_continuous)
Expand Down
22 changes: 2 additions & 20 deletions saiph/inverse_transform_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
from numpy.testing import assert_allclose
from pandas.testing import assert_frame_equal, assert_series_equal

from saiph.exception import InvalidParameterException
from saiph.inverse_transform import (
get_random_weighted_columns,
inverse_transform,
Expand Down Expand Up @@ -110,19 +109,6 @@ def test_undummify_when_dummies_prefix_is_in_variable_name() -> None:
assert_frame_equal(df, expected)


# wider than len df
def test_inverse_transform_raises_value_error_when_wider_than_df() -> None:
wider_df = pd.DataFrame(
{
"variable_1": ["a", "b", "c"],
"variable_2": ["ZZ", "ZZ", "WW"],
}
)
coord, model = fit_transform(wider_df)
with pytest.raises(InvalidParameterException, match=r"n_dimensions"):
inverse_transform(coord, model)


# using df with more dimensions than individuals and high column weights
# allows for a more balanced probability in modality assignment during inverse transform

Expand All @@ -144,9 +130,7 @@ def test_inverse_transform_with_ponderation() -> None:
"cont2": 1,
}
coord, model = fit_transform(df, col_weights=col_weights, seed=5)
result = inverse_transform(
coord, model, use_approximate_inverse=True, use_max_modalities=False
)
result = inverse_transform(coord, model, use_max_modalities=False)
assert_frame_equal(result, inverse_expected)


Expand All @@ -167,9 +151,7 @@ def test_inverse_transform_deterministic() -> None:
"cont2": 1,
}
coord, model = fit_transform(df, col_weights=col_weights)
result = inverse_transform(
coord, model, use_approximate_inverse=True, use_max_modalities=True
)
result = inverse_transform(coord, model, use_max_modalities=True)
assert_frame_equal(result, inverse_expected)


Expand Down
27 changes: 27 additions & 0 deletions saiph/projection_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -540,3 +540,30 @@ def test_transform_raise_error_on_wrong_columns(
model = fit(df_to_fit)
with pytest.raises(ColumnsNotFoundError):
transform(df_to_transform, model)


@pytest.mark.parametrize(
    "df_to_fit_transform",
    [
        # pca: continuous columns only
        pd.DataFrame(
            {"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9], "d": [10, 11, 12]}
        ),
        # famd: one categorical column mixed with continuous columns
        pd.DataFrame(
            {"a": ["a", "b", "b"], "b": [1, 3, 6], "c": [1, 2, 3], "d": [1, 2, 3]}
        ),
        # mca: categorical columns only
        pd.DataFrame({column: ["a", "b", "b"] for column in ("a", "b", "c", "d")}),
    ],
)
def test_fit_transform_with_horizontal_matrix(
    df_to_fit_transform: pd.DataFrame,
) -> None:
    """Check that a horizontal input yields a square coordinate matrix.

    Each parametrized frame has 3 rows and 4 columns, i.e. fewer rows than
    columns. The test only asserts that the coordinates returned by
    ``fit_transform`` have as many columns as rows.
    """
    coord, _model = fit_transform(df_to_fit_transform)
    n_rows, n_cols = coord.shape
    assert n_rows == n_cols
6 changes: 4 additions & 2 deletions saiph/reduction/famd.py
Original file line number Diff line number Diff line change
Expand Up @@ -188,7 +188,8 @@ def fit_transform(
(more weight = more importance in the axes). default: np.ones(df.shape[1])
Returns:
coord: The transformed data.
coord: The transformed data of size (n, min(n,p))
or (n, nf) if nf is specified.
model: The model for transforming new data.
"""
# If seed is None or int, we fit a Generator, else we use the one provided.
Expand Down Expand Up @@ -285,7 +286,8 @@ def transform(
model: Model computed by fit.
Returns:
coord: Coordinates of the dataframe in the fitted space.
coord: Coordinates of the dataframe in the fitted space of size (n, min(n,p))
or (n, nf) if nf is specified.
"""
df_scaled = scaler(model, df)
coord = pd.DataFrame(df_scaled @ model.V.T)
Expand Down
6 changes: 4 additions & 2 deletions saiph/reduction/mca.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,8 @@ def fit_transform(
Returns:
model: The model for transforming new data.
coord: The transformed data.
coord: The transformed data of size (n, min(n,p))
or (n, nf) if nf is specified.
"""
random_gen = (
seed if isinstance(seed, np.random.Generator) else np.random.default_rng(seed)
Expand Down Expand Up @@ -226,7 +227,8 @@ def transform(df: pd.DataFrame, model: Model) -> pd.DataFrame:
model: Model computed by fit.
Returns:
coord: Coordinates of the dataframe in the fitted space.
coord: Coordinates of the dataframe in the fitted space of size (n, min(n,p))
or (n, nf) if nf is specified.
"""
df_scaled = scaler(model, df)
coord = df_scaled @ model.D_c @ model.V.T
Expand Down
6 changes: 4 additions & 2 deletions saiph/reduction/pca.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,8 @@ def fit_transform(
Returns:
model: The model for transforming new data.
coord: The transformed data.
coord: The transformed data of size (n, min(n,p))
or (n, nf) if nf is specified.
"""
model = fit(df, nf, col_weights, seed=seed)
coord = transform(df, model)
Expand Down Expand Up @@ -171,7 +172,8 @@ def transform(df: pd.DataFrame, model: Model) -> pd.DataFrame:
model: Model computed by fit.
Returns:
coord: Coordinates of the dataframe in the fitted space.
coord: Coordinates of the dataframe in the fitted space of size (n, min(n,p))
or (n, nf) if nf is specified.
"""
df_scaled = scaler(model, df)
coord = df_scaled @ model.V.T
Expand Down

0 comments on commit 844e95e

Please sign in to comment.