diff --git a/changelog/768.changed.md b/changelog/768.changed.md
new file mode 100644
index 000000000..be31d72be
--- /dev/null
+++ b/changelog/768.changed.md
@@ -0,0 +1,3 @@
+* Introduces AddSVDFeaturesStep as a dedicated preprocessing step for SVD feature generation
+* Removes SVD-related functionality from ReshapeFeatureDistributionsStep
+* Extracts utility functions to a new `tabpfn/preprocessing/steps/utils.py` module
diff --git a/changelog/768.deprecated.md b/changelog/768.deprecated.md
new file mode 100644
index 000000000..c6b38693a
--- /dev/null
+++ b/changelog/768.deprecated.md
@@ -0,0 +1 @@
+Removes "scaler" as an option for `global_transformer_name` in `PreprocessorConfig`
diff --git a/src/tabpfn/preprocessing/configs.py b/src/tabpfn/preprocessing/configs.py
index 5d1e46f28..1a72d0326 100644
--- a/src/tabpfn/preprocessing/configs.py
+++ b/src/tabpfn/preprocessing/configs.py
@@ -99,7 +99,6 @@ class PreprocessorConfig:
     max_features_per_estimator: int = 500
     global_transformer_name: (
         Literal[
-            "scaler",
             "svd",
             "svd_quarter_components",
         ]
diff --git a/src/tabpfn/preprocessing/pipeline_factory.py b/src/tabpfn/preprocessing/pipeline_factory.py
index 454bef4e2..3d6ec6e31 100644
--- a/src/tabpfn/preprocessing/pipeline_factory.py
+++ b/src/tabpfn/preprocessing/pipeline_factory.py
@@ -11,6 +11,7 @@
 )
 from tabpfn.preprocessing.steps import (
     AddFingerprintFeaturesStep,
+    AddSVDFeaturesStep,
     DifferentiableZNormStep,
     EncodeCategoricalFeaturesStep,
     NanHandlingPolynomialFeaturesStep,
@@ -46,6 +47,7 @@ def create_preprocessing_pipeline(
     """Convert the ensemble configuration to a preprocessing pipeline."""
     steps: list[PreprocessingStep | StepWithModalities] = []

+    pconfig = config.preprocess_config
     use_poly_features, max_poly_features = _polynomial_feature_settings(
         config.polynomial_features
     )
@@ -59,28 +61,36 @@

     steps.append(RemoveConstantFeaturesStep())

-    if config.preprocess_config.differentiable:
+    if pconfig.differentiable:
         steps.append(DifferentiableZNormStep())
     else:
-        steps.extend(
-            [
-                ReshapeFeatureDistributionsStep(
-                    transform_name=config.preprocess_config.name,
-                    append_to_original=config.preprocess_config.append_original,
-                    max_features_per_estimator=config.preprocess_config.max_features_per_estimator,
-                    global_transformer_name=config.preprocess_config.global_transformer_name,
-                    apply_to_categorical=(
-                        config.preprocess_config.categorical_name == "numeric"
-                    ),
+        steps.append(
+            ReshapeFeatureDistributionsStep(
+                transform_name=pconfig.name,
+                append_to_original=pconfig.append_original,
+                max_features_per_estimator=pconfig.max_features_per_estimator,
+                apply_to_categorical=(pconfig.categorical_name == "numeric"),
+                random_state=random_state,
+            )
+        )
+
+        use_global_transformer = (
+            pconfig.global_transformer_name is not None
+            and pconfig.global_transformer_name != "None"
+        )
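+        # Editor's note (clarifying comment, not in the original patch): the
+        # string comparison mirrors the check previously performed inside
+        # ReshapeFeatureDistributionsStep, where global_transformer_name can
+        # arrive as the literal string "None".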
+        if use_global_transformer:
+            steps.append(
+                AddSVDFeaturesStep(
+                    global_transformer_name=pconfig.global_transformer_name,  # type: ignore
                     random_state=random_state,
-                ),
-                (
-                    EncodeCategoricalFeaturesStep(
-                        config.preprocess_config.categorical_name,
-                        random_state=random_state,
-                    )
-                ),
-            ],
+                )
+            )
+
+        steps.append(
+            EncodeCategoricalFeaturesStep(
+                pconfig.categorical_name,
+                random_state=random_state,
+            )
         )

     if config.add_fingerprint_feature:
diff --git a/src/tabpfn/preprocessing/steps/__init__.py b/src/tabpfn/preprocessing/steps/__init__.py
index b26125bae..9a1ab4d1d 100644
--- a/src/tabpfn/preprocessing/steps/__init__.py
+++ b/src/tabpfn/preprocessing/steps/__init__.py
@@ -4,6 +4,9 @@
 from .add_fingerprint_features_step import (
     AddFingerprintFeaturesStep,
 )
+from .add_svd_features_step import (
+    AddSVDFeaturesStep,
+)
 from .differentiable_z_norm_step import (
     DifferentiableZNormStep,
 )
@@ -31,6 +34,7 @@
 __all__ = [
     "AdaptiveQuantileTransformer",
     "AddFingerprintFeaturesStep",
+    "AddSVDFeaturesStep",
     "DifferentiableZNormStep",
     "EncodeCategoricalFeaturesStep",
     "KDITransformerWithNaN",
diff --git a/src/tabpfn/preprocessing/steps/add_svd_features_step.py b/src/tabpfn/preprocessing/steps/add_svd_features_step.py
new file mode 100644
index 000000000..fd6e31bb6
--- /dev/null
+++ b/src/tabpfn/preprocessing/steps/add_svd_features_step.py
@@ -0,0 +1,123 @@
+"""Adds SVD features to the data."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Literal
+from typing_extensions import override
+
+from sklearn.decomposition import TruncatedSVD
+from sklearn.pipeline import Pipeline
+from sklearn.preprocessing import StandardScaler
+
+from tabpfn.preprocessing.datamodel import FeatureModality, FeatureSchema
+from tabpfn.preprocessing.pipeline_interface import PreprocessingStep
+from tabpfn.preprocessing.steps.utils import make_standard_scaler_safe
+from tabpfn.utils import infer_random_state
+
+if TYPE_CHECKING:
+    import numpy as np
+
+
+class AddSVDFeaturesStep(PreprocessingStep):
+    """Adds SVD features to the data."""
+
+    def __init__(
+        self,
+        global_transformer_name: Literal[
+            "svd", "svd_quarter_components"
+        ] = "svd_quarter_components",
+        random_state: int | np.random.Generator | None = None,
+    ):
+        """Initializes the AddSVDFeaturesStep."""
+        super().__init__()
+        self.global_transformer_name = global_transformer_name
+        self.random_state = random_state
+        self.is_no_op: bool = False
+
+    def num_added_features(self, n_samples: int, n_features: int) -> int:
+        """Return the number of added features."""
+        if n_features < 2:
+            return 0
+
+        transformer = get_svd_features_transformer(
+            self.global_transformer_name,
+            n_samples,
+            n_features,
+        )
+        return next(
+            s[1].n_components
+            for s in transformer.steps
+            if isinstance(s[1], TruncatedSVD)
+        )
+
+    @override
+    def _fit(
+        self,
+        X: np.ndarray,
+        feature_schema: FeatureSchema,
+    ) -> FeatureSchema:
+        n_samples, n_features = X.shape
+        if n_features < 2:
+            self.is_no_op = True
+            return feature_schema
+
+        static_seed, _ = infer_random_state(self.random_state)
+        transformer = get_svd_features_transformer(
+            self.global_transformer_name,
+            n_samples,
+            n_features,
+            random_state=static_seed,
+        )
+        transformer.fit(X)
+
+        self.transformer_ = transformer
+        self.feature_schema_updated_ = feature_schema
+
+        return feature_schema
+
+    @override
+    def _transform(
+        self, X: np.ndarray, *, is_test: bool = False
+    ) -> tuple[np.ndarray, np.ndarray | None, FeatureModality | None]:
+        if self.is_no_op:
+            return X, None, None
+
+        assert self.feature_schema_updated_ is not None
+        assert self.transformer_ is not None
+
+        return X, self.transformer_.transform(X), FeatureModality.NUMERICAL
+
+
+def get_svd_features_transformer(
+    global_transformer_name: Literal["svd", "svd_quarter_components"],
+    n_samples: int,
+    n_features: int,
+    random_state: int | None = None,
+) -> Pipeline:
+    """Returns a transformer to add SVD features to the data."""
+    if global_transformer_name == "svd":
+        divisor = 2
+    elif global_transformer_name == "svd_quarter_components":
+        divisor = 4
+    else:
+        raise ValueError(f"Invalid global transformer name: {global_transformer_name}.")
+
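+    # Worked example (editor's illustration): for n_samples=100, n_features=20
+    # and "svd" (divisor=2), the rule below gives
+    #   n_components = max(1, min(100 // 10 + 1, 20 // 2)) = min(11, 10) = 10,
+    # while "svd_quarter_components" (divisor=4) gives min(11, 5) = 5.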
+    n_components = max(1, min(n_samples // 10 + 1, n_features // divisor))
+    return Pipeline(
+        steps=[
+            (
+                "save_standard",
+                make_standard_scaler_safe(
+                    ("standard", StandardScaler(with_mean=False)),
+                ),
+            ),
+            (
+                "svd",
+                TruncatedSVD(
+                    algorithm="arpack",
+                    n_components=n_components,
+                    random_state=random_state,
+                ),
+            ),
+        ],
+    )
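+
+
+# Usage sketch (editor's illustration, not part of the original module; the
+# API below is taken from this PR's tests):
+#
+#   import numpy as np
+#   from tabpfn.preprocessing.datamodel import (
+#       Feature, FeatureModality, FeatureSchema,
+#   )
+#
+#   X = np.random.default_rng(0).standard_normal((50, 6)).astype(np.float32)
+#   schema = FeatureSchema(
+#       features=[
+#           Feature(name=None, modality=FeatureModality.NUMERICAL)
+#           for _ in range(6)
+#       ]
+#   )
+#   step = AddSVDFeaturesStep(global_transformer_name="svd", random_state=0)
+#   result = step.fit_transform(X, schema)
+#   # result.X is unchanged; the SVD columns arrive in result.X_added and the
+#   # pipeline appends them as numerical features.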
diff --git a/src/tabpfn/preprocessing/steps/reshape_feature_distribution_step.py b/src/tabpfn/preprocessing/steps/reshape_feature_distribution_step.py
index c8ee07754..06773b9bd 100644
--- a/src/tabpfn/preprocessing/steps/reshape_feature_distribution_step.py
+++ b/src/tabpfn/preprocessing/steps/reshape_feature_distribution_step.py
@@ -3,28 +3,23 @@
 from __future__ import annotations

 import contextlib
-from copy import deepcopy
-from typing import TYPE_CHECKING, Literal, TypeVar
+from typing import TYPE_CHECKING, Literal
 from typing_extensions import override

 import numpy as np
 from scipy.stats import shapiro
 from sklearn.compose import ColumnTransformer, make_column_selector
-from sklearn.decomposition import TruncatedSVD
-from sklearn.impute import SimpleImputer
 from sklearn.pipeline import FeatureUnion, Pipeline
 from sklearn.preprocessing import (
     FunctionTransformer,
     MinMaxScaler,
     PowerTransformer,
     RobustScaler,
-    StandardScaler,
 )

 from tabpfn.preprocessing.datamodel import FeatureModality, FeatureSchema
 from tabpfn.preprocessing.pipeline_interface import (
     PreprocessingStep,
-    PreprocessingStepResult,
 )
 from tabpfn.preprocessing.steps.adaptive_quantile_transformer import (
     AdaptiveQuantileTransformer,
 )
@@ -35,64 +30,19 @@
 )
 from tabpfn.preprocessing.steps.safe_power_transformer import SafePowerTransformer
 from tabpfn.preprocessing.steps.squashing_scaler_transformer import SquashingScaler
+from tabpfn.preprocessing.steps.utils import (
+    add_safe_standard_to_safe_power_without_standard,
+)
 from tabpfn.utils import infer_random_state

 if TYPE_CHECKING:
     from sklearn.base import TransformerMixin

-T = TypeVar("T")
-
-
-def _identity(x: T) -> T:
-    return x
-
-
-def _inf_to_nan_func(x: np.ndarray) -> np.ndarray:
-    return np.nan_to_num(x, nan=np.nan, neginf=np.nan, posinf=np.nan)
-

 def _exp_minus_1(x: np.ndarray) -> np.ndarray:
     return np.exp(x) - 1  # type: ignore


-inf_to_nan_transformer = FunctionTransformer(
-    func=_inf_to_nan_func,
-    inverse_func=_identity,
-    check_inverse=False,
-)
-nan_impute_transformer = SimpleImputer(
-    missing_values=np.nan,
-    strategy="mean",
-    # keep empty features for inverse to function
-    keep_empty_features=True,
-)
-nan_impute_transformer.inverse_transform = (
-    _identity  # do not inverse np.nan values.  # type: ignore
-)
-
-_make_finite_transformer = [
-    ("inf_to_nan", inf_to_nan_transformer),
-    ("nan_impute", nan_impute_transformer),
-]
-
-
-def _make_standard_scaler_safe(
-    _name_scaler_tuple: tuple[str, TransformerMixin],
-    *,
-    no_name: bool = False,
-) -> Pipeline:
-    # Make sure that all data that enters and leaves a scaler is finite.
-    # This is needed in edge cases where, for example, a division by zero
-    # occurs while scaling or when the input contains not number values.
-    return Pipeline(
-        steps=[
-            *[(n + "_pre ", deepcopy(t)) for n, t in _make_finite_transformer],
-            ("placeholder", _name_scaler_tuple) if no_name else _name_scaler_tuple,
-            *[(n + "_post", deepcopy(t)) for n, t in _make_finite_transformer],
-        ],
-    )
-
-
 def _make_box_cox_safe(input_transformer: TransformerMixin | Pipeline) -> Pipeline:
     """Make box cox save.
@@ -108,20 +58,6 @@ def _make_box_cox_safe(input_transformer: TransformerMixin | Pipeli
     )


-def _add_safe_standard_to_safe_power_without_standard(
-    input_transformer: TransformerMixin,
-) -> Pipeline:
-    """In edge cases PowerTransformer can create inf values and similar. Then, the post
-    standard scale crashes. This fixes this issue.
-    """
-    return Pipeline(
-        steps=[
-            ("input_transformer", input_transformer),
-            ("standard", _make_standard_scaler_safe(("standard", StandardScaler()))),
-        ],
-    )
-
-
 def _skew(x: np.ndarray) -> float:
     """skewness: 3 * (mean - median) / std."""
     return float(3 * (np.nanmean(x, 0) - np.nanmedian(x, 0)) / np.std(x, 0))
@@ -134,7 +70,6 @@ class ReshapeFeatureDistributionsStep(PreprocessingStep):
     1. Handles feature subsampling when too many features exist
     2. Applies different logic based on `apply_to_categorical` flag
     3. Can append transformed features to originals (`append_to_original`)
-    4. Can add global features like SVD components

     # TODO(ben): Add separate PreprocessingStep's for all of the above
     # so that we can register this with modalities
@@ -195,7 +130,6 @@ def __init__(
         apply_to_categorical: bool = False,
         append_to_original: bool | Literal["auto"] = False,
         max_features_per_estimator: int = 500,
-        global_transformer_name: str | None = None,
         random_state: int | np.random.Generator | None = None,
     ):
         super().__init__()
@@ -208,10 +142,9 @@
         self.append_to_original = append_to_original
         self.random_state = random_state
         self.max_features_per_estimator = max_features_per_estimator
-        self.global_transformer_name = global_transformer_name
         self.transformer_: Pipeline | ColumnTransformer | None = None

-    def _create_transformers_and_new_schema(  # noqa: PLR0912
+    def _create_transformers_and_new_schema(
         self,
         n_samples: int,
         n_features: int,
@@ -245,22 +178,6 @@ def _create_transformers_and_new_schema(  # noqa: PLR0912
         else:
             self.subsampled_features_ = np.arange(n_features)

-        if (
-            self.global_transformer_name is not None
-            and self.global_transformer_name != "None"
-            and not (
-                self.global_transformer_name in ["svd", "svd_quarter_components"]
-                and n_features < 2
-            )
-        ):
-            global_transformer_ = get_all_global_transformers(
-                n_samples,
-                n_features,
-                random_state=static_seed,
-            )[self.global_transformer_name]
-        else:
-            global_transformer_ = None
-
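+        # Editor's note: global SVD features are now produced by the dedicated
+        # AddSVDFeaturesStep (see add_svd_features_step.py), so no global
+        # transformer is selected here anymore.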
         all_feats_ix = list(range(n_features))

         transformers = []
@@ -328,38 +245,16 @@
             sparse_threshold=0.0,  # No sparse
         )

-        # Apply a global transformer which accepts the entire dataset instead of
-        # one column
-        # NOTE: We assume global_transformer does not destroy the semantic meaning of
-        # categorical_features_.
-        n_new_global_features = 0
-        if global_transformer_:
-            transformer = Pipeline(
-                [
-                    ("preprocess", transformer),
-                    ("global_transformer", global_transformer_),
-                ],
-            )
-            # TODO: Find better way to get number of global features added
-            n_new_global_features = next(
-                s[1].n_components
-                for s in global_transformer_.transformer_list[1][1].steps
-                if isinstance(s[1], TruncatedSVD)
-            )
-
         self.transformer_ = transformer
-        self.n_new_global_features_ = n_new_global_features

         # Compute output feature count for modality update
-        # Include: base features + appended transformed (if append_to_original) + SVD
+        # Include: base features + appended transformed (if append_to_original)
         n_output_features = (
             n_features + len(trans_ixs) if self.append_to_original else n_features
         )
-        n_output_features += n_new_global_features

         # Build the new metadata with updated categorical indices
         # Non-categorical indices become numerical
-        # SVD features are numerical and appended at the end
         new_schema = FeatureSchema.from_only_categorical_indices(
             categorical_indices=sorted(cat_ix),
             num_columns=n_output_features,
@@ -383,23 +278,6 @@ def _fit(
         self.transformer_ = transformer
         return output_schema

-    @override
-    def fit_transform(
-        self,
-        X: np.ndarray,
-        feature_schema: FeatureSchema,
-    ) -> PreprocessingStepResult:
-        n_samples, n_features = X.shape
-        transformer, output_schema = self._create_transformers_and_new_schema(
-            n_samples,
-            n_features,
-            feature_schema,
-        )
-        Xt = transformer.fit_transform(X[:, self.subsampled_features_])
-        self.transformer_ = transformer
-        self.feature_schema_updated_ = output_schema
-        return PreprocessingStepResult(X=Xt, feature_schema=output_schema)  # type: ignore[arg-type]
-
     @override
     def _transform(
         self, X: np.ndarray, *, is_test: bool = False
@@ -408,82 +286,6 @@
         return self.transformer_.transform(X[:, self.subsampled_features_]), None, None  # type: ignore


-def get_all_global_transformers(
-    num_examples: int,
-    num_features: int,
-    random_state: int | None = None,
-) -> dict[str, FeatureUnion | Pipeline]:
-    """Returns a dictionary of global transformers to transform the data."""
-    return {
-        "scaler": _make_standard_scaler_safe(("standard", StandardScaler())),
-        "svd": FeatureUnion(
-            [
-                # default FunctionTransformer yields the identity function
-                ("passthrough", FunctionTransformer()),
-                (
-                    "svd",
-                    Pipeline(
-                        steps=[
-                            (
-                                "save_standard",
-                                _make_standard_scaler_safe(
-                                    ("standard", StandardScaler(with_mean=False)),
-                                ),
-                            ),
-                            (
-                                "svd",
-                                TruncatedSVD(
-                                    algorithm="arpack",
-                                    n_components=max(
-                                        1,
-                                        min(
-                                            num_examples // 10 + 1,
-                                            num_features // 2,
-                                        ),
-                                    ),
-                                    random_state=random_state,
-                                ),
-                            ),
-                        ],
-                    ),
-                ),
-            ],
-        ),
-        "svd_quarter_components": FeatureUnion(
-            [
-                ("passthrough", FunctionTransformer(func=_identity)),
-                (
-                    "svd",
-                    Pipeline(
-                        steps=[
-                            (
-                                "save_standard",
-                                _make_standard_scaler_safe(
-                                    ("standard", StandardScaler(with_mean=False)),
-                                ),
-                            ),
-                            (
-                                "svd",
-                                TruncatedSVD(
-                                    algorithm="arpack",
-                                    n_components=max(
-                                        1,
-                                        min(
-                                            num_examples // 10 + 1,
-                                            num_features // 4,
-                                        ),
-                                    ),
-                                    random_state=random_state,
-                                ),
-                            ),
-                        ],
-                    ),
-                ),
-            ],
-        ),
-    }
-
-
 def get_adaptive_preprocessors(
     num_examples: int = 100,
     random_state: int | None = None,
@@ -513,7 +315,7 @@
             (
                 "skewed_pos",
                 _make_box_cox_safe(
-                    _add_safe_standard_to_safe_power_without_standard(
+                    add_safe_standard_to_safe_power_without_standard(
                         SafePowerTransformer(
                             standardize=False,
                             method="box-cox",
@@ -524,7 +326,7 @@ def get_adaptive_preprocessors(
             ),
             (
                 "skewed",
-                _add_safe_standard_to_safe_power_without_standard(
+                add_safe_standard_to_safe_power_without_standard(
                     SafePowerTransformer(
                         standardize=False,
                         method="yeo-johnson",
@@ -567,19 +369,19 @@ def get_all_reshape_feature_distribution_preprocessors(
 ) -> dict[str, TransformerMixin | Pipeline]:
     """Returns a dictionary of preprocessing to preprocess the data."""
     all_preprocessors = {
-        "power": _add_safe_standard_to_safe_power_without_standard(
+        "power": add_safe_standard_to_safe_power_without_standard(
             PowerTransformer(standardize=False),
         ),
-        "safepower": _add_safe_standard_to_safe_power_without_standard(
+        "safepower": add_safe_standard_to_safe_power_without_standard(
             SafePowerTransformer(standardize=False),
         ),
         "power_box": _make_box_cox_safe(
-            _add_safe_standard_to_safe_power_without_standard(
+            add_safe_standard_to_safe_power_without_standard(
                 PowerTransformer(standardize=False, method="box-cox"),
             ),
         ),
         "safepower_box": _make_box_cox_safe(
-            _add_safe_standard_to_safe_power_without_standard(
+            add_safe_standard_to_safe_power_without_standard(
                 SafePowerTransformer(standardize=False, method="box-cox"),
             ),
         ),
diff --git a/src/tabpfn/preprocessing/steps/utils.py b/src/tabpfn/preprocessing/steps/utils.py
new file mode 100644
index 000000000..6bcbd0f1d
--- /dev/null
+++ b/src/tabpfn/preprocessing/steps/utils.py
@@ -0,0 +1,78 @@
+"""Utility functions for preprocessing steps."""
+
+from __future__ import annotations
+
+from copy import deepcopy
+from typing import TYPE_CHECKING, TypeVar
+
+import numpy as np
+from sklearn.impute import SimpleImputer
+from sklearn.pipeline import Pipeline
+from sklearn.preprocessing import FunctionTransformer, StandardScaler
+
+if TYPE_CHECKING:
+    from sklearn.base import TransformerMixin
+
+T = TypeVar("T")
+
+
+def _identity(x: T) -> T:
+    return x
+
+
+def _inf_to_nan_func(x: np.ndarray) -> np.ndarray:
+    return np.nan_to_num(x, nan=np.nan, neginf=np.nan, posinf=np.nan)
+
+
+inf_to_nan_transformer = FunctionTransformer(
+    func=_inf_to_nan_func,
+    inverse_func=_identity,
+    check_inverse=False,
+)
+nan_impute_transformer = SimpleImputer(
+    missing_values=np.nan,
+    strategy="mean",
+    # keep empty features so that inverse_transform keeps working
+    keep_empty_features=True,
+)
+nan_impute_transformer.inverse_transform = (
+    _identity  # do not inverse np.nan values.  # type: ignore
+)
+
+_make_finite_transformer = [
+    ("inf_to_nan", inf_to_nan_transformer),
+    ("nan_impute", nan_impute_transformer),
+]
+
+
+def add_safe_standard_to_safe_power_without_standard(
+    input_transformer: TransformerMixin,
+) -> Pipeline:
+    """In edge cases, PowerTransformer can create inf values and similar, which
+    make the subsequent standard scaling crash. This wrapper fixes that.
+    """
+    return Pipeline(
+        steps=[
+            ("input_transformer", input_transformer),
+            ("standard", make_standard_scaler_safe(("standard", StandardScaler()))),
+        ],
+    )
+
+
+def make_standard_scaler_safe(
+    _name_scaler_tuple: tuple[str, TransformerMixin],
+    *,
+    no_name: bool = False,
+) -> Pipeline:
+    """Make sure that all data that enters and leaves a scaler is finite.
+
+    This is needed in edge cases where, for example, a division by zero
+    occurs while scaling or when the input contains non-numeric values.
+ """ + return Pipeline( + steps=[ + *[(n + "_pre ", deepcopy(t)) for n, t in _make_finite_transformer], + ("placeholder", _name_scaler_tuple) if no_name else _name_scaler_tuple, + *[(n + "_post", deepcopy(t)) for n, t in _make_finite_transformer], + ], + ) diff --git a/tests/test_preprocessing/test_add_fingerprint_features_step.py b/tests/test_preprocessing/test_add_fingerprint_features_step.py index a96be1ba0..c9597a4e2 100644 --- a/tests/test_preprocessing/test_add_fingerprint_features_step.py +++ b/tests/test_preprocessing/test_add_fingerprint_features_step.py @@ -5,6 +5,7 @@ import numpy as np import torch +from tabpfn.preprocessing import PreprocessingPipeline from tabpfn.preprocessing.datamodel import Feature, FeatureModality, FeatureSchema from tabpfn.preprocessing.steps.add_fingerprint_features_step import ( AddFingerprintFeaturesStep, @@ -122,3 +123,16 @@ def test__fit__does_not_modify_metadata() -> None: # Metadata should be unchanged - same number of columns assert result_schema.num_columns == 3 assert result_schema.indices_for(FeatureModality.NUMERICAL) == [0, 1, 2] + + +def test__in_pipeline__returns_added_columns() -> None: + """Test that the step returns added columns when used in a pipeline.""" + data = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]], dtype=np.float32) + schema = _get_schema(num_columns=3) + + step = AddFingerprintFeaturesStep(random_state=42) + pipeline = PreprocessingPipeline(steps=[(step, {FeatureModality.NUMERICAL})]) + result = pipeline.fit_transform(data, schema) + + assert result.feature_schema.num_columns == 4 + assert result.X.shape == (2, 4) diff --git a/tests/test_preprocessing/test_add_svd_features_step.py b/tests/test_preprocessing/test_add_svd_features_step.py new file mode 100644 index 000000000..c35859cf4 --- /dev/null +++ b/tests/test_preprocessing/test_add_svd_features_step.py @@ -0,0 +1,218 @@ +"""Tests for AddSVDFeaturesStep.""" + +from __future__ import annotations + +import numpy as np +import pytest + +from tabpfn.preprocessing import PreprocessingPipeline +from tabpfn.preprocessing.datamodel import Feature, FeatureModality, FeatureSchema +from tabpfn.preprocessing.steps.add_svd_features_step import ( + AddSVDFeaturesStep, + get_svd_features_transformer, +) + + +def _get_schema(num_columns: int) -> FeatureSchema: + """Create a schema with all numerical features.""" + return FeatureSchema( + features=[ + Feature(name=None, modality=FeatureModality.NUMERICAL) + for _ in range(num_columns) + ] + ) + + +def _get_test_data( + n_samples: int = 100, n_features: int = 10, seed: int = 42 +) -> np.ndarray: + """Create test data with some structure for SVD to capture.""" + rng = np.random.default_rng(seed) + # Create data with some latent structure + latent = rng.standard_normal((n_samples, 3)) + weights = rng.standard_normal((3, n_features)) + noise = rng.standard_normal((n_samples, n_features)) * 0.1 + with np.errstate(all="ignore"): + return (latent @ weights + noise).astype(np.float32) + + +def test__transform__returns_x_unchanged_and_svd_in_added_columns() -> None: + """Test that _transform returns X unchanged, SVD features in added_columns.""" + data = _get_test_data(n_samples=50, n_features=6) + step = AddSVDFeaturesStep(global_transformer_name="svd", random_state=42) + step._fit(data, _get_schema(num_columns=6)) + result, added_cols, modality = step._transform(data) + + # X should be returned unchanged + assert isinstance(result, np.ndarray) + assert result.shape == data.shape + np.testing.assert_array_equal(result, data) + + # SVD features 
diff --git a/tests/test_preprocessing/test_add_fingerprint_features_step.py b/tests/test_preprocessing/test_add_fingerprint_features_step.py
index a96be1ba0..c9597a4e2 100644
--- a/tests/test_preprocessing/test_add_fingerprint_features_step.py
+++ b/tests/test_preprocessing/test_add_fingerprint_features_step.py
@@ -5,6 +5,7 @@
 import numpy as np
 import torch

+from tabpfn.preprocessing import PreprocessingPipeline
 from tabpfn.preprocessing.datamodel import Feature, FeatureModality, FeatureSchema
 from tabpfn.preprocessing.steps.add_fingerprint_features_step import (
     AddFingerprintFeaturesStep,
 )
@@ -122,3 +123,16 @@ def test__fit__does_not_modify_metadata() -> None:
     # Metadata should be unchanged - same number of columns
     assert result_schema.num_columns == 3
     assert result_schema.indices_for(FeatureModality.NUMERICAL) == [0, 1, 2]
+
+
+def test__in_pipeline__returns_added_columns() -> None:
+    """Test that the step returns added columns when used in a pipeline."""
+    data = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]], dtype=np.float32)
+    schema = _get_schema(num_columns=3)
+
+    step = AddFingerprintFeaturesStep(random_state=42)
+    pipeline = PreprocessingPipeline(steps=[(step, {FeatureModality.NUMERICAL})])
+    result = pipeline.fit_transform(data, schema)
+
+    assert result.feature_schema.num_columns == 4
+    assert result.X.shape == (2, 4)
diff --git a/tests/test_preprocessing/test_add_svd_features_step.py b/tests/test_preprocessing/test_add_svd_features_step.py
new file mode 100644
index 000000000..c35859cf4
--- /dev/null
+++ b/tests/test_preprocessing/test_add_svd_features_step.py
@@ -0,0 +1,218 @@
+"""Tests for AddSVDFeaturesStep."""
+
+from __future__ import annotations
+
+import numpy as np
+import pytest
+
+from tabpfn.preprocessing import PreprocessingPipeline
+from tabpfn.preprocessing.datamodel import Feature, FeatureModality, FeatureSchema
+from tabpfn.preprocessing.steps.add_svd_features_step import (
+    AddSVDFeaturesStep,
+    get_svd_features_transformer,
+)
+
+
+def _get_schema(num_columns: int) -> FeatureSchema:
+    """Create a schema with all numerical features."""
+    return FeatureSchema(
+        features=[
+            Feature(name=None, modality=FeatureModality.NUMERICAL)
+            for _ in range(num_columns)
+        ]
+    )
+
+
+def _get_test_data(
+    n_samples: int = 100, n_features: int = 10, seed: int = 42
+) -> np.ndarray:
+    """Create test data with some structure for SVD to capture."""
+    rng = np.random.default_rng(seed)
+    # Create data with some latent structure
+    latent = rng.standard_normal((n_samples, 3))
+    weights = rng.standard_normal((3, n_features))
+    noise = rng.standard_normal((n_samples, n_features)) * 0.1
+    with np.errstate(all="ignore"):
+        return (latent @ weights + noise).astype(np.float32)
+
+
+def test__transform__returns_x_unchanged_and_svd_in_added_columns() -> None:
+    """Test that _transform returns X unchanged, SVD features in added_columns."""
+    data = _get_test_data(n_samples=50, n_features=6)
+    step = AddSVDFeaturesStep(global_transformer_name="svd", random_state=42)
+    step._fit(data, _get_schema(num_columns=6))
+    result, added_cols, modality = step._transform(data)
+
+    # X should be returned unchanged
+    assert isinstance(result, np.ndarray)
+    assert result.shape == data.shape
+    np.testing.assert_array_equal(result, data)
+
+    # SVD features should be in added_columns
+    assert added_cols is not None
+    assert added_cols.shape[0] == data.shape[0]
+    assert added_cols.shape[1] > 0  # Should have some SVD components
+    assert modality == FeatureModality.NUMERICAL
+
+
+def test__transform__with_svd_quarter_components() -> None:
+    """Test that svd_quarter_components produces fewer components than svd."""
+    data = _get_test_data(n_samples=100, n_features=20)
+
+    step_svd = AddSVDFeaturesStep(global_transformer_name="svd", random_state=42)
+    step_svd._fit(data, _get_schema(num_columns=20))
+    _, added_svd, _ = step_svd._transform(data)
+
+    step_quarter = AddSVDFeaturesStep(
+        global_transformer_name="svd_quarter_components", random_state=42
+    )
+    step_quarter._fit(data, _get_schema(num_columns=20))
+    _, added_quarter, _ = step_quarter._transform(data)
+
+    assert added_svd is not None
+    assert added_quarter is not None
+    # Quarter components should have fewer or equal columns
+    assert added_quarter.shape[1] <= added_svd.shape[1]
+
+
+def test__transform__with_single_feature_returns_unchanged() -> None:
+    """Test that single feature data is returned unchanged without SVD."""
+    data = np.array([[1.0], [2.0], [3.0], [4.0]], dtype=np.float32)
+    schema = _get_schema(num_columns=1)
+    step = AddSVDFeaturesStep(global_transformer_name="svd", random_state=42)
+    updated_schema = step._fit(data, schema)
+
+    # Schema should be unchanged
+    assert updated_schema.num_columns == 1
+
+    # Transformer should not be set for single feature
+    assert not hasattr(step, "transformer_") or step.transformer_ is None
+
+
+def test__fit_transform__returns_added_columns() -> None:
+    """Test fit_transform returns X unchanged with SVD in added_columns."""
+    data = _get_test_data(n_samples=50, n_features=6)
+    schema = _get_schema(num_columns=6)
+
+    step = AddSVDFeaturesStep(global_transformer_name="svd", random_state=42)
+    result = step.fit_transform(data, schema)
+
+    # X should be unchanged
+    assert result.X.shape == data.shape
+    np.testing.assert_array_equal(result.X, data)
+
+    # Schema should be unchanged (pipeline handles adding SVD)
+    assert result.feature_schema.num_columns == 6
+
+    # SVD features should be in added_columns
+    assert result.X_added is not None
+    assert result.X_added.shape[0] == data.shape[0]
+    assert result.modality_added == FeatureModality.NUMERICAL
+
+
+def test__transform__returns_added_columns_after_fit() -> None:
+    """Test transform returns X unchanged with SVD in added_columns."""
+    data_train = _get_test_data(n_samples=50, n_features=6, seed=42)
+    data_test = _get_test_data(n_samples=20, n_features=6, seed=123)
+    schema = _get_schema(num_columns=6)
+
+    step = AddSVDFeaturesStep(global_transformer_name="svd", random_state=42)
+    step.fit_transform(data_train, schema)
+    result = step.transform(data_test)
+
+    # X should be unchanged
+    assert result.X.shape == data_test.shape
+
+    # SVD features should be in added_columns
+    assert result.X_added is not None
+    assert result.X_added.shape[0] == data_test.shape[0]
+
+
+def test__num_added_features__returns_correct_count() -> None:
+    """Test num_added_features returns the expected count."""
+    step = AddSVDFeaturesStep(global_transformer_name="svd", random_state=42)
+
+    # For n_features=10, n_samples=100:
+    # n_components = min(100//10+1, 10//2) = min(11, 5) = 5
+    result = step.num_added_features(n_features=10, n_samples=100)
+    assert result == 5
+
+    # For n_features=1 (less than 2), should return 0
+    result_single = step.num_added_features(n_features=1, n_samples=100)
+    assert result_single == 0
+
+
+def test__in_pipeline__returns_added_columns() -> None:
+    """Test that the step returns added columns when used in a pipeline."""
+    data = _get_test_data(n_samples=50, n_features=6)
+    schema = _get_schema(num_columns=6)
+
+    step = AddSVDFeaturesStep(global_transformer_name="svd", random_state=42)
+    pipeline = PreprocessingPipeline(steps=[(step, {FeatureModality.NUMERICAL})])
+    result = pipeline.fit_transform(data, schema)
+
+    # Should have original columns plus SVD columns
+    assert result.feature_schema.num_columns > 6
+    assert result.X.shape[1] > 6
+    assert result.X.shape[0] == data.shape[0]
+
+
+def test__in_pipeline__transform_consistent_with_fit_transform() -> None:
+    """Test that transform produces same shape as fit_transform."""
+    data_train = _get_test_data(n_samples=50, n_features=6, seed=42)
+    data_test = _get_test_data(n_samples=20, n_features=6, seed=123)
+    schema = _get_schema(num_columns=6)
+
+    step = AddSVDFeaturesStep(global_transformer_name="svd", random_state=42)
+    pipeline = PreprocessingPipeline(steps=[(step, {FeatureModality.NUMERICAL})])
+
+    fit_result = pipeline.fit_transform(data_train, schema)
+    transform_result = pipeline.transform(data_test)
+
+    assert fit_result.X.shape[1] == transform_result.X.shape[1]
+    assert (
+        fit_result.feature_schema.num_columns
+        == transform_result.feature_schema.num_columns
+    )
+
+
+def test__in_pipeline__with_no_modality_selection() -> None:
+    """Test that the step adds columns when registered without explicit modalities."""
+    data = _get_test_data(n_samples=50, n_features=6)
+    schema = _get_schema(num_columns=6)
+
+    step = AddSVDFeaturesStep(global_transformer_name="svd", random_state=42)
+    pipeline = PreprocessingPipeline(steps=[step])
+    result = pipeline.fit_transform(data, schema)
+
+    # Should have original columns plus SVD columns
+    assert result.feature_schema.num_columns > 6
+    assert result.X.shape[1] > 6
+    assert result.X.shape[0] == data.shape[0]
+
+
+def test__random_state__produces_reproducible_results() -> None:
+    """Test that same random_state produces identical results."""
+    data = _get_test_data(n_samples=50, n_features=6)
+    schema = _get_schema(num_columns=6)
+
+    step1 = AddSVDFeaturesStep(global_transformer_name="svd", random_state=42)
+    result1 = step1.fit_transform(data, schema)
+
+    step2 = AddSVDFeaturesStep(global_transformer_name="svd", random_state=42)
+    result2 = step2.fit_transform(data, schema)
+
+    assert result1.X_added is not None
+    assert result2.X_added is not None
+    np.testing.assert_array_almost_equal(result1.X_added, result2.X_added)
+
+
+def test__get_svd_features_transformer__invalid_name_raises() -> None:
+    """Test that invalid transformer name raises ValueError."""
+    with pytest.raises(ValueError, match="Invalid global transformer name"):
+        # Pass a name outside the allowed Literal values
+        get_svd_features_transformer(
+            "invalid_name",  # type: ignore[arg-type]
+            n_samples=100,
+            n_features=10,
+        )
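+
+
+def test__num_added_features__matches_transform_output() -> None:
+    """Editor-added consistency sketch (not in the original patch): the
+    predicted feature count should match the width actually produced."""
+    data = _get_test_data(n_samples=50, n_features=6)
+    step = AddSVDFeaturesStep(global_transformer_name="svd", random_state=0)
+    step._fit(data, _get_schema(num_columns=6))
+    _, added_cols, _ = step._transform(data)
+
+    assert added_cols is not None
+    assert added_cols.shape[1] == step.num_added_features(
+        n_samples=50, n_features=6
+    )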
diff --git a/tests/test_preprocessing/test_preprocessing_steps.py b/tests/test_preprocessing/test_preprocessing_steps.py
index f4856a9dd..cd85c25a7 100644
--- a/tests/test_preprocessing/test_preprocessing_steps.py
+++ b/tests/test_preprocessing/test_preprocessing_steps.py
@@ -46,7 +46,6 @@ def _get_preprocessing_steps() -> list[Callable[..., PreprocessingStep],]:
         ReshapeFeatureDistributionsStep,
         transform_name="none",
         append_to_original=True,
-        global_transformer_name="svd",
         apply_to_categorical=False,
     )
 ]
diff --git a/tests/test_preprocessing/test_reshape_feature_distribution_step.py b/tests/test_preprocessing/test_reshape_feature_distribution_step.py
index 49cc48419..18ba53996 100644
--- a/tests/test_preprocessing/test_reshape_feature_distribution_step.py
+++ b/tests/test_preprocessing/test_reshape_feature_distribution_step.py
@@ -7,7 +7,6 @@
 import numpy as np
 import pytest

-from tabpfn.preprocessing import PreprocessingPipeline
 from tabpfn.preprocessing.datamodel import Feature, FeatureModality, FeatureSchema
 from tabpfn.preprocessing.steps import ReshapeFeatureDistributionsStep

@@ -421,7 +420,6 @@ def test__preprocessing_large_dataset():
         apply_to_categorical=False,
         append_to_original=False,
         max_features_per_estimator=500,
-        global_transformer_name=None,
         random_state=42,
     )

@@ -478,73 +476,3 @@ def test__reshape_step_append_original_logic(

     assert result.X.shape[0] == num_samples
     assert result.X.shape[1] == expected_output_features
-
-
-def test__reshape__with_global_transformer():
-    """Test that the step works with a global transformer.
-
-    Note: SVD global transformer uses FeatureUnion with passthrough + SVD,
-    so output has original_features + svd_components columns.
-    SVD n_components = min(n_samples//10+1, n_features//2) = min(11, 4) = 4
-
-    Known limitation: Metadata tracking for categorical indices is not fully
-    accurate when using global transformers with ColumnTransformer reordering.
-    This will be addressed when refactoring to centralized column orchestration.
-    """
-    rng = np.random.default_rng(42)
-    n_samples, n_features = 100, 8
-    cat_indices = [1, 5]
-    n_svd_components = 4  # min(100//10+1, 8//2) = min(11, 4)
-    n_output_features = n_features + n_svd_components
-
-    X = _make_test_data(rng, n_samples, n_features, cat_indices)
-    feature_modalities = _make_metadata(n_features, cat_indices)
-
-    step = ReshapeFeatureDistributionsStep(
-        transform_name="none",
-        apply_to_categorical=False,
-        append_to_original=False,
-        global_transformer_name="svd",
-        random_state=42,
-    )
-    result = step.fit_transform(X, feature_modalities)
-
-    # SVD adds components via FeatureUnion (passthrough + SVD)
-    assert result.X.shape == (n_samples, n_output_features)
-    # Verify we have some categorical and numerical indices tracked
-    # (exact positions may vary due to ColumnTransformer reordering)
-    assert len(result.feature_schema.indices_for(FeatureModality.CATEGORICAL)) == len(
-        cat_indices
-    )
-    assert len(result.feature_schema.indices_for(FeatureModality.NUMERICAL)) > 0
-
-
-# TODO: Split ReshapeDistributionStep into multiple steps so that this pipeline passes
-def test__pipeline_with_reshape__with_indizes():
-    rng = np.random.default_rng(42)
-    n_samples, n_features = 100, 8
-    cat_indices = [1, 5]
-    X = _make_test_data(rng, n_samples, n_features, cat_indices)
-
-    step = ReshapeFeatureDistributionsStep(
-        transform_name="none",
-        apply_to_categorical=False,
-        append_to_original=False,
-        global_transformer_name="svd_quarter_components",
-        random_state=42,
-    )
-    pipeline = PreprocessingPipeline(
-        steps=[(step, {FeatureModality.NUMERICAL, FeatureModality.CATEGORICAL})]
-    )
-    # Provide complete metadata covering all 8 features
-    numerical_indices = [i for i in range(n_features) if i not in cat_indices]
-    features = [
-        Feature(name=None, modality=FeatureModality.NUMERICAL)
-        for _ in numerical_indices
-    ] + [Feature(name=None, modality=FeatureModality.CATEGORICAL) for _ in cat_indices]
-    schema = FeatureSchema(features=features)  # type: ignore
-    with pytest.raises(
-        ValueError,
-        match="Steps registered with modalities must return the same number of columns",
-    ):
-        _ = pipeline.fit_transform(X, schema)