Refactor SVD features preprocessing step #768

alanprior · 2026-02-06T00:07:52Z

What did scaler mean here?

alanprior · 2026-02-05T23:59:43Z

@bejaeger can we elaborate a bit more in the docstrings? Is what we do here adding, on top of the raw X, extra features that are just a compressed version of it? If so, I get it for numerical features, but it seems a bit odd for categorical ones, and not applicable to text?

alanprior · 2026-02-06T00:04:14Z

@bejaeger I find this syntax hard to read. Why do we have `next` here? I'm confused.

alanprior · 2026-02-06T00:03:28Z

@bejaeger at this step we expect many columns to already be normalized, right? And is this, in general, meant to learn a more balanced projection, but unrelated to the original features?

-Original file line number
+Diff line change
@@ -0,0 +1,3 @@
+    * Introduces AddSVDFeaturesStep as a dedicated preprocessing step for SVD feature generation
+    * Removes SVD-related functionality from ReshapeFeatureDistributionsStep
+    * Extracts utility functions to a new `tabpfn/preprocessing/steps/utils.py` module

Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		Removes "scaler" as an option for `global_transformer_name` in `PreprocessorConfig`

-Original file line number
+Diff line change
@@ Expand Up / @@ -99,7 +99,6 @@ class PreprocessorConfig: @@
         max_features_per_estimator: int = 500
         global_transformer_name: (
             Literal[
-                "scaler",
                 "svd",
                 "svd_quarter_components",
             ]
@@ Expand Down @@

-Original file line number
+Diff line change
@@ Expand Up / @@ -11,6 +11,7 @@ @@
     )
     from tabpfn.preprocessing.steps import (
         AddFingerprintFeaturesStep,
+        AddSVDFeaturesStep,
         DifferentiableZNormStep,
         EncodeCategoricalFeaturesStep,
         NanHandlingPolynomialFeaturesStep,
@@ Expand Down Expand Up / @@ -46,6 +47,7 @@ def create_preprocessing_pipeline( @@
         """Convert the ensemble configuration to a preprocessing pipeline."""
         steps: list[PreprocessingStep | StepWithModalities] = []
+        pconfig = config.preprocess_config
         use_poly_features, max_poly_features = _polynomial_feature_settings(
             config.polynomial_features
         )
@@ Expand All / @@ -59,28 +61,36 @@ def create_preprocessing_pipeline( @@
         steps.append(RemoveConstantFeaturesStep())
-        if config.preprocess_config.differentiable:
+        if pconfig.differentiable:
             steps.append(DifferentiableZNormStep())
         else:
-            steps.extend(
-                [
-                    ReshapeFeatureDistributionsStep(
-                        transform_name=config.preprocess_config.name,
-                        append_to_original=config.preprocess_config.append_original,
-                        max_features_per_estimator=config.preprocess_config.max_features_per_estimator,
-                        global_transformer_name=config.preprocess_config.global_transformer_name,
-                        apply_to_categorical=(
-                            config.preprocess_config.categorical_name == "numeric"
-                        ),
+            steps.append(
+                ReshapeFeatureDistributionsStep(
+                    transform_name=pconfig.name,
+                    append_to_original=pconfig.append_original,
+                    max_features_per_estimator=pconfig.max_features_per_estimator,
+                    apply_to_categorical=(pconfig.categorical_name == "numeric"),
+                    random_state=random_state,
+                )
+            )
+            use_global_transformer = (
+                pconfig.global_transformer_name is not None
+                and pconfig.global_transformer_name != "None"
+            )
+            if use_global_transformer:
+                steps.append(
+                    AddSVDFeaturesStep(
+                        global_transformer_name=pconfig.global_transformer_name,  # type: ignore
                         random_state=random_state,
-                    ),
-                    (
-                        EncodeCategoricalFeaturesStep(
-                            config.preprocess_config.categorical_name,
-                            random_state=random_state,
-                        )
-                    ),
-                ],
+                    )
+                )
+            steps.append(
+                EncodeCategoricalFeaturesStep(
+                    pconfig.categorical_name,
+                    random_state=random_state,
+                )
             )
         if config.add_fingerprint_feature:
@@ Expand Down @@

-Original file line number
+Diff line change
@@ Expand Up / @@ -4,6 +4,9 @@ @@
     from .add_fingerprint_features_step import (
         AddFingerprintFeaturesStep,
     )
+    from .add_svd_features_step import (
+        AddSVDFeaturesStep,
+    )
     from .differentiable_z_norm_step import (
         DifferentiableZNormStep,
     )
@@ Expand Down Expand Up / @@ -31,6 +34,7 @@ @@
     __all__ = [
         "AdaptiveQuantileTransformer",
         "AddFingerprintFeaturesStep",
+        "AddSVDFeaturesStep",
         "DifferentiableZNormStep",
         "EncodeCategoricalFeaturesStep",
         "KDITransformerWithNaN",
@@ Expand Down @@

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Refactor SVD features preprocessing step #768

Uh oh!

Diff view

Diff view

There are no files selected for viewing

alanprior Feb 6, 2026

Uh oh!

Uh oh!

alanprior Feb 5, 2026

Uh oh!

alanprior Feb 6, 2026

Uh oh!

alanprior Feb 6, 2026

Uh oh!

Uh oh!

Uh oh!

-Original file line number
+Diff line change
@@ -0,0 +1,123 @@
+    """Adds SVD features to the data."""
+    from __future__ import annotations
+    from typing import TYPE_CHECKING, Literal
+    from typing_extensions import override
+    from sklearn.decomposition import TruncatedSVD
+    from sklearn.pipeline import Pipeline
+    from sklearn.preprocessing import StandardScaler
+    from tabpfn.preprocessing.datamodel import FeatureModality, FeatureSchema
+    from tabpfn.preprocessing.pipeline_interface import PreprocessingStep
+    from tabpfn.preprocessing.steps.utils import make_standard_scaler_safe
+    from tabpfn.utils import infer_random_state
+    if TYPE_CHECKING:
+        import numpy as np
+    class AddSVDFeaturesStep(PreprocessingStep):
+        """Adds SVD features to the data."""
+        def __init__(
+            self,
+            global_transformer_name: Literal[
+                "svd", "svd_quarter_components"
+            ] = "svd_quarter_components",
+            random_state: int | np.random.Generator | None = None,
+        ):
+            """Initializes the AddSVDFeaturesStep."""
+            super().__init__()
+            self.global_transformer_name = global_transformer_name
+            self.random_state = random_state
+            self.is_no_op: bool = False
+        def num_added_features(self, n_samples: int, n_features: int) -> int:
+            """Return the number of added features."""
+            if n_features < 2:
+                return 0
+            transformer = get_svd_features_transformer(
+                self.global_transformer_name,
+                n_samples,
+                n_features,
+            )
+            return next(
+                s[1].n_components
+                for s in transformer.steps
+                if isinstance(s[1], TruncatedSVD)
+            )
+        @override
+        def _fit(
+            self,
+            X: np.ndarray,
+            feature_schema: FeatureSchema,
+        ) -> FeatureSchema:
+            n_samples, n_features = X.shape
+            if n_features < 2:
+                self.is_no_op = True
+                return feature_schema
+            static_seed, _ = infer_random_state(self.random_state)
+            transformer = get_svd_features_transformer(
+                self.global_transformer_name,
+                n_samples,
+                n_features,
+                random_state=static_seed,
+            )
+            transformer.fit(X)
+            self.transformer_ = transformer
+            self.feature_schema_updated_ = feature_schema
+            return feature_schema
+        @override
+        def _transform(
+            self, X: np.ndarray, *, is_test: bool = False
+        ) -> tuple[np.ndarray, np.ndarray | None, FeatureModality | None]:
+            if self.is_no_op:
+                return X, None, None
+            assert self.feature_schema_updated_ is not None
+            assert self.transformer_ is not None
+            return X, self.transformer_.transform(X), FeatureModality.NUMERICAL
+    def get_svd_features_transformer(
+        global_transformer_name: Literal["svd", "svd_quarter_components"],
+        n_samples: int,
+        n_features: int,
+        random_state: int | None = None,
+    ) -> Pipeline:
+        """Returns a transformer to add SVD features to the data."""
+        if global_transformer_name == "svd":
+            divisor = 2
+        elif global_transformer_name == "svd_quarter_components":
+            divisor = 4
+        else:
+            raise ValueError(f"Invalid global transformer name: {global_transformer_name}.")
+        n_components = max(1, min(n_samples // 10 + 1, n_features // divisor))
+        return Pipeline(
+            steps=[
+                (
+                    "save_standard",
+                    make_standard_scaler_safe(
+                        ("standard", StandardScaler(with_mean=False)),
+                    ),
+                ),
+                (
+                    "svd",
+                    TruncatedSVD(
+                        algorithm="arpack",
+                        n_components=n_components,
+                        random_state=random_state,
+                    ),
+                ),
+            ],
+        )

Refactor SVD features preprocessing step #768

Are you sure you want to change the base?

Uh oh!

Refactor SVD features preprocessing step #768

Uh oh!

Uh oh!

Diff view

Diff view

There are no files selected for viewing

alanprior Feb 6, 2026

Choose a reason for hiding this comment

Uh oh!

Uh oh!

alanprior Feb 5, 2026

Choose a reason for hiding this comment

Uh oh!

alanprior Feb 6, 2026

Choose a reason for hiding this comment

Uh oh!

alanprior Feb 6, 2026

Choose a reason for hiding this comment

Uh oh!

Uh oh!

Uh oh!