Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
57 commits
Select commit Hold shift + click to select a range
06af2a0
tweak
bejaeger Jan 29, 2026
d248833
add reference predictions
bejaeger Jan 29, 2026
4f71386
use np.load
bejaeger Jan 29, 2026
fb647fa
Merge branch 'main' into ben/add-pipeline-consistency-test
bejaeger Jan 29, 2026
39e632e
200 -> 100 samples
bejaeger Jan 29, 2026
a6044ab
100 -> 50 samples
bejaeger Jan 29, 2026
f3e3ee3
Introduce feature modalities, add TabPFNLabelEncoder
bejaeger Jan 28, 2026
ec14a6f
use columnmetadata
bejaeger Jan 28, 2026
c579f38
update pipeline and add pipeline consistency tests
bejaeger Jan 29, 2026
00c481e
clean up consistency tests
bejaeger Jan 29, 2026
a336463
update consistency tests
bejaeger Jan 29, 2026
fd44cb3
update references
bejaeger Jan 29, 2026
f0a81c9
Merge branch 'ben/add-pipeline-consistency-test' into ben/introduce-f…
bejaeger Jan 29, 2026
a2ec121
skip test when not backwards compatible
bejaeger Jan 29, 2026
d55bf8a
Merge branch 'main' into ben/introduce-feature-modality-dict
bejaeger Jan 29, 2026
36d3d2c
cleanup
bejaeger Jan 29, 2026
59af1f2
more cleanup
bejaeger Jan 29, 2026
07c2dcd
rename ensemble classes
bejaeger Jan 30, 2026
ae216ba
rename file
bejaeger Jan 30, 2026
3a5816e
rename and cleanup
bejaeger Jan 30, 2026
f845980
feature modalities -> feature metadata
bejaeger Jan 30, 2026
280a1de
feature metadata -> feature schema
bejaeger Jan 30, 2026
62114bd
cleanup
bejaeger Jan 30, 2026
ad853c1
Merge branch 'main' into ben/introduce-feature-modality-dict
bejaeger Jan 30, 2026
57e5eef
remove old test file
bejaeger Jan 30, 2026
20687cf
add back sklearn compatible error
bejaeger Jan 30, 2026
712f1be
fix fit_transform in kid transform
bejaeger Jan 30, 2026
1b1b81e
have kdi test for fit and fit_transform
bejaeger Jan 30, 2026
08be623
fix
bejaeger Jan 30, 2026
f55689d
cleanup and add changelog
bejaeger Jan 30, 2026
8f51ac9
improve description of feature schema updated
bejaeger Feb 1, 2026
e67258a
fix attribute name
bejaeger Feb 1, 2026
0faade8
Squash feature branch commits
bejaeger Feb 1, 2026
ba80aac
fix issue from merging
bejaeger Feb 1, 2026
7c1180b
add changelog
bejaeger Feb 1, 2026
f114e0e
cleanup
bejaeger Feb 3, 2026
9a7a2b4
revision
bejaeger Feb 3, 2026
d1eeb07
Squash feature branch commits
bejaeger Feb 1, 2026
8a4c482
fix issue from merging
bejaeger Feb 1, 2026
e36f561
add changelog
bejaeger Feb 1, 2026
0d64858
revision
bejaeger Feb 3, 2026
8e4361e
Merge branch 'ben/refactor-svd-transform' of github.com:PriorLabs/Tab…
bejaeger Feb 3, 2026
e5a2ed8
tweak
bejaeger Feb 3, 2026
650ff00
revision
bejaeger Feb 4, 2026
e731404
fix test
bejaeger Feb 4, 2026
a6103e4
revision 2
bejaeger Feb 5, 2026
dbfaef3
Squash feature branch commits
bejaeger Feb 1, 2026
af39010
fix issue from merging
bejaeger Feb 1, 2026
b8f7a1c
add changelog
bejaeger Feb 1, 2026
17a7f07
revision
bejaeger Feb 3, 2026
0b96218
tweak
bejaeger Feb 3, 2026
53b9941
Merge branch 'ben/refactor-svd-transform' of github.com:PriorLabs/Tab…
bejaeger Feb 5, 2026
6d94a4a
Make SVD preprocessing a separate step
bejaeger Feb 5, 2026
a5c9e40
Merge branch 'ben/refactor-svd-transform' of github.com:PriorLabs/Tab…
bejaeger Feb 5, 2026
4569471
Merge branch 'main' into ben/refactor-svd-transform
bejaeger Feb 5, 2026
5212211
add changelog and cleanup tests
bejaeger Feb 5, 2026
d3f7ecf
revision
bejaeger Feb 5, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions changelog/768.changed.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
* Introduces AddSVDFeaturesStep as a dedicated preprocessing step for SVD feature generation
* Removes SVD-related functionality from ReshapeFeatureDistributionsStep
* Extracts utility functions to a new `tabpfn/preprocessing/steps/utils.py` module
1 change: 1 addition & 0 deletions changelog/768.deprecated.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Removes "scaler" as an option for `global_transformer_name` in `PreprocessorConfig`
1 change: 0 additions & 1 deletion src/tabpfn/preprocessing/configs.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,6 @@ class PreprocessorConfig:
max_features_per_estimator: int = 500
global_transformer_name: (
Literal[
"scaler",
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What did scaler mean here?

"svd",
"svd_quarter_components",
]
Expand Down
48 changes: 29 additions & 19 deletions src/tabpfn/preprocessing/pipeline_factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
)
from tabpfn.preprocessing.steps import (
AddFingerprintFeaturesStep,
AddSVDFeaturesStep,
DifferentiableZNormStep,
EncodeCategoricalFeaturesStep,
NanHandlingPolynomialFeaturesStep,
Expand Down Expand Up @@ -46,6 +47,7 @@ def create_preprocessing_pipeline(
"""Convert the ensemble configuration to a preprocessing pipeline."""
steps: list[PreprocessingStep | StepWithModalities] = []

pconfig = config.preprocess_config
use_poly_features, max_poly_features = _polynomial_feature_settings(
config.polynomial_features
)
Expand All @@ -59,28 +61,36 @@ def create_preprocessing_pipeline(

steps.append(RemoveConstantFeaturesStep())

if config.preprocess_config.differentiable:
if pconfig.differentiable:
steps.append(DifferentiableZNormStep())
else:
steps.extend(
[
ReshapeFeatureDistributionsStep(
transform_name=config.preprocess_config.name,
append_to_original=config.preprocess_config.append_original,
max_features_per_estimator=config.preprocess_config.max_features_per_estimator,
global_transformer_name=config.preprocess_config.global_transformer_name,
apply_to_categorical=(
config.preprocess_config.categorical_name == "numeric"
),
steps.append(
ReshapeFeatureDistributionsStep(
transform_name=pconfig.name,
append_to_original=pconfig.append_original,
max_features_per_estimator=pconfig.max_features_per_estimator,
apply_to_categorical=(pconfig.categorical_name == "numeric"),
random_state=random_state,
)
)

use_global_transformer = (
pconfig.global_transformer_name is not None
and pconfig.global_transformer_name != "None"
)
if use_global_transformer:
steps.append(
AddSVDFeaturesStep(
global_transformer_name=pconfig.global_transformer_name, # type: ignore
random_state=random_state,
),
(
EncodeCategoricalFeaturesStep(
config.preprocess_config.categorical_name,
random_state=random_state,
)
),
],
)
)

steps.append(
EncodeCategoricalFeaturesStep(
pconfig.categorical_name,
random_state=random_state,
)
)

if config.add_fingerprint_feature:
Expand Down
4 changes: 4 additions & 0 deletions src/tabpfn/preprocessing/steps/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@
from .add_fingerprint_features_step import (
AddFingerprintFeaturesStep,
)
from .add_svd_features_step import (
AddSVDFeaturesStep,
)
from .differentiable_z_norm_step import (
DifferentiableZNormStep,
)
Expand Down Expand Up @@ -31,6 +34,7 @@
__all__ = [
"AdaptiveQuantileTransformer",
"AddFingerprintFeaturesStep",
"AddSVDFeaturesStep",
"DifferentiableZNormStep",
"EncodeCategoricalFeaturesStep",
"KDITransformerWithNaN",
Expand Down
123 changes: 123 additions & 0 deletions src/tabpfn/preprocessing/steps/add_svd_features_step.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
"""Adds SVD features to the data."""

from __future__ import annotations

from typing import TYPE_CHECKING, Literal
from typing_extensions import override

from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from tabpfn.preprocessing.datamodel import FeatureModality, FeatureSchema
from tabpfn.preprocessing.pipeline_interface import PreprocessingStep
from tabpfn.preprocessing.steps.utils import make_standard_scaler_safe
from tabpfn.utils import infer_random_state

if TYPE_CHECKING:
import numpy as np


class AddSVDFeaturesStep(PreprocessingStep):
    """Appends truncated-SVD projections of the input as extra numerical features.

    On fit, a ``StandardScaler(with_mean=False)`` + ``TruncatedSVD`` pipeline is
    fitted on ``X``; on transform, the projected components are returned as
    additional features (modality ``NUMERICAL``) alongside the untouched
    original features. With fewer than two input features the step is a no-op.
    """

    def __init__(
        self,
        global_transformer_name: Literal[
            "svd", "svd_quarter_components"
        ] = "svd_quarter_components",
        random_state: int | np.random.Generator | None = None,
    ):
        """Initializes the AddSVDFeaturesStep.

        Args:
            global_transformer_name: Component-count heuristic. ``"svd"`` caps
                components at ``n_features // 2``; ``"svd_quarter_components"``
                caps them at ``n_features // 4``.
            random_state: Seed or generator used to seed the SVD solver.
        """
        super().__init__()
        self.global_transformer_name = global_transformer_name
        self.random_state = random_state
        # Set during fit: True when the fitted data had too few features
        # for SVD, in which case transform passes the data through unchanged.
        self.is_no_op: bool = False

    def num_added_features(self, n_samples: int, n_features: int) -> int:
        """Return how many SVD components a fit on data of this shape adds.

        Args:
            n_samples: Number of rows the step would be fitted on.
            n_features: Number of columns the step would be fitted on.

        Returns:
            The number of appended SVD features; 0 when ``n_features < 2``.
        """
        if n_features < 2:
            return 0

        transformer = get_svd_features_transformer(
            self.global_transformer_name,
            n_samples,
            n_features,
        )
        # The pipeline's final step is named "svd" and is the TruncatedSVD;
        # its n_components is exactly the number of appended features.
        return transformer.named_steps["svd"].n_components

    @override
    def _fit(
        self,
        X: np.ndarray,
        feature_schema: FeatureSchema,
    ) -> FeatureSchema:
        n_samples, n_features = X.shape
        if n_features < 2:
            # SVD on a single feature is meaningless; skip at transform time.
            self.is_no_op = True
            return feature_schema

        # Reset in case this instance is re-fitted on data that previously
        # triggered the no-op path — otherwise transform would keep skipping.
        self.is_no_op = False

        static_seed, _ = infer_random_state(self.random_state)
        transformer = get_svd_features_transformer(
            self.global_transformer_name,
            n_samples,
            n_features,
            random_state=static_seed,
        )
        transformer.fit(X)

        self.transformer_ = transformer
        # The schema of the ORIGINAL features is unchanged; the added SVD
        # columns are reported via the modality returned from _transform.
        self.feature_schema_updated_ = feature_schema

        return feature_schema

    @override
    def _transform(
        self, X: np.ndarray, *, is_test: bool = False
    ) -> tuple[np.ndarray, np.ndarray | None, FeatureModality | None]:
        if self.is_no_op:
            return X, None, None

        assert self.feature_schema_updated_ is not None
        assert self.transformer_ is not None

        # Original X passes through untouched; the SVD projection is returned
        # as appended numerical features.
        return X, self.transformer_.transform(X), FeatureModality.NUMERICAL


def get_svd_features_transformer(
    global_transformer_name: Literal["svd", "svd_quarter_components"],
    n_samples: int,
    n_features: int,
    random_state: int | None = None,
) -> Pipeline:
    """Returns a transformer to add SVD features to the data.

    Args:
        global_transformer_name: ``"svd"`` caps components at half the feature
            count; ``"svd_quarter_components"`` at a quarter of it.
        n_samples: Number of rows the transformer will be fitted on.
        n_features: Number of columns the transformer will be fitted on.
        random_state: Seed for the SVD solver.

    Returns:
        A ``Pipeline`` of a NaN-safe ``StandardScaler(with_mean=False)``
        followed by a ``TruncatedSVD`` step named ``"svd"``.

    Raises:
        ValueError: If ``global_transformer_name`` is not a known option.
    """
    divisor_by_name = {"svd": 2, "svd_quarter_components": 4}
    if global_transformer_name not in divisor_by_name:
        raise ValueError(f"Invalid global transformer name: {global_transformer_name}.")
    divisor = divisor_by_name[global_transformer_name]

    # Component count is bounded by both the sample count and the feature
    # count, and is always at least one.
    sample_cap = n_samples // 10 + 1
    feature_cap = n_features // divisor
    n_components = max(1, min(sample_cap, feature_cap))

    scaler = make_standard_scaler_safe(
        ("standard", StandardScaler(with_mean=False)),
    )
    svd = TruncatedSVD(
        algorithm="arpack",
        n_components=n_components,
        random_state=random_state,
    )
    return Pipeline(steps=[("save_standard", scaler), ("svd", svd)])
Loading
Loading