-
Notifications
You must be signed in to change notification settings - Fork 558
Refactor SVD features preprocessing step #768
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
06af2a0
d248833
4f71386
fb647fa
39e632e
a6044ab
f3e3ee3
ec14a6f
c579f38
00c481e
a336463
fd44cb3
f0a81c9
a2ec121
d55bf8a
36d3d2c
59af1f2
07c2dcd
ae216ba
3a5816e
f845980
280a1de
62114bd
ad853c1
57e5eef
20687cf
712f1be
1b1b81e
08be623
f55689d
8f51ac9
e67258a
0faade8
ba80aac
7c1180b
f114e0e
9a7a2b4
d1eeb07
8a4c482
e36f561
0d64858
8e4361e
e5a2ed8
650ff00
e731404
a6103e4
dbfaef3
af39010
b8f7a1c
17a7f07
0b96218
53b9941
6d94a4a
a5c9e40
4569471
5212211
d3f7ecf
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,3 @@ | ||
| * Introduces AddSVDFeaturesStep as a dedicated preprocessing step for SVD feature generation | ||
| * Removes SVD-related functionality from ReshapeFeatureDistributionsStep | ||
| * Extracts utility functions to a new `tabpfn/preprocessing/steps/utils.py` module |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1 @@ | ||
| Removes "scaler" as an option for `global_transformer_name` in `PreprocessorConfig` |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,123 @@ | ||
| """Adds SVD features to the data.""" | ||
|
|
||
| from __future__ import annotations | ||
|
|
||
| from typing import TYPE_CHECKING, Literal | ||
| from typing_extensions import override | ||
|
|
||
| from sklearn.decomposition import TruncatedSVD | ||
| from sklearn.pipeline import Pipeline | ||
| from sklearn.preprocessing import StandardScaler | ||
|
|
||
| from tabpfn.preprocessing.datamodel import FeatureModality, FeatureSchema | ||
| from tabpfn.preprocessing.pipeline_interface import PreprocessingStep | ||
| from tabpfn.preprocessing.steps.utils import make_standard_scaler_safe | ||
| from tabpfn.utils import infer_random_state | ||
|
|
||
| if TYPE_CHECKING: | ||
| import numpy as np | ||
|
|
||
|
|
||
class AddSVDFeaturesStep(PreprocessingStep):
    """Append truncated-SVD components of the input as extra numerical features.

    The step returns ``X`` unchanged and, alongside it, a low-rank projection
    of ``X`` produced by the pipeline that ``get_svd_features_transformer``
    builds (a wrapped ``StandardScaler`` step followed by ``TruncatedSVD``).
    The added columns are reported with ``FeatureModality.NUMERICAL``.

    When the input has fewer than two columns the step is a no-op: nothing is
    fitted and no features are added.

    NOTE(review): the projection is fitted on every column of ``X``; this step
    itself does not distinguish numerical from categorical or text columns.
    Whether non-numerical columns should be excluded is an open review
    question - confirm against the pipeline that feeds this step.
    """

    def __init__(
        self,
        global_transformer_name: Literal[
            "svd", "svd_quarter_components"
        ] = "svd_quarter_components",
        random_state: int | np.random.Generator | None = None,
    ):
        """Initialize the step.

        Args:
            global_transformer_name: Which SVD variant to build; forwarded
                unchanged to ``get_svd_features_transformer``.
            random_state: Seed or generator, resolved through
                ``infer_random_state`` when the step is fitted.
        """
        super().__init__()
        self.global_transformer_name = global_transformer_name
        self.random_state = random_state
        self.is_no_op: bool = False

    def num_added_features(self, n_samples: int, n_features: int) -> int:
        """Return how many SVD columns would be added for the given input shape.

        Args:
            n_samples: Number of rows of the data the step would be fitted on.
            n_features: Number of columns of that data.

        Returns:
            ``0`` when ``n_features < 2`` (the step is then a no-op), otherwise
            the ``n_components`` of the ``TruncatedSVD`` stage of the pipeline
            that would be built for this shape.
        """
        if n_features < 2:
            return 0

        pipeline = get_svd_features_transformer(
            self.global_transformer_name,
            n_samples,
            n_features,
        )
        # The pipeline holds several (name, estimator) steps; `next` picks the
        # first estimator that is a TruncatedSVD and reads its component count.
        return next(
            estimator.n_components
            for _, estimator in pipeline.steps
            if isinstance(estimator, TruncatedSVD)
        )

    @override
    def _fit(
        self,
        X: np.ndarray,
        feature_schema: FeatureSchema,
    ) -> FeatureSchema:
        """Fit the SVD pipeline on ``X``.

        Args:
            X: Two-dimensional array; its shape is unpacked as
                ``(n_samples, n_features)``.
            feature_schema: Schema describing the columns of ``X``.

        Returns:
            ``feature_schema`` unchanged.
        """
        n_samples, n_features = X.shape

        # Recompute the flag on every fit. Previously it was only ever set to
        # True, so a no-op fit followed by a fit on wider data left the step
        # permanently disabled and `_transform` silently skipped the SVD.
        self.is_no_op = n_features < 2
        if self.is_no_op:
            return feature_schema

        static_seed, _ = infer_random_state(self.random_state)
        transformer = get_svd_features_transformer(
            self.global_transformer_name,
            n_samples,
            n_features,
            random_state=static_seed,
        )
        transformer.fit(X)

        self.transformer_ = transformer
        self.feature_schema_updated_ = feature_schema

        return feature_schema

    @override
    def _transform(
        self, X: np.ndarray, *, is_test: bool = False
    ) -> tuple[np.ndarray, np.ndarray | None, FeatureModality | None]:
        """Return ``X`` together with its SVD projection.

        Args:
            X: Data to transform.
            is_test: Accepted for interface compatibility; not used here.

        Returns:
            ``(X, None, None)`` when the step is a no-op, otherwise
            ``(X, svd_features, FeatureModality.NUMERICAL)`` where
            ``svd_features`` is the fitted pipeline applied to ``X``.
        """
        if self.is_no_op:
            return X, None, None

        # Internal invariants: both attributes are set by a non-no-op `_fit`.
        assert self.feature_schema_updated_ is not None
        assert self.transformer_ is not None

        svd_features = self.transformer_.transform(X)
        return X, svd_features, FeatureModality.NUMERICAL
|
|
||
|
|
||
def get_svd_features_transformer(
    global_transformer_name: Literal["svd", "svd_quarter_components"],
    n_samples: int,
    n_features: int,
    random_state: int | None = None,
) -> Pipeline:
    """Build the (unfitted) pipeline that produces SVD features.

    The pipeline has two steps: ``"save_standard"``, a ``StandardScaler``
    created with ``with_mean=False`` and wrapped by
    ``make_standard_scaler_safe``, followed by ``"svd"``, a ``TruncatedSVD``
    using the ``"arpack"`` algorithm.

    The number of components is
    ``max(1, min(n_samples // 10 + 1, n_features // divisor))`` where the
    divisor is 2 for ``"svd"`` and 4 for ``"svd_quarter_components"``.

    Args:
        global_transformer_name: Selects the divisor described above.
        n_samples: Number of rows of the data the pipeline will be fitted on.
        n_features: Number of columns of that data.
        random_state: Seed handed to ``TruncatedSVD``.

    Returns:
        The unfitted ``Pipeline``.

    Raises:
        ValueError: If ``global_transformer_name`` is not one of the two
            supported names.
    """
    if global_transformer_name == "svd_quarter_components":
        divisor = 4
    elif global_transformer_name == "svd":
        divisor = 2
    else:
        raise ValueError(f"Invalid global transformer name: {global_transformer_name}.")

    # Cap the rank by both the sample count and a fraction of the feature
    # count, but always keep at least one component.
    sample_cap = n_samples // 10 + 1
    feature_cap = n_features // divisor
    n_components = max(1, min(sample_cap, feature_cap))

    # NOTE(review): scaling is done without centering (with_mean=False).
    # Whether the incoming columns are already normalized at this point is
    # not visible from this function - confirm against the calling pipeline.
    scaler_step = (
        "save_standard",
        make_standard_scaler_safe(
            ("standard", StandardScaler(with_mean=False)),
        ),
    )
    # NOTE(review): the callers visible in this module only build the pipeline
    # when n_features >= 2, which keeps n_components below n_features.
    svd_step = (
        "svd",
        TruncatedSVD(
            algorithm="arpack",
            n_components=n_components,
            random_state=random_state,
        ),
    )
    return Pipeline(steps=[scaler_step, svd_step])
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What did scaler mean here?