Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make automatic centering in PCA methods optional #808

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions dask_ml/decomposition/incremental_pca.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,15 +132,21 @@ def __init__(
svd_solver="auto",
iterated_power=0,
random_state=None,
center=True,
):
self.n_components = n_components
self.whiten = whiten
self.center = center
self.copy = copy
self.batch_size = batch_size
self.svd_solver = svd_solver
self.iterated_power = iterated_power
self.random_state = random_state

def _check_params(self):
TomAugspurger marked this conversation as resolved.
Show resolved Hide resolved
if self.center is False:
raise ValueError("IncrementalPCA with center=False is not supported.")

def _fit(self, X, y=None):
"""Fit the model with X, using minibatches of size batch_size.

Expand Down Expand Up @@ -238,6 +244,7 @@ def partial_fit(self, X, y=None, check_input=True):
self : object
Returns the instance itself.
"""
self._check_params()
if check_input:
if sparse.issparse(X):
raise TypeError(
Expand Down
82 changes: 65 additions & 17 deletions dask_ml/decomposition/pca.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,21 @@ class PCA(sklearn.decomposition.PCA):
If None, the random number generator is the RandomState instance used
by `da.random`. Used when ``svd_solver`` == 'randomized'.

center : bool, optional (default True)
When True (the default), the underlying data gets centered at zero
by subtracting the mean of the data from the data itself.

PCA is performed on centered data due to its being a regression model,
without an intercept. As such, its principal components originate at the
origin of the transformed space.

``center=False`` may be employed when performing PCA on already
centered data.

Since whitening requires centered data, combining ``center=False`` with
``whiten=True`` may produce unexpected results unless the data have
already been centered before calling ``fit()``.

Attributes
----------
components_ : array, shape (n_components, n_features)
Expand Down Expand Up @@ -152,18 +167,27 @@ class PCA(sklearn.decomposition.PCA):
PCA(copy=True, iterated_power='auto', n_components=2, random_state=None,
svd_solver='auto', tol=0.0, whiten=False)
>>> print(pca.explained_variance_ratio_) # doctest: +ELLIPSIS
[ 0.99244... 0.00755...]
[0.99244289 0.00755711]
>>> print(pca.singular_values_) # doctest: +ELLIPSIS
[ 6.30061... 0.54980...]
[6.30061232 0.54980396]

>>> pca = PCA(n_components=2, svd_solver='full')
>>> pca.fit(dX) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
PCA(copy=True, iterated_power='auto', n_components=2, random_state=None,
svd_solver='full', tol=0.0, whiten=False)
>>> print(pca.explained_variance_ratio_) # doctest: +ELLIPSIS
[ 0.99244... 0.00755...]
[0.99244289 0.00755711]
>>> print(pca.singular_values_) # doctest: +ELLIPSIS
[6.30061232 0.54980396]

>>> dX_mean_0 = dX - dX.mean(axis=0)
>>> pca = PCA(n_components=2, svd_solver='full', center=False)
>>> pca.fit(dX_mean_0)
PCA(center=False, n_components=2, svd_solver='full')
>>> print(pca.explained_variance_ratio_) # doctest: +ELLIPSIS
[0.99244289 0.00755711]
>>> print(pca.singular_values_) # doctest: +ELLIPSIS
[ 6.30061... 0.54980...]
[6.30061232 0.54980396]

Notes
-----
Expand All @@ -175,6 +199,10 @@ class PCA(sklearn.decomposition.PCA):
``dask.linalg.svd_compressed``.
* n_components : ``n_components='mle'`` is not allowed.
Fractional ``n_components`` between 0 and 1 is not allowed.
* center : if ``True`` (the default), automatically center input data before
performing PCA.
Set this parameter to ``False``, if the input data have already been
centered before running ``fit()``.
"""

def __init__(
Expand All @@ -186,10 +214,12 @@ def __init__(
tol=0.0,
iterated_power=0,
random_state=None,
center=True,
):
self.n_components = n_components
self.copy = copy
self.whiten = whiten
self.center = center
self.svd_solver = svd_solver
self.tol = tol
self.iterated_power = iterated_power
Expand All @@ -198,6 +228,7 @@ def __init__(
def fit(self, X, y=None):
    """Fit the model with ``X``.

    Parameters
    ----------
    X : dask collection, shape (n_samples, n_features)
        Training data. Must be a dask collection; eager (in-memory)
        inputs are rejected.
    y : ignored
        Present only for scikit-learn estimator API compatibility.

    Returns
    -------
    self : object
        The fitted estimator.

    Raises
    ------
    TypeError
        If ``X`` is not a dask collection.
    """
    if not dask.is_dask_collection(X):
        raise TypeError(_TYPE_MSG.format(type(X)))

    self._fit(X)
    # Mirror scikit-learn's fitted attribute recording input dimensionality.
    self.n_features_in_ = X.shape[1]
    return self
Expand Down Expand Up @@ -266,8 +297,10 @@ def _fit(self, X):

solver = self._get_solver(X, n_components)

self.mean_ = X.mean(0)
X -= self.mean_
self.mean_ = X.mean(axis=0)

if self.center:
X -= self.mean_

if solver in {"full", "tsqr"}:
U, S, V = da.linalg.svd(X)
def transform(self, X):
    """Apply dimensionality reduction to ``X``.

    ``X`` is projected on the principal components previously extracted
    from the training set.

    Parameters
    ----------
    X : dask array, shape (n_samples, n_features)
        New data to project.

    Returns
    -------
    X_new : array-like, shape (n_samples, n_components)
    """
    # NOTE(review): the diff view overlaid old and new hunks here, so the
    # mean was subtracted twice; this is the deduplicated new-side code.
    check_is_fitted(self, "components_")

    if self.whiten:
        check_is_fitted(self, "explained_variance_")

    if self.center:
        check_is_fitted(self, "mean_")
        # Out-of-place subtraction so the caller's array is not mutated.
        if self.mean_ is not None:
            X = X - self.mean_

    X_transformed = da.dot(X, self.components_.T)
    if self.whiten:
        X_transformed /= np.sqrt(self.explained_variance_)

    return X_transformed

def fit_transform(self, X, y=None):
Expand All @@ -396,7 +435,6 @@ def fit_transform(self, X, y=None):
X_new : array-like, shape (n_samples, n_components)

"""
# X = check_array(X)
if not dask.is_dask_collection(X):
raise TypeError(_TYPE_MSG.format(type(X)))
U, S, V = self._fit(X)
def inverse_transform(self, X):
    """Transform data back to its original space.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_components)
        Data in the reduced space.

    Returns
    -------
    X_original : array-like, shape (n_samples, n_features)

    Notes
    -----
    If whitening is enabled, inverse_transform does not compute the
    exact inverse operation of transform.
    """
    # NOTE(review): the diff view overlaid old and new hunks here (both
    # ``+ self.mean_`` and ``+ offset`` appeared); this is the
    # deduplicated new-side code.
    check_is_fitted(self, "components_")

    if self.center:
        check_is_fitted(self, "mean_")
        offset = self.mean_
    else:
        # No centering was applied during fit, so there is nothing to add back.
        offset = 0

    if self.whiten:
        check_is_fitted(self, "explained_variance_")
        # Undo the whitening rescale before rotating back to feature space.
        return (
            da.dot(
                X,
                np.sqrt(self.explained_variance_[:, np.newaxis]) * self.components_,
            )
            + offset
        )

    return da.dot(X, self.components_) + offset

def score_samples(self, X):
"""Return the log-likelihood of each sample.
Expand All @@ -463,8 +508,11 @@ def score_samples(self, X):
"""
check_is_fitted(self, "mean_")

# X = check_array(X)
Xr = X - self.mean_
if self.center:
Xr = X - self.mean_
else:
Xr = X

n_features = X.shape[1]
precision = self.get_precision() # [n_features, n_features]
log_like = -0.5 * (Xr * (da.dot(Xr, precision))).sum(axis=1)
Expand Down
17 changes: 17 additions & 0 deletions tests/test_incremental_pca.py
Original file line number Diff line number Diff line change
Expand Up @@ -475,3 +475,20 @@ def test_incremental_pca_partial_fit_float_division():
np.testing.assert_allclose(
singular_vals_float_samples_seen, singular_vals_int_samples_seen
)


def test_incremental_pca_no_centering_not_supported():
    # IncrementalPCA must reject center=False from every fitting entry point.
    rs = np.random.RandomState(0)
    data = da.from_array(rs.randn(5, 3) + 2, chunks=[3, -1])

    est = IncrementalPCA(n_components=2, center=False)

    for fit_method in (est.partial_fit, est.fit, est.fit_transform):
        with pytest.raises(ValueError, match="not supported"):
            fit_method(data)
Loading