-
-
Notifications
You must be signed in to change notification settings - Fork 25.9k
ENH add sparse output to SplineTransformer #24145
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
58fa259
7e4056b
457ebed
30dd697
29deb8c
8b57488
195773a
db687a0
85b23eb
4a1c7b1
17d427e
e4b010d
093531e
6dff136
8934b2b
aa650d7
b651044
fb4369c
24027f5
9e17b9f
07c0c4a
127d2b8
b9ddb9f
cde7e03
d680438
eccd567
61eaffe
638d33c
8986d30
4c10ab4
0ced11d
4814f62
190ff98
4a4d516
35b3a43
5bd297c
ac87516
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -13,11 +13,11 @@ | |
|
||
from ..base import BaseEstimator, TransformerMixin | ||
from ..utils import check_array | ||
from ..utils.fixes import sp_version, parse_version | ||
from ..utils.validation import check_is_fitted, FLOAT_DTYPES, _check_sample_weight | ||
from ..utils.validation import _check_feature_names_in | ||
from ..utils._param_validation import Interval, StrOptions | ||
from ..utils.stats import _weighted_percentile | ||
from ..utils.fixes import sp_version, parse_version | ||
|
||
from ._csr_polynomial_expansion import ( | ||
_csr_polynomial_expansion, | ||
|
@@ -574,8 +574,6 @@ def transform(self, X): | |
return XP | ||
|
||
|
||
# TODO: | ||
# - sparse support (either scipy or own cython solution)? | ||
class SplineTransformer(TransformerMixin, BaseEstimator): | ||
"""Generate univariate B-spline bases for features. | ||
|
||
|
@@ -635,8 +633,14 @@ class SplineTransformer(TransformerMixin, BaseEstimator): | |
i.e. a column of ones. It acts as an intercept term in a linear models. | ||
|
||
order : {'C', 'F'}, default='C' | ||
Order of output array. 'F' order is faster to compute, but may slow | ||
down subsequent estimators. | ||
Order of output array in the dense case. `'F'` order is faster to compute, but | ||
may slow down subsequent estimators. | ||
|
||
sparse_output : bool, default=False | ||
Will return sparse CSR matrix if set True else will return an array. This | ||
option is only available with `scipy>=1.8`. | ||
|
||
.. versionadded:: 1.2 | ||
|
||
Attributes | ||
---------- | ||
|
@@ -699,6 +703,7 @@ class SplineTransformer(TransformerMixin, BaseEstimator): | |
], | ||
"include_bias": ["boolean"], | ||
"order": [StrOptions({"C", "F"})], | ||
"sparse_output": ["boolean"], | ||
} | ||
|
||
def __init__( | ||
|
@@ -710,13 +715,15 @@ def __init__( | |
extrapolation="constant", | ||
include_bias=True, | ||
order="C", | ||
sparse_output=False, | ||
): | ||
self.n_knots = n_knots | ||
self.degree = degree | ||
self.knots = knots | ||
self.extrapolation = extrapolation | ||
self.include_bias = include_bias | ||
self.order = order | ||
self.sparse_output = sparse_output | ||
|
||
@staticmethod | ||
def _get_base_knot_positions(X, n_knots=10, knots="uniform", sample_weight=None): | ||
|
@@ -843,6 +850,12 @@ def fit(self, X, y=None, sample_weight=None): | |
elif not np.all(np.diff(base_knots, axis=0) > 0): | ||
raise ValueError("knots must be sorted without duplicates.") | ||
|
||
if self.sparse_output and sp_version < parse_version("1.8.0"): | ||
raise ValueError( | ||
"Option sparse_output=True is only available with scipy>=1.8.0, " | ||
f"but here scipy=={sp_version} is used." | ||
) | ||
|
||
# number of knots for base interval | ||
n_knots = base_knots.shape[0] | ||
|
||
|
@@ -934,7 +947,7 @@ def transform(self, X): | |
|
||
Returns | ||
------- | ||
XBS : ndarray of shape (n_samples, n_features * n_splines) | ||
XBS : {ndarray, sparse matrix} of shape (n_samples, n_features * n_splines) | ||
The matrix of features, where n_splines is the number of bases | ||
elements of the B-splines, n_knots + degree - 1. | ||
""" | ||
|
@@ -946,14 +959,30 @@ def transform(self, X): | |
n_splines = self.bsplines_[0].c.shape[1] | ||
degree = self.degree | ||
|
||
# TODO: Remove this condition, once scipy 1.10 is the minimum version. | ||
# Only scipy => 1.10 supports design_matrix(.., extrapolate=..). | ||
# The default (implicit in scipy < 1.10) is extrapolate=False. | ||
scipy_1_10 = sp_version >= parse_version("1.10.0") | ||
# Note: self.bsplines_[0].extrapolate is True for extrapolation in | ||
# ["periodic", "continue"] | ||
if scipy_1_10: | ||
use_sparse = self.sparse_output | ||
kwargs_extrapolate = {"extrapolate": self.bsplines_[0].extrapolate} | ||
else: | ||
use_sparse = self.sparse_output and not self.bsplines_[0].extrapolate | ||
kwargs_extrapolate = dict() | ||
|
||
# Note that scipy BSpline returns float64 arrays and converts input | ||
# x=X[:, i] to c-contiguous float64. | ||
n_out = self.n_features_out_ + n_features * (1 - self.include_bias) | ||
if X.dtype in FLOAT_DTYPES: | ||
dtype = X.dtype | ||
else: | ||
dtype = np.float64 | ||
XBS = np.zeros((n_samples, n_out), dtype=dtype, order=self.order) | ||
if use_sparse: | ||
output_list = [] | ||
else: | ||
XBS = np.zeros((n_samples, n_out), dtype=dtype, order=self.order) | ||
|
||
for i in range(n_features): | ||
spl = self.bsplines_[i] | ||
|
@@ -972,20 +1001,53 @@ def transform(self, X): | |
else: | ||
x = X[:, i] | ||
|
||
XBS[:, (i * n_splines) : ((i + 1) * n_splines)] = spl(x) | ||
|
||
else: | ||
xmin = spl.t[degree] | ||
xmax = spl.t[-degree - 1] | ||
if use_sparse: | ||
XBS_sparse = BSpline.design_matrix( | ||
x, spl.t, spl.k, **kwargs_extrapolate | ||
) | ||
if self.extrapolation == "periodic": | ||
# See the construction of coef in fit. We need to add the last | ||
# degree spline basis function to the first degree ones and | ||
# then drop the last ones. | ||
# Note: See comment about SparseEfficiencyWarning below. | ||
XBS_sparse = XBS_sparse.tolil() | ||
XBS_sparse[:, :degree] += XBS_sparse[:, -degree:] | ||
XBS_sparse = XBS_sparse[:, :-degree] | ||
else: | ||
XBS[:, (i * n_splines) : ((i + 1) * n_splines)] = spl(x) | ||
else: # extrapolation in ("constant", "linear") | ||
xmin, xmax = spl.t[degree], spl.t[-degree - 1] | ||
# spline values at boundaries | ||
f_min, f_max = spl(xmin), spl(xmax) | ||
mask = (xmin <= X[:, i]) & (X[:, i] <= xmax) | ||
XBS[mask, (i * n_splines) : ((i + 1) * n_splines)] = spl(X[mask, i]) | ||
if use_sparse: | ||
mask_inv = ~mask | ||
x = X[:, i].copy() | ||
# Set some arbitrary values outside boundary that will be reassigned | ||
# later. | ||
x[mask_inv] = spl.t[self.degree] | ||
XBS_sparse = BSpline.design_matrix(x, spl.t, spl.k) | ||
# Note: Without converting to lil_matrix we would get: | ||
# scipy.sparse._base.SparseEfficiencyWarning: Changing the sparsity | ||
# structure of a csr_matrix is expensive. lil_matrix is more | ||
# efficient. | ||
jjerphan marked this conversation as resolved.
Show resolved
Hide resolved
|
||
if np.any(mask_inv): | ||
XBS_sparse = XBS_sparse.tolil() | ||
XBS_sparse[mask_inv, :] = 0 | ||
else: | ||
XBS[mask, (i * n_splines) : ((i + 1) * n_splines)] = spl(X[mask, i]) | ||
|
||
# Note for extrapolation: | ||
# 'continue' is already returned as is by scipy BSplines | ||
if self.extrapolation == "error": | ||
# BSpline with extrapolate=False does not raise an error, but | ||
# output np.nan. | ||
if np.any(np.isnan(XBS[:, (i * n_splines) : ((i + 1) * n_splines)])): | ||
# outputs np.nan. | ||
if (use_sparse and np.any(np.isnan(XBS_sparse.data))) or ( | ||
not use_sparse | ||
and np.any( | ||
np.isnan(XBS[:, (i * n_splines) : ((i + 1) * n_splines)]) | ||
) | ||
): | ||
raise ValueError( | ||
"X contains values beyond the limits of the knots." | ||
) | ||
|
@@ -995,21 +1057,29 @@ def transform(self, X): | |
# Only the first degree and last degree number of splines | ||
# have non-zero values at the boundaries. | ||
|
||
# spline values at boundaries | ||
f_min = spl(xmin) | ||
f_max = spl(xmax) | ||
mask = X[:, i] < xmin | ||
if np.any(mask): | ||
XBS[mask, (i * n_splines) : (i * n_splines + degree)] = f_min[ | ||
:degree | ||
] | ||
if use_sparse: | ||
# Note: See comment about SparseEfficiencyWarning above. | ||
XBS_sparse = XBS_sparse.tolil() | ||
XBS_sparse[mask, :degree] = f_min[:degree] | ||
|
||
else: | ||
XBS[mask, (i * n_splines) : (i * n_splines + degree)] = f_min[ | ||
:degree | ||
] | ||
|
||
mask = X[:, i] > xmax | ||
if np.any(mask): | ||
XBS[ | ||
mask, | ||
((i + 1) * n_splines - degree) : ((i + 1) * n_splines), | ||
] = f_max[-degree:] | ||
if use_sparse: | ||
# Note: See comment about SparseEfficiencyWarning above. | ||
XBS_sparse = XBS_sparse.tolil() | ||
XBS_sparse[mask, -degree:] = f_max[-degree:] | ||
else: | ||
XBS[ | ||
mask, | ||
((i + 1) * n_splines - degree) : ((i + 1) * n_splines), | ||
] = f_max[-degree:] | ||
|
||
elif self.extrapolation == "linear": | ||
# Continue the degree first and degree last spline bases | ||
|
@@ -1018,8 +1088,6 @@ def transform(self, X): | |
# Note that all others have derivative = value = 0 at the | ||
# boundaries. | ||
|
||
# spline values at boundaries | ||
f_min, f_max = spl(xmin), spl(xmax) | ||
# spline derivatives = slopes at boundaries | ||
fp_min, fp_max = spl(xmin, nu=1), spl(xmax, nu=1) | ||
# Compute the linear continuation. | ||
|
@@ -1030,16 +1098,57 @@ def transform(self, X): | |
for j in range(degree): | ||
mask = X[:, i] < xmin | ||
if np.any(mask): | ||
XBS[mask, i * n_splines + j] = ( | ||
f_min[j] + (X[mask, i] - xmin) * fp_min[j] | ||
) | ||
linear_extr = f_min[j] + (X[mask, i] - xmin) * fp_min[j] | ||
if use_sparse: | ||
# Note: See comment about SparseEfficiencyWarning above. | ||
XBS_sparse = XBS_sparse.tolil() | ||
XBS_sparse[mask, j] = linear_extr | ||
else: | ||
XBS[mask, i * n_splines + j] = linear_extr | ||
|
||
mask = X[:, i] > xmax | ||
if np.any(mask): | ||
k = n_splines - 1 - j | ||
XBS[mask, i * n_splines + k] = ( | ||
f_max[k] + (X[mask, i] - xmax) * fp_max[k] | ||
) | ||
linear_extr = f_max[k] + (X[mask, i] - xmax) * fp_max[k] | ||
if use_sparse: | ||
# Note: See comment about SparseEfficiencyWarning above. | ||
XBS_sparse = XBS_sparse.tolil() | ||
XBS_sparse[mask, k : k + 1] = linear_extr[:, None] | ||
else: | ||
XBS[mask, i * n_splines + k] = linear_extr | ||
|
||
if use_sparse: | ||
if not sparse.isspmatrix_csr(XBS_sparse): | ||
XBS_sparse = XBS_sparse.tocsr() | ||
lorentzenchr marked this conversation as resolved.
Show resolved
Hide resolved
|
||
output_list.append(XBS_sparse) | ||
|
||
if use_sparse: | ||
# TODO: Remove this conditional error when the minimum supported version of | ||
# SciPy is 1.9.2 | ||
# `scipy.sparse.hstack` breaks in scipy<1.9.2 | ||
# when `n_features_out_ > max_int32` | ||
max_int32 = np.iinfo(np.int32).max | ||
all_int32 = True | ||
for mat in output_list: | ||
all_int32 &= mat.indices.dtype == np.int32 | ||
jjerphan marked this conversation as resolved.
Show resolved
Hide resolved
|
||
if ( | ||
sp_version < parse_version("1.9.2") | ||
and self.n_features_out_ > max_int32 | ||
and all_int32 | ||
): | ||
raise ValueError( | ||
"In scipy versions `<1.9.2`, the function `scipy.sparse.hstack`" | ||
" produces negative columns when:\n1. The output shape contains" | ||
" `n_cols` too large to be represented by a 32bit signed" | ||
" integer.\n. All sub-matrices to be stacked have indices of" | ||
" dtype `np.int32`.\nTo avoid this error, either use a version" | ||
" of scipy `>=1.9.2` or alter the `SplineTransformer`" | ||
" transformer to produce fewer than 2^31 output features" | ||
) | ||
XBS = sparse.hstack(output_list) | ||
lorentzenchr marked this conversation as resolved.
Show resolved
Hide resolved
|
||
elif self.sparse_output: | ||
# TODO: Remove ones scipy 1.10 is the minimum version. See comments above. | ||
XBS = sparse.csr_matrix(XBS) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This comment seems to imply that with scipy 1.10, the However, as far as I understand, we still have to go through this condition when Would there be a way to avoid the At the moment, sparse periodic extrapolation is more than 20x slower than its dense counterpart: In [1]: import numpy as np
...: from sklearn.preprocessing import SplineTransformer
...:
...: X = np.linspace([-1, -10, 100], [1, 10, 101], 10000)
...: extrapolation="periodic"
...: st_sparse = SplineTransformer(sparse_output=True, extrapolation=extrapolation).fit(X)
...: st_dense = SplineTransformer(extrapolation=extrapolation).fit(X)
In [2]: %timeit st_dense.transform(X)
1.29 ms ± 1.25 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
In [3]: %timeit st_sparse.transform(X)
64.1 ms ± 851 µs per loop (mean ± std. dev. of 7 runs, 10 loops each) scipy version: 1.10.1 There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It is slower, yes, but this is not a performance regression as sparse output is a new feature and the default is dense output. And yes, with scipy >= 1.10, we can get rid of (at least most of) the lil-conversions as we can use |
||
|
||
if self.include_bias: | ||
return XBS | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There is a bunch of sparse matrices being constructed in this implementation. For each feature:
What is the runtime for
transform
with sparse output compared to dense output?Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
That's unfortunate. Having n_features=1 doesn't change the result.
But memory consumption should be better.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The bottleneck seems to be in scipy's
_bspl.pyx
function_make_design_matrix
. In contrast,evaluate_spline
does all loops explicitly. We might report this upstream.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@ev-br @egorz734 friendly ping
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Not sure what's going on here TBH, but csr->lil->csr does sound expensive. Where is the bottleneck, what does the profiler say?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What's the impact of scipy/scipy#16840 on the scikit-learn benchmark in https://github.com/scikit-learn/scikit-learn/pull/24145/files#r940357974?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Are there scipy nightly builds to install? Otherwise, this is above my current time budget to benchmark.
Scipy 1.10 will not only speed up
BSpline.design_matrix
but also make it easier to implement spline extrapolation in scikit-learn, thereby further reducing runtime and memory consumption.Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
SciPy has nightly builds:
but their CI is failing and not updating the nightly builds. I opened MacPython/scipy-wheels#175 to track the SciPy issue.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
With
scipy==1.10.0.dev0
, one gets improved runtime:There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
On my laptop, I get (scipy 1.10.1)