Rebase changes from Quantco/glum#543 #869

Draft · wants to merge 1 commit into base: main
12 changes: 12 additions & 0 deletions src/glum/_glm.py
@@ -758,6 +758,7 @@ def __init__(
categorical_format: str = "{name}[{category}]",
cat_missing_method: str = "fail",
cat_missing_name: str = "(MISSING)",
use_sparse_hessian: bool = True,
):
self.l1_ratio = l1_ratio
self.P1 = P1
@@ -795,6 +796,7 @@ def __init__(
self.categorical_format = categorical_format
self.cat_missing_method = cat_missing_method
self.cat_missing_name = cat_missing_name
self.use_sparse_hessian = use_sparse_hessian

@property
def family_instance(self) -> ExponentialDispersionModel:
@@ -1112,13 +1114,15 @@ def _solve(
lower_bounds=lower_bounds,
upper_bounds=upper_bounds,
verbose=self.verbose > 0,
use_sparse_hessian=self.use_sparse_hessian,
)
if self._solver == "irls-ls":
coef, self.n_iter_, self._n_cycles, self.diagnostics_ = _irls_solver(
_least_squares_solver, coef, irls_data
)
# 4.2 coordinate descent ##############################################
elif self._solver == "irls-cd":
# This is the case we're concerned with for wide problems.
coef, self.n_iter_, self._n_cycles, self.diagnostics_ = _irls_solver(
_cd_solver, coef, irls_data
)
@@ -2880,6 +2884,12 @@ class GeneralizedLinearRegressor(GeneralizedLinearRegressorBase):
Name of the category to which missing values will be converted if
``cat_missing_method='convert'``. Only used if ``X`` is a pandas data frame.

use_sparse_hessian : boolean, optional, (default=True)
If ``True``, stores the current state of the Hessian as a sparse COO matrix.
If ``False``, stores it as a dense matrix of size `num_cols` x `num_cols`, where
`num_cols` is the number of columns in the data X. Use ``True`` to avoid memory
issues when working with extremely wide data.

Attributes
----------
coef_ : numpy.array, shape (n_features,)
@@ -2965,6 +2975,7 @@ def __init__(
categorical_format: str = "{name}[{category}]",
cat_missing_method: str = "fail",
cat_missing_name: str = "(MISSING)",
use_sparse_hessian: bool = True,
):
self.alphas = alphas
self.alpha = alpha
@@ -3005,6 +3016,7 @@ def __init__(
categorical_format=categorical_format,
cat_missing_method=cat_missing_method,
cat_missing_name=cat_missing_name,
use_sparse_hessian=use_sparse_hessian,
)

def _validate_hyperparameters(self) -> None:
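The _glm.py changes only thread the new flag through to the solver, so a minimal usage sketch may help. The synthetic data, family, and penalty settings below are illustrative assumptions, not part of this PR:

import numpy as np
from glum import GeneralizedLinearRegressor

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 5_000))  # wide data: far more columns than rows
y = rng.poisson(lam=1.0, size=200)

model = GeneralizedLinearRegressor(
    family="poisson",
    alpha=0.1,
    l1_ratio=1.0,             # pure L1 penalty selects the "irls-cd" solver path
    use_sparse_hessian=True,  # new flag: keep the Hessian in sparse COO form
)
model.fit(X, y)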
68 changes: 58 additions & 10 deletions src/glum/_solvers.py
@@ -141,6 +141,7 @@ def update_hessian(state, data, active_set):
# 2) use all the rows
# 3) Include the P2 components
# 4) just like an update, we only update the active_set

hessian_init = build_hessian_delta(
data.X,
state.hessian_rows,
@@ -149,16 +150,33 @@
np.arange(data.X.shape[0], dtype=np.int32),
active_set,
)
state.hessian[np.ix_(active_set, active_set)] = hessian_init

# In the sparse Hessian case, state.hessian is a coo_matrix.
# hessian_init is a dense NumPy array of size active_set.shape[0] x
# active_set.shape[0]; we can scatter it into a coo_matrix of full shape
# and store it as state.hessian.
if data.use_sparse_hessian:
coo_data = hessian_init.ravel(order="C")
coo_rows = np.repeat(active_set, active_set.shape[0])
coo_cols = np.tile(active_set, active_set.shape[0])

state.hessian = sparse.coo_matrix(
(coo_data, (coo_rows, coo_cols)),
shape=(state.coef.shape[0], state.coef.shape[0]),
)
else:
state.hessian[np.ix_(active_set, active_set)] = hessian_init

state.hessian_initialized = True
n_active_rows = data.X.shape[0]

else:
# In an update iteration, we want to:
# 1) use the difference in hessian_rows from the last iteration
# 2) filter for active_rows in case data.hessian_approx != 0
# 3) Ignore the P2 components because those don't change and have
# already been added
# 4) only update the active set subset of the hessian.

hessian_rows_diff, active_rows = identify_active_rows(
state.gradient_rows,
state.hessian_rows,
@@ -173,13 +191,32 @@
active_rows=active_rows,
active_cols=active_set,
)
state.hessian[np.ix_(active_set, active_set)] += hessian_delta

if data.use_sparse_hessian:
coo_data = hessian_delta.ravel(order="C")
coo_rows = np.repeat(active_set, active_set.shape[0])
coo_cols = np.tile(active_set, active_set.shape[0])

state.hessian += sparse.coo_matrix(
(coo_data, (coo_rows, coo_cols)),
shape=(state.coef.shape[0], state.coef.shape[0]),
)
else:
state.hessian[np.ix_(active_set, active_set)] += hessian_delta

n_active_rows = active_rows.shape[0]

return (
state.hessian[np.ix_(active_set, active_set)],
n_active_rows,
)
# If state.hessian is in COO form, we have to convert to CSC in order to index the
# active set elements, which we then convert to a numpy array for compatibility with
# the rest of the code. This numpy array is dense, but we care about problems in
# which the active set is usually much smaller than the number of data columns.
if data.use_sparse_hessian:
return (
state.hessian.tocsc()[np.ix_(active_set, active_set)].toarray(),
n_active_rows,
)
else:
return state.hessian[np.ix_(active_set, active_set)], n_active_rows


def _is_subset(x, y):
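The scatter-and-slice pattern used in update_hessian above can be sanity-checked in isolation. In this standalone sketch (toy sizes are illustrative assumptions), a dense k x k block computed on the active set is raveled in C order, paired with np.repeat/np.tile of the active-set indices, expanded into a COO matrix of full dimension, and read back through CSC indexing:

import numpy as np
from scipy import sparse

num_cols = 6                          # full problem dimension
active_set = np.array([1, 3, 4])      # active columns, k = 3
block = np.arange(9.0).reshape(3, 3)  # stand-in for hessian_delta

k = active_set.shape[0]
# block[i, j] sits at flat index i * k + j, so repeat/tile recover its
# (row, col) position in the full matrix.
delta = sparse.coo_matrix(
    (block.ravel(order="C"), (np.repeat(active_set, k), np.tile(active_set, k))),
    shape=(num_cols, num_cols),
)

hessian = sparse.coo_matrix((num_cols, num_cols))
hessian = hessian + delta  # overlapping entries are summed by sparse addition

# COO does not support fancy indexing; convert to CSC to slice out the
# active block, exactly as in the return path of update_hessian.
dense_block = hessian.tocsc()[np.ix_(active_set, active_set)].toarray()
assert np.allclose(dense_block, block)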
@@ -268,7 +305,7 @@ def _irls_solver(inner_solver, coef, data) -> tuple[np.ndarray, int, int, list[l
----------
inner_solver
A least squares solver that can handle the appropriate penalties. With
an L1 penalty, this will _cd_solver. With only an L2 penalty,
an L1 penalty, this will be _cd_solver. With only an L2 penalty,
_least_squares_solver will be more efficient.
coef : ndarray, shape (c,)
If fit_intercept=False, shape c=X.shape[1].
@@ -432,6 +469,7 @@ def __init__(
lower_bounds: Optional[np.ndarray] = None,
upper_bounds: Optional[np.ndarray] = None,
verbose: bool = False,
use_sparse_hessian: bool = True,
):
self.X = X
self.y = y
@@ -463,6 +501,7 @@

self.intercept_offset = 1 if self.fit_intercept else 0
self.verbose = verbose
self.use_sparse_hessian = use_sparse_hessian

self._check_data()

@@ -525,9 +564,18 @@ def __init__(self, coef, data):
self.old_hessian_rows = np.zeros(data.X.shape[0], dtype=data.X.dtype)
self.gradient_rows = None
self.hessian_rows = None
self.hessian = np.zeros(
(self.coef.shape[0], self.coef.shape[0]), dtype=data.X.dtype
)

# In order to avoid memory issues for very wide datasets (e.g. Issue #485), we
# can instantiate the Hessian as a sparse COO matrix.
if data.use_sparse_hessian:
self.hessian = sparse.coo_matrix(
(self.coef.shape[0], self.coef.shape[0]), dtype=data.X.dtype
)
else:
self.hessian = np.zeros(
(self.coef.shape[0], self.coef.shape[0]), dtype=data.X.dtype
)

self.hessian_initialized = False
self.coef_P2 = None
self.norm_min_subgrad = None
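A back-of-the-envelope check of why the dense branch above is a problem for very wide data, in the spirit of Issue #485 (the column count is an illustrative assumption):

num_cols = 100_000               # e.g. heavily one-hot encoded features
dense_bytes = num_cols ** 2 * 8  # float64 Hessian, num_cols x num_cols
print(f"dense Hessian: {dense_bytes / 1e9:.0f} GB")  # 80 GB up front

An empty COO matrix, by contrast, allocates essentially nothing until active-set blocks are actually inserted.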