-
Notifications
You must be signed in to change notification settings - Fork 13
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Implemented a CuPy based GPU version of scipy.optimize.lsq_linear's BVLS
algorithm. This is in the optimize folder and will be submitted to CuPy for inclusion at which point it will be removed from Redrock. zscan has been updated to use this GPU based version instead of our NNLS trick of adding negative legendre terms and using NNLS to emulate BVLS; although that trick is preserved via self._solver_method = "bvls_via_nnls" in archetypes.py where Archetype files are allowed to specify their preferred solver method (pca, nnls, bvls, or bvls_via_nnls). This branch is forked from the branch legendre_nnls_merge_with_main, which was ready to merge with main and had addressed conflicts that had arisen during parallel developement by myself and Abhijeet. zfind is modified slightly to add a timing print statement for the time spent in each solver method. Timing Results: With 4 GPUs and 4 CPUs, BVLS on GPU takes ~17s resulting in a fine redshift scan time of 72s compared to the NNLS trick taking ~14s for a fine redshift scan time of 76s (the reason the fitting is faster but the overall time is slower is due to other operations performed on larger arrays with the added negative legendre terms). This compares to using scipy's CPU-based BVLS, which takes ~63s to do the fitting and 119s for the fine redshift scan.
- Loading branch information
1 parent
425d3fc
commit 382f6c0
Showing
7 changed files
with
798 additions
and
32 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
"""This module contains a GPU implementation of the BVLS algorithm in | ||
scipy.optimize.lsq_linear""" | ||
from .gpu_lsq_linear import gpu_lsq_linear | ||
|
||
__all__ = ['gpu_lsq_linear'] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,244 @@ | ||
"""GPU implementation of Bounded-variable least-squares algorithm.""" | ||
import numpy as np | ||
import cupy as cp | ||
#from numpy.linalg import norm, lstsq | ||
from cupy.linalg import norm, lstsq | ||
from scipy.optimize import OptimizeResult | ||
|
||
from .gpu_common import print_header_linear, print_iteration_linear, compute_grad | ||
|
||
def compute_kkt_optimality(g, on_bound): | ||
"""Compute the maximum violation of KKT conditions.""" | ||
g_kkt = g * on_bound | ||
free_set = on_bound == 0 | ||
g_kkt[free_set] = cp.abs(g[free_set]) | ||
return cp.max(g_kkt, axis=1) | ||
|
||
|
||
def bvls(A, b, x_lsq, lb, ub, ib, tol, max_iter, verbose, rcond=None, return_cost=True, return_optimality=True): | ||
d, m, n = A.shape | ||
|
||
#x = x_lsq.copy() | ||
x = x_lsq | ||
on_bound = cp.zeros((d,n), dtype=cp.int32) | ||
|
||
mask = x <= lb | ||
x[mask] = lb[mask] | ||
on_bound[mask] = -1 | ||
|
||
mask = x >= ub | ||
x[mask] = ub[mask] | ||
on_bound[mask] = 1 | ||
|
||
free_set = on_bound == 0 | ||
active_set = ~free_set | ||
|
||
if return_cost or return_optimality: | ||
r = cp.einsum("ijk,ik->ij", A, x) - b | ||
#r = A.dot(x) - b | ||
cost = 0.5*cp.einsum("ij,ij->i", r, r) | ||
#cost = 0.5 * np.dot(r, r) | ||
initial_cost = cost | ||
if return_optimality: | ||
g = compute_grad(A,r) | ||
#g = A.T.dot(r) | ||
else: | ||
cost = None | ||
initial_cost = None | ||
|
||
cost_change = None | ||
step_norm = None | ||
iteration = 0 | ||
|
||
if verbose == 2: | ||
print_header_linear() | ||
|
||
# This is the initialization loop. The requirement is that the | ||
# least-squares solution on free variables is feasible before BVLS starts. | ||
# One possible initialization is to set all variables to lower or upper | ||
# bounds, but many iterations may be required from this state later on. | ||
# The implemented ad-hoc procedure which intuitively should give a better | ||
# initial state: find the least-squares solution on current free variables, | ||
# if its feasible then stop, otherwise, set violating variables to | ||
# corresponding bounds and continue on the reduced set of free variables. | ||
|
||
while free_set.size > 0: | ||
if verbose == 2 and return_optimality: | ||
optimality = compute_kkt_optimality(g, on_bound) | ||
print_iteration_linear(iteration, cost, cost_change, step_norm, | ||
optimality) | ||
|
||
iteration += 1 | ||
if verbose == 2: | ||
#Only need x_old if verbose == 2 | ||
x_old = x.copy() | ||
|
||
b_free = b - cp.einsum("ijk,ik->ij", A, x*active_set) | ||
A_free = A.swapaxes(1,2).copy() | ||
A_free[active_set] = 0 | ||
A_free = A_free.swapaxes(1,2) | ||
z = cp.einsum("ijk,ik->ij", cp.linalg.pinv(A_free), b_free) | ||
|
||
#A_free = A[:, free_set] | ||
#b_free = b - A.dot(x * active_set) | ||
#z = lstsq(A_free, b_free, rcond=rcond)[0] | ||
|
||
lbv = z < lb | ||
ubv = z > ub | ||
#lbv = z < lb[free_set] | ||
#ubv = z > ub[free_set] | ||
v = lbv | ubv | ||
|
||
if cp.any(lbv): | ||
#ind = free_set[lbv] | ||
#x[ind] = lb[ind] | ||
x[lbv] = lb[lbv] | ||
#active_set[ind] = True | ||
active_set[lbv] = True | ||
#on_bound[ind] = -1 | ||
on_bound[lbv] = -1 | ||
|
||
if cp.any(ubv): | ||
#ind = free_set[ubv] | ||
#x[ind] = ub[ind] | ||
x[ubv] = ub[ubv] | ||
#active_set[ind] = True | ||
active_set[ubv] = True | ||
#on_bound[ind] = 1 | ||
on_bound[ubv] = 1 | ||
|
||
#ind = free_set[~v] | ||
#x[ind] = z[~v] | ||
x[~v] = z[~v] | ||
|
||
r = cp.einsum("ijk,ik->ij", A, x) - b | ||
#r = A.dot(x) - b | ||
#cost_new = 0.5 * np.dot(r, r) | ||
cost_new = 0.5*cp.einsum("ij,ij->i", r, r) | ||
if return_cost: | ||
cost_change = cost - cost_new | ||
cost = cost_new | ||
#g = A.T.dot(r) | ||
g = compute_grad(A,r) | ||
if verbose == 2: | ||
#step_norm = norm(x[free_set] - x_free_old) | ||
step_norm = norm(x-x_old, axis=1) | ||
|
||
if cp.any(v): | ||
free_set = free_set[~v] | ||
else: | ||
break | ||
|
||
if max_iter is None: | ||
max_iter = n | ||
max_iter += iteration | ||
|
||
termination_status = None | ||
|
||
# Main BVLS loop. | ||
optimality = compute_kkt_optimality(g, on_bound) | ||
idx = np.arange(d, dtype=np.int32) | ||
for iteration in range(iteration, max_iter): # BVLS Loop A | ||
if verbose == 2 and return_optimality: | ||
print_iteration_linear(iteration, cost, cost_change, | ||
step_norm, optimality) | ||
|
||
if cp.all(optimality < tol): | ||
termination_status = 1 | ||
|
||
if termination_status is not None: | ||
break | ||
|
||
move_to_free = cp.argmax(g * on_bound, axis=1) | ||
on_bound[idx,move_to_free] = 0 | ||
|
||
while True: # BVLS Loop B | ||
free_set = on_bound == 0 | ||
active_set = ~free_set | ||
|
||
if verbose == 2: | ||
#Only need x_old if verbose == 2 | ||
x_old = x.copy() | ||
|
||
b_free = b - cp.einsum("ijk,ik->ij", A, x*active_set) | ||
#Set active_set to 0 in A_free | ||
A_free = A.swapaxes(1,2).copy() | ||
A_free[active_set] = 0 | ||
A_free = A_free.swapaxes(1,2) | ||
|
||
z = cp.einsum("ijk,ik->ij", cp.linalg.pinv(A_free), b_free) | ||
|
||
#A_free = A[:, free_set] | ||
#b_free = b - A.dot(x * active_set) | ||
#z = lstsq(A_free, b_free, rcond=rcond)[0] | ||
|
||
lbv = z < lb | ||
ubv = z > ub | ||
v = (lbv | ubv)*free_set | ||
#lbv, = np.nonzero(z < lb_free) | ||
#ubv, = np.nonzero(z > ub_free) | ||
#v = np.hstack((lbv, ubv)) | ||
|
||
#if v.size > 0: | ||
if v.any(): | ||
alphas = cp.zeros(x.shape) | ||
alphas[lbv] = (lb[lbv]-x[lbv]) / (z[lbv]-x[lbv]) | ||
alphas[ubv] = (ub[ubv]-x[ubv]) / (z[ubv]-x[ubv]) | ||
|
||
i = cp.argmin(alphas, axis=1) | ||
alpha = alphas[idx,i] | ||
x_free = x[v] * (1-alpha[v]) | ||
x_free += alpha[v]*z[v] | ||
x[v] = x_free | ||
|
||
#Update on_bound | ||
on_bound[cp.where(lbv[idx,i]), i[cp.where(lbv[idx,i])]] = -1 | ||
on_bound[cp.where(ubv[idx,i]), i[cp.where(ubv[idx,i])]] = 1 | ||
|
||
#alphas = np.hstack(( | ||
# lb_free[lbv] - x_free[lbv], | ||
# ub_free[ubv] - x_free[ubv])) / (z[v] - x_free[v]) | ||
|
||
#i = np.argmin(alphas) | ||
#i_free = v[i] | ||
#alpha = alphas[i] | ||
|
||
#x_free *= 1 - alpha | ||
#x_free += alpha * z | ||
#x[free_set] = x_free | ||
|
||
#if i < lbv.size: | ||
# on_bound[free_set[i_free]] = -1 | ||
#else: | ||
# on_bound[free_set[i_free]] = 1 | ||
else: | ||
x_free = z[free_set] | ||
x[free_set] = x_free | ||
break | ||
|
||
if verbose == 2: | ||
#step_norm = norm(x_free - x_free_old, axis=1) | ||
step_norm = norm(x - x_old, axis=1) | ||
|
||
r = cp.einsum("ijk,ik->ij", A, x) - b | ||
#r = A.dot(x) - b | ||
#cost_new = 0.5 * np.dot(r, r) | ||
cost_new = 0.5*cp.einsum("ij,ij->i", r, r) | ||
cost_change = cost - cost_new | ||
|
||
if cp.all(cost_change < tol * cost): | ||
termination_status = 2 | ||
cost = cost_new | ||
|
||
#g = A.T.dot(r) | ||
g = compute_grad(A,r) | ||
optimality = compute_kkt_optimality(g, on_bound) | ||
x_lsq = x | ||
|
||
if termination_status is None: | ||
termination_status = 0 | ||
|
||
return OptimizeResult( | ||
x=x_lsq, fun=r, cost=cost, optimality=optimality, active_mask=on_bound, | ||
nit=iteration + 1, status=termination_status, | ||
initial_cost=initial_cost) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,77 @@ | ||
"""Functions used by least-squares algorithms.""" | ||
import numpy as np | ||
import cupy as cp | ||
|
||
from scipy.sparse.linalg import LinearOperator, aslinearoperator | ||
|
||
|
||
# Utility functions to work with bound constraints. | ||
def in_bounds(x, lb, ub, arrtype=None): | ||
"""Check if a point lies within bounds.""" | ||
if arrtype is None: | ||
arrtype = cp | ||
return arrtype.all((x >= lb) & (x <= ub),axis=1) | ||
|
||
def all_in_bounds(x, lb, ub): | ||
"""Check if a point lies within bounds.""" | ||
return cp.all((x >= lb) & (x <= ub)) | ||
|
||
|
||
# Functions to display algorithm's progress. | ||
|
||
|
||
def print_header_nonlinear(): | ||
print("{:^15}{:^15}{:^15}{:^15}{:^15}{:^15}" | ||
.format("Iteration", "Total nfev", "Cost", "Cost reduction", | ||
"Step norm", "Optimality")) | ||
|
||
|
||
def print_iteration_nonlinear(iteration, nfev, cost, cost_reduction, | ||
step_norm, optimality): | ||
if cost_reduction is None: | ||
cost_reduction = " " * 15 | ||
else: | ||
cost_reduction = f"{cost_reduction:^15.2e}" | ||
|
||
if step_norm is None: | ||
step_norm = " " * 15 | ||
else: | ||
step_norm = f"{step_norm:^15.2e}" | ||
|
||
print("{:^15}{:^15}{:^15.4e}{}{}{:^15.2e}" | ||
.format(iteration, nfev, cost, cost_reduction, | ||
step_norm, optimality)) | ||
|
||
|
||
def print_header_linear(): | ||
print("{:^15}{:^15}{:^15}{:^15}{:^15}" | ||
.format("Iteration", "Cost", "Cost reduction", "Step norm", | ||
"Optimality")) | ||
|
||
|
||
def print_iteration_linear(iteration, cost, cost_reduction, step_norm, | ||
optimality): | ||
if cost_reduction is None: | ||
cost_reduction = " " * 15 | ||
else: | ||
cost_reduction = f"{cost_reduction:^15.2e}" | ||
|
||
if step_norm is None: | ||
step_norm = " " * 15 | ||
else: | ||
step_norm = f"{step_norm:^15.2e}" | ||
|
||
print(f"{iteration:^15}{cost:^15.4e}{cost_reduction}{step_norm}{optimality:^15.2e}") | ||
|
||
|
||
# Simple helper functions. | ||
|
||
|
||
def compute_grad(J, f): | ||
"""Compute gradient of the least-squares cost function.""" | ||
if isinstance(J, LinearOperator): | ||
return J.rmatvec(f) | ||
else: | ||
return cp.einsum("ikj,ik->ij", J.swapaxes(1,2), f) | ||
#return J.T.dot(f) | ||
|
Oops, something went wrong.