From e39aafd289193df56cb7e254a4aa174d3dd2dcff Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Mon, 4 Mar 2019 13:25:17 +0100 Subject: [PATCH 01/12] Adjust all algorithms to work with CuPy Requirements: * https://github.com/numpy/numpy/pull/13046 * https://github.com/cupy/cupy/pull/2079 --- dask_glm/algorithms.py | 57 ++++++++++++++++++++++++++---------------- dask_glm/utils.py | 16 +++++++----- 2 files changed, 46 insertions(+), 27 deletions(-) diff --git a/dask_glm/algorithms.py b/dask_glm/algorithms.py index 63d7729..5ef06f7 100644 --- a/dask_glm/algorithms.py +++ b/dask_glm/algorithms.py @@ -11,6 +11,7 @@ from scipy.optimize import fmin_l_bfgs_b +from dask.array.utils import normalize_to_array from dask_glm.utils import dot, normalize from dask_glm.families import Logistic from dask_glm.regularizers import Regularizer @@ -97,7 +98,7 @@ def gradient_descent(X, y, max_iter=100, tol=1e-14, family=Logistic, **kwargs): stepSize = 1.0 recalcRate = 10 backtrackMult = firstBacktrackMult - beta = np.zeros(p) + beta = np.zeros_like(X, shape=(p,)) for k in range(max_iter): # how necessary is this recalculation? @@ -161,7 +162,7 @@ def newton(X, y, max_iter=50, tol=1e-8, family=Logistic, **kwargs): """ gradient, hessian = family.gradient, family.hessian n, p = X.shape - beta = np.zeros(p) # always init to zeros? + beta = np.zeros_like(X, shape=(p,)) # always init to zeros? Xbeta = dot(X, beta) iter_count = 0 @@ -178,8 +179,10 @@ def newton(X, y, max_iter=50, tol=1e-8, family=Logistic, **kwargs): # should this be dask or numpy? # currently uses Python 3 specific syntax - step, _, _, _ = np.linalg.lstsq(hess, grad) - beta = (beta_old - step) + step, _, _, _ = np.linalg.lstsq(normalize_to_array(hess), normalize_to_array(grad)) + step_like = np.empty_like(X, shape=step.shape) + step_like[:] = step + beta = (beta_old - step_like) iter_count += 1 @@ -225,14 +228,19 @@ def admm(X, y, regularizer='l1', lamduh=0.1, rho=1, over_relax=1, def create_local_gradient(func): @functools.wraps(func) def wrapped(beta, X, y, z, u, rho): - return func(beta, X, y) + rho * (beta - z + u) + beta_like = np.empty_like(X, shape=beta.shape) + beta_like[:] = beta + return normalize_to_array(func(beta_like, X, y) + rho * + (beta_like - z + u)) return wrapped def create_local_f(func): @functools.wraps(func) def wrapped(beta, X, y, z, u, rho): - return func(beta, X, y) + (rho / 2) * np.dot(beta - z + u, - beta - z + u) + beta_like = np.empty_like(X, shape=beta.shape) + beta_like[:] = beta + return normalize_to_array(func(beta_like, X, y) + (rho / 2) * + np.dot(beta_like - z + u, beta_like - z + u)) return wrapped f = create_local_f(pointwise_loss) @@ -252,9 +260,9 @@ def wrapped(beta, X, y, z, u, rho): else: yD = [y] - z = np.zeros(p) - u = np.array([np.zeros(p) for i in range(nchunks)]) - betas = np.array([np.ones(p) for i in range(nchunks)]) + z = np.zeros_like(X, shape=(p,)) + u = np.stack([np.zeros_like(X, shape=(p,)) for i in range(nchunks)]) + betas = np.stack([np.ones_like(X, shape=(p,)) for i in range(nchunks)]) for k in range(max_iter): @@ -262,13 +270,13 @@ def wrapped(beta, X, y, z, u, rho): new_betas = [delayed(local_update)(xx, yy, bb, z, uu, rho, f=f, fprime=fprime) for xx, yy, bb, uu in zip(XD, yD, betas, u)] - new_betas = np.array(da.compute(*new_betas)) + new_betas = np.stack(da.compute(*new_betas), axis=0) beta_hat = over_relax * new_betas + (1 - over_relax) * z # z-update step zold = z.copy() - ztilde = np.mean(beta_hat + np.array(u), axis=0) + ztilde = np.mean(beta_hat + u, axis=0) z = 
regularizer.proximal_operator(ztilde, lamduh / (rho * nchunks)) # u-update step @@ -295,11 +303,14 @@ def local_update(X, y, beta, z, u, rho, f, fprime, solver=fmin_l_bfgs_b): u = u.ravel() z = z.ravel() solver_args = (X, y, z, u, rho) - beta, f, d = solver(f, beta, fprime=fprime, args=solver_args, + beta, f, d = solver(f, normalize_to_array(beta), + fprime=fprime, args=solver_args, maxiter=200, maxfun=250) - return beta + beta_like = np.empty_like(X, shape=beta.shape) + beta_like[:] = beta + return beta_like @normalize @@ -335,21 +346,25 @@ def lbfgs(X, y, regularizer=None, lamduh=1.0, max_iter=100, tol=1e-4, pointwise_gradient = regularizer.add_reg_grad(pointwise_gradient, lamduh) n, p = X.shape - beta0 = np.zeros(p) + beta0 = np.zeros_like(X, shape=(p,)) def compute_loss_grad(beta, X, y): - loss_fn = pointwise_loss(beta, X, y) - gradient_fn = pointwise_gradient(beta, X, y) + beta_like = np.empty_like(X, shape=beta.shape) + beta_like[:] = beta + loss_fn = pointwise_loss(beta_like, X, y) + gradient_fn = pointwise_gradient(beta_like, X, y) loss, gradient = compute(loss_fn, gradient_fn) - return loss, gradient.copy() + return normalize_to_array(loss), normalize_to_array(gradient.copy()) with dask.config.set(fuse_ave_width=0): # optimizations slows this down beta, loss, info = fmin_l_bfgs_b( - compute_loss_grad, beta0, fprime=None, + compute_loss_grad, normalize_to_array(beta0), fprime=None, args=(X, y), iprint=(verbose > 0) - 1, pgtol=tol, maxiter=max_iter) - return beta + beta_like = np.empty_like(X, shape=beta.shape) + beta_like[:] = beta + return beta_like @normalize @@ -384,7 +399,7 @@ def proximal_grad(X, y, regularizer='l1', lamduh=0.1, family=Logistic, stepSize = 1.0 recalcRate = 10 backtrackMult = firstBacktrackMult - beta = np.zeros(p) + beta = np.zeros_like(X, shape=(p,)) regularizer = Regularizer.get(regularizer) for k in range(max_iter): diff --git a/dask_glm/utils.py b/dask_glm/utils.py index fd91460..2930824 100644 --- a/dask_glm/utils.py +++ b/dask_glm/utils.py @@ -5,6 +5,7 @@ import dask.array as da import numpy as np +import cupy from functools import wraps from multipledispatch import dispatch @@ -21,7 +22,7 @@ def normalize_inputs(X, y, *args, **kwargs): raise ValueError('Multiple constant columns detected!') mean[intercept_idx] = 0 std[intercept_idx] = 1 - mean = mean if len(intercept_idx[0]) else np.zeros(mean.shape) + mean = mean if len(intercept_idx[0]) else np.zeros_like(mean) Xn = (X - mean) / std out = algo(Xn, y, *args, **kwargs).copy() i_adj = np.sum(out * mean / std) @@ -39,7 +40,7 @@ def sigmoid(x): @dispatch(object) def exp(A): - return A.exp() + return np.exp(A) @dispatch(float) @@ -74,7 +75,7 @@ def absolute(A): @dispatch(object) def sign(A): - return A.sign() + return np.sign(A) @dispatch(np.ndarray) @@ -89,7 +90,7 @@ def sign(A): @dispatch(object) def log1p(A): - return A.log1p() + return np.log1p(A) @dispatch(np.ndarray) @@ -105,8 +106,7 @@ def log1p(A): @dispatch(object, object) def dot(A, B): x = max([A, B], key=lambda x: getattr(x, '__array_priority__', 0)) - module = package_of(x) - return module.dot(A, B) + return np.dot(A, B) @dispatch(da.Array, np.ndarray) @@ -140,6 +140,10 @@ def sum(A): def add_intercept(X): return np.concatenate([X, np.ones((X.shape[0], 1))], axis=1) +@dispatch(cupy.ndarray) +def add_intercept(X): + return np.concatenate([X, np.ones_like(X, shape=(X.shape[0], 1))], axis=1) + @dispatch(da.Array) def add_intercept(X): From 4b0025747db6876121bec8051cdb947df5f578c1 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Mon, 4 
Mar 2019 14:31:10 +0100 Subject: [PATCH 02/12] CuPy Tests: add test_admm --- dask_glm/tests/cupy/test_admm.py | 62 ++++++++++++++++++++++++++++++++ dask_glm/utils.py | 5 +++ 2 files changed, 67 insertions(+) create mode 100644 dask_glm/tests/cupy/test_admm.py diff --git a/dask_glm/tests/cupy/test_admm.py b/dask_glm/tests/cupy/test_admm.py new file mode 100644 index 0000000..b7cdba4 --- /dev/null +++ b/dask_glm/tests/cupy/test_admm.py @@ -0,0 +1,62 @@ +import pytest + +from dask import persist +import numpy as np +import cupy + +from dask.array.utils import normalize_to_array +from dask_glm.algorithms import admm, local_update +from dask_glm.families import Logistic, Normal +from dask_glm.regularizers import L1 +from dask_glm.utils import cupy_make_y + + +@pytest.mark.parametrize('N', [1000, 10000]) +@pytest.mark.parametrize('beta', + [cupy.array([-1.5, 3]), + cupy.array([35, 2, 0, -3.2]), + cupy.array([-1e-2, 1e-4, 1.0, 2e-3, -1.2])]) +@pytest.mark.parametrize('family', [Logistic, Normal]) +def test_local_update(N, beta, family): + M = beta.shape[0] + X = cupy.random.random((N, M)) + y = cupy.random.random(N) > 0.4 + u = cupy.zeros(M) + z = cupy.random.random(M) + rho = 1e7 + + def create_local_gradient(func): + def wrapped(beta, X, y, z, u, rho): + beta_like = np.empty_like(X, shape=beta.shape) + beta_like[:] = beta + return normalize_to_array(func(beta_like, X, y) + rho * + (beta_like - z + u)) + return wrapped + + def create_local_f(func): + def wrapped(beta, X, y, z, u, rho): + beta_like = np.empty_like(X, shape=beta.shape) + beta_like[:] = beta + return normalize_to_array(func(beta_like, X, y) + (rho / 2) * + np.dot(beta_like - z + u, beta_like - z + u)) + return wrapped + + f = create_local_f(family.pointwise_loss) + fprime = create_local_gradient(family.pointwise_gradient) + + result = local_update(X, y, beta, z, u, rho, f=f, fprime=fprime) + + assert np.allclose(result, z, atol=2e-3) + + +@pytest.mark.parametrize('N', [1000, 10000]) +@pytest.mark.parametrize('p', [1, 5, 10]) +def test_admm_with_large_lamduh(N, p): + X = cupy.random.random((N, p)) + beta = cupy.random.random(p) + y = cupy_make_y(X, beta=cupy.array(beta)) + + X, y = persist(X, y) + z = admm(X, y, regularizer=L1(), lamduh=1e5, rho=20, max_iter=500) + + assert np.allclose(z, np.zeros(p), atol=1e-4) diff --git a/dask_glm/utils.py b/dask_glm/utils.py index 2930824..896dec6 100644 --- a/dask_glm/utils.py +++ b/dask_glm/utils.py @@ -164,6 +164,11 @@ def make_y(X, beta=np.array([1.5, -3]), chunks=2): y = da.random.random(z0.shape, chunks=z0.chunks) < sigmoid(z0) return y +def cupy_make_y(X, beta=cupy.array([1.5, -3])): + n, p = X.shape + z0 = X.dot(beta) + y = cupy.random.random(z0.shape) < sigmoid(z0) + return y def mean_squared_error(y_true, y_pred): return ((y_true - y_pred) ** 2).mean() From e96b326409337994f7b6bc9f725f53fea84de576 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Mon, 4 Mar 2019 14:36:07 +0100 Subject: [PATCH 03/12] CuPy Tests: add test_algos_families --- dask_glm/tests/cupy/test_algos_families.py | 143 +++++++++++++++++++++ 1 file changed, 143 insertions(+) create mode 100644 dask_glm/tests/cupy/test_algos_families.py diff --git a/dask_glm/tests/cupy/test_algos_families.py b/dask_glm/tests/cupy/test_algos_families.py new file mode 100644 index 0000000..bccc7b7 --- /dev/null +++ b/dask_glm/tests/cupy/test_algos_families.py @@ -0,0 +1,143 @@ +import pytest + +from dask import persist +import numpy as np +import cupy + +from dask_glm.algorithms import (newton, lbfgs, proximal_grad, + 
gradient_descent, admm) +from dask_glm.families import Logistic, Normal, Poisson +from dask_glm.regularizers import Regularizer +from dask_glm.utils import sigmoid, cupy_make_y + + +def add_l1(f, lam): + def wrapped(beta, X, y): + return f(beta, X, y) + lam * (np.abs(beta)).sum() + return wrapped + + +def make_intercept_data(N, p, seed=20009): + '''Given the desired number of observations (N) and + the desired number of variables (p), creates + random logistic data to test on.''' + + # set the seeds + cupy.random.seed(seed) + + X = cupy.random.random((N, p + 1)) + col_sums = X.sum(axis=0) + X = X / col_sums[None, :] + X[:, p] = 1 + y = cupy_make_y(X, beta=cupy.random.random(p + 1)) + + return X, y + + +@pytest.mark.parametrize('opt', + [lbfgs, + newton, + gradient_descent]) +@pytest.mark.parametrize('N, p, seed', + [(100, 2, 20009), + (250, 12, 90210), + (95, 6, 70605)]) +def test_methods(N, p, seed, opt): + X, y = make_intercept_data(N, p, seed=seed) + coefs = opt(X, y) + p = sigmoid(X.dot(coefs)) + + y_sum = y.sum() + p_sum = p.sum() + assert np.isclose(y_sum, p_sum, atol=1e-1) + + +@pytest.mark.parametrize('func,kwargs', [ + (newton, {'tol': 1e-5}), + (lbfgs, {'tol': 1e-8}), + (gradient_descent, {'tol': 1e-7}), +]) +@pytest.mark.parametrize('N', [1000]) +@pytest.mark.parametrize('nchunks', [1, 10]) +@pytest.mark.parametrize('family', [Logistic, Normal, Poisson]) +def test_basic_unreg_descent(func, kwargs, N, nchunks, family): + beta = cupy.random.normal(size=2) + M = len(beta) + X = cupy.random.random((N, M)) + y = cupy_make_y(X, beta=cupy.array(beta)) + + X, y = persist(X, y) + + result = func(X, y, family=family, **kwargs) + test_vec = cupy.random.normal(size=2) + + opt = family.pointwise_loss(result, X, y) + test_val = family.pointwise_loss(test_vec, X, y) + + assert opt < test_val + + +@pytest.mark.parametrize('func,kwargs', [ + (admm, {'abstol': 1e-4}), + (proximal_grad, {'tol': 1e-7}), +]) +@pytest.mark.parametrize('N', [1000]) +@pytest.mark.parametrize('nchunks', [1, 10]) +@pytest.mark.parametrize('family', [Logistic, Normal, Poisson]) +@pytest.mark.parametrize('lam', [0.01, 1.2, 4.05]) +@pytest.mark.parametrize('reg', [r() for r in Regularizer.__subclasses__()]) +def test_basic_reg_descent(func, kwargs, N, nchunks, family, lam, reg): + beta = cupy.random.normal(size=2) + M = len(beta) + X = cupy.random.random((N, M)) + y = cupy_make_y(X, beta=cupy.array(beta)) + + X, y = persist(X, y) + + result = func(X, y, family=family, lamduh=lam, regularizer=reg, **kwargs) + test_vec = cupy.random.normal(size=2) + + f = reg.add_reg_f(family.pointwise_loss, lam) + + opt = f(result, X, y) + test_val = f(test_vec, X, y) + + assert opt < test_val + + +@pytest.mark.parametrize('func,kwargs', [ + (admm, {'max_iter': 2}), + (proximal_grad, {'max_iter': 2}), + (newton, {'max_iter': 2}), + (gradient_descent, {'max_iter': 2}), +]) +def test_determinism(func, kwargs): + X, y = make_intercept_data(1000, 10) + + a = func(X, y, **kwargs) + b = func(X, y, **kwargs) + + assert (a == b).all() + + +try: + from distributed import Client + from distributed.utils_test import cluster, loop # flake8: noqa +except ImportError: + pass +else: + @pytest.mark.parametrize('func,kwargs', [ + (admm, {'max_iter': 2}), + (proximal_grad, {'max_iter': 2}), + (newton, {'max_iter': 2}), + (gradient_descent, {'max_iter': 2}), + ]) + def test_determinism_distributed(func, kwargs, loop): + with cluster() as (s, [a, b]): + with Client(s['address'], loop=loop) as c: + X, y = make_intercept_data(1000, 10) + + a = func(X, y, 
**kwargs) + b = func(X, y, **kwargs) + + assert (a == b).all() From c2892df77f8868da535118c84e65f58a39ee905a Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Mon, 4 Mar 2019 14:56:43 +0100 Subject: [PATCH 04/12] CuPy Tests: add test_regularizers --- dask_glm/tests/cupy/test_regularizers.py | 130 +++++++++++++++++++++++ 1 file changed, 130 insertions(+) create mode 100644 dask_glm/tests/cupy/test_regularizers.py diff --git a/dask_glm/tests/cupy/test_regularizers.py b/dask_glm/tests/cupy/test_regularizers.py new file mode 100644 index 0000000..c2aed1b --- /dev/null +++ b/dask_glm/tests/cupy/test_regularizers.py @@ -0,0 +1,130 @@ +import numpy as np +import cupy +import pytest +import cupy.testing as cpt +from dask_glm import regularizers as regs + + +@pytest.mark.parametrize('beta,expected', [ + (cupy.array([0, 0, 0]), 0), + (cupy.array([1, 2, 3]), 7) +]) +def test_l2_function(beta, expected): + assert regs.L2().f(beta) == expected + + +@pytest.mark.parametrize('beta', [ + cupy.array([0, 0, 0]), + cupy.array([1, 2, 3]) +]) +def test_l2_gradient(beta): + cpt.assert_array_equal(regs.L2().gradient(beta), beta) + + +@pytest.mark.parametrize('beta', [ + cupy.array([0, 0, 0]), + cupy.array([1, 2, 3]) +]) +def test_l2_hessian(beta): + cpt.assert_array_equal(regs.L2().hessian(beta), np.eye(len(beta))) + + +@pytest.mark.parametrize('beta,expected', [ + (cupy.array([0, 0, 0]), cupy.array([0, 0, 0])), + (cupy.array([1, 2, 3]), cupy.array([0.5, 1, 1.5])) +]) +def test_l2_proximal_operator(beta, expected): + cpt.assert_array_equal(regs.L2().proximal_operator(beta, 1), expected) + + +@pytest.mark.parametrize('beta,expected', [ + (cupy.array([0, 0, 0]), 0), + (cupy.array([-1, 2, 3]), 6) +]) +def test_l1_function(beta, expected): + assert regs.L1().f(beta) == expected + + +@pytest.mark.parametrize('beta,expected', [ + (cupy.array([1, 2, 3]), cupy.array([1, 1, 1])), + (cupy.array([-1, 2, 3]), cupy.array([-1, 1, 1])) +]) +def test_l1_gradient(beta, expected): + cpt.assert_array_equal(regs.L1().gradient(beta), expected) + + +@pytest.mark.parametrize('beta', [ + cupy.array([0.00000001, 1, 2]), + cupy.array([-0.00000001, 1, 2]), + cupy.array([0, 0, 0]) +]) +def test_l1_gradient_raises_near_zero(beta): + with pytest.raises(ValueError): + regs.L1().gradient(beta) + + +def test_l1_hessian(): + cpt.assert_array_equal(regs.L1().hessian(cupy.array([1, 2])), + cupy.array([[0, 0], [0, 0]])) + + +def test_l1_hessian_raises(): + with pytest.raises(ValueError): + regs.L1().hessian(cupy.array([0, 0, 0])) + + +@pytest.mark.parametrize('beta,expected', [ + (cupy.array([0, 0, 0]), cupy.array([0, 0, 0])), + (cupy.array([1, 2, 3]), cupy.array([0, 1, 2])) +]) +def test_l1_proximal_operator(beta, expected): + cpt.assert_array_equal(regs.L1().proximal_operator(beta, 1), expected) + + +@pytest.mark.parametrize('beta,expected', [ + (cupy.array([0, 0, 0]), 0), + (cupy.array([1, 2, 3]), 6.5) +]) +def test_elastic_net_function(beta, expected): + assert regs.ElasticNet().f(beta) == expected + + +def test_elastic_net_function_zero_weight_is_l2(): + beta = cupy.array([1, 2, 3]) + assert regs.ElasticNet(weight=0).f(beta) == regs.L2().f(beta) + + +def test_elastic_net_function_zero_weight_is_l1(): + beta = cupy.array([1, 2, 3]) + assert regs.ElasticNet(weight=1).f(beta) == regs.L1().f(beta) + + +def test_elastic_net_gradient(): + beta = cupy.array([1, 2, 3]) + cpt.assert_array_equal(regs.ElasticNet(weight=0.5).gradient(beta), cupy.array([1, 1.5, 2])) + + +def test_elastic_net_gradient_zero_weight_is_l2(): + beta = cupy.array([1, 
2, 3]) + cpt.assert_array_equal(regs.ElasticNet(weight=0).gradient(beta), regs.L2().gradient(beta)) + + +def test_elastic_net_gradient_zero_weight_is_l1(): + beta = cupy.array([1, 2, 3]) + cpt.assert_array_equal(regs.ElasticNet(weight=1).gradient(beta), regs.L1().gradient(beta)) + + +def test_elastic_net_hessian(): + beta = cupy.array([1, 2, 3]) + cpt.assert_array_equal(regs.ElasticNet(weight=0.5).hessian(beta), + np.eye(len(beta)) * regs.ElasticNet().weight) + + +def test_elastic_net_hessian_raises(): + with pytest.raises(ValueError): + regs.ElasticNet(weight=0.5).hessian(cupy.array([0, 1, 2])) + + +def test_elastic_net_proximal_operator(): + beta = cupy.array([1, 2, 3]) + cpt.assert_array_equal(regs.ElasticNet(weight=0.5).proximal_operator(beta, 1), beta) From 9029cc6cd31d4c1814acbda8539418175c72cc17 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Mon, 4 Mar 2019 15:00:24 +0100 Subject: [PATCH 05/12] CuPy Tests: add test_utils --- dask_glm/tests/cupy/test_utils.py | 59 +++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) create mode 100644 dask_glm/tests/cupy/test_utils.py diff --git a/dask_glm/tests/cupy/test_utils.py b/dask_glm/tests/cupy/test_utils.py new file mode 100644 index 0000000..15b6228 --- /dev/null +++ b/dask_glm/tests/cupy/test_utils.py @@ -0,0 +1,59 @@ +import pytest +import numpy as np +import cupy +import cupy.testing as cpt + +from dask_glm import utils +from dask.array.utils import assert_eq + + +def test_normalize_normalizes(): + @utils.normalize + def do_nothing(X, y): + return cupy.array([0.0, 1.0, 2.0]) + X = cupy.array([[1, 0, 0], [1, 2, 2]]) + y = cupy.array([0, 1, 0]) + res = do_nothing(X, y) + cpt.assert_array_equal(res, cupy.array([-3.0, 1.0, 2.0])) + + +def test_normalize_doesnt_normalize(): + @utils.normalize + def do_nothing(X, y): + return cupy.array([0.0, 1.0, 2.0]) + X = cupy.array([[1, 0, 0], [1, 2, 2]]) + y = cupy.array([0, 1, 0]) + res = do_nothing(X, y, normalize=False) + cpt.assert_array_equal(res, cupy.array([0, 1, 2])) + + +def test_normalize_normalizes_if_intercept_not_present(): + @utils.normalize + def do_nothing(X, y): + return cupy.array([0.0, 1.0, 2.0]) + X = cupy.array([[1, 0, 0], [3, 9.0, 2]]) + y = cupy.array([0, 1, 0]) + res = do_nothing(X, y) + cpt.assert_array_equal(res, cupy.array([0, 1 / 4.5, 2])) + + +def test_normalize_raises_if_multiple_constants(): + @utils.normalize + def do_nothing(X, y): + return cupy.array([0.0, 1.0, 2.0]) + X = cupy.array([[1, 2, 3], [1, 2, 3]]) + y = cupy.array([0, 1, 0]) + with pytest.raises(ValueError): + res = do_nothing(X, y) + + +def test_add_intercept(): + X = cupy.zeros((4, 4)) + result = utils.add_intercept(X) + expected = cupy.array([ + [0, 0, 0, 0, 1], + [0, 0, 0, 0, 1], + [0, 0, 0, 0, 1], + [0, 0, 0, 0, 1], + ], dtype=X.dtype) + cpt.assert_array_equal(result, expected) From 2aecf5f23bd44880d40300a9d6041dda0e7e571d Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Mon, 4 Mar 2019 17:03:20 +0100 Subject: [PATCH 06/12] Generalize add_intercept and remove CuPy specialization --- dask_glm/utils.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/dask_glm/utils.py b/dask_glm/utils.py index 896dec6..a65706e 100644 --- a/dask_glm/utils.py +++ b/dask_glm/utils.py @@ -136,11 +136,7 @@ def sum(A): return A.sum() -@dispatch(np.ndarray) -def add_intercept(X): - return np.concatenate([X, np.ones((X.shape[0], 1))], axis=1) - -@dispatch(cupy.ndarray) +@dispatch(object) def add_intercept(X): return np.concatenate([X, np.ones_like(X, shape=(X.shape[0], 1))], 
axis=1) From fd4269ae67aef86e824a773e309eb46426a713d6 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Mon, 4 Mar 2019 17:04:56 +0100 Subject: [PATCH 07/12] Move cupy_make_y() to dask_glm.tests.cupy.utils module Also skip CuPy tests if module 'cupy' isn't found. --- dask_glm/tests/__init__.py | 0 dask_glm/tests/cupy/__init__.py | 0 dask_glm/tests/cupy/test_admm.py | 4 ++-- dask_glm/tests/cupy/test_algos_families.py | 5 +++-- dask_glm/tests/cupy/test_regularizers.py | 5 +++-- dask_glm/tests/cupy/test_utils.py | 3 ++- dask_glm/tests/cupy/utils.py | 8 ++++++++ dask_glm/utils.py | 7 ------- 8 files changed, 18 insertions(+), 14 deletions(-) create mode 100644 dask_glm/tests/__init__.py create mode 100644 dask_glm/tests/cupy/__init__.py create mode 100644 dask_glm/tests/cupy/utils.py diff --git a/dask_glm/tests/__init__.py b/dask_glm/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/dask_glm/tests/cupy/__init__.py b/dask_glm/tests/cupy/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/dask_glm/tests/cupy/test_admm.py b/dask_glm/tests/cupy/test_admm.py index b7cdba4..861f0fe 100644 --- a/dask_glm/tests/cupy/test_admm.py +++ b/dask_glm/tests/cupy/test_admm.py @@ -1,14 +1,14 @@ import pytest +cupy = pytest.importorskip('cupy') from dask import persist import numpy as np -import cupy from dask.array.utils import normalize_to_array from dask_glm.algorithms import admm, local_update from dask_glm.families import Logistic, Normal from dask_glm.regularizers import L1 -from dask_glm.utils import cupy_make_y +from dask_glm.tests.cupy.utils import cupy_make_y @pytest.mark.parametrize('N', [1000, 10000]) diff --git a/dask_glm/tests/cupy/test_algos_families.py b/dask_glm/tests/cupy/test_algos_families.py index bccc7b7..0c7c33a 100644 --- a/dask_glm/tests/cupy/test_algos_families.py +++ b/dask_glm/tests/cupy/test_algos_families.py @@ -1,14 +1,15 @@ import pytest +cupy = pytest.importorskip('cupy') from dask import persist import numpy as np -import cupy from dask_glm.algorithms import (newton, lbfgs, proximal_grad, gradient_descent, admm) from dask_glm.families import Logistic, Normal, Poisson from dask_glm.regularizers import Regularizer -from dask_glm.utils import sigmoid, cupy_make_y +from dask_glm.utils import sigmoid +from dask_glm.tests.cupy.utils import cupy_make_y def add_l1(f, lam): diff --git a/dask_glm/tests/cupy/test_regularizers.py b/dask_glm/tests/cupy/test_regularizers.py index c2aed1b..fa76d7a 100644 --- a/dask_glm/tests/cupy/test_regularizers.py +++ b/dask_glm/tests/cupy/test_regularizers.py @@ -1,6 +1,7 @@ -import numpy as np -import cupy import pytest +cupy = pytest.importorskip('cupy') + +import numpy as np import cupy.testing as cpt from dask_glm import regularizers as regs diff --git a/dask_glm/tests/cupy/test_utils.py b/dask_glm/tests/cupy/test_utils.py index 15b6228..e89e978 100644 --- a/dask_glm/tests/cupy/test_utils.py +++ b/dask_glm/tests/cupy/test_utils.py @@ -1,6 +1,7 @@ import pytest +cupy = pytest.importorskip('cupy') + import numpy as np -import cupy import cupy.testing as cpt from dask_glm import utils diff --git a/dask_glm/tests/cupy/utils.py b/dask_glm/tests/cupy/utils.py new file mode 100644 index 0000000..de437c7 --- /dev/null +++ b/dask_glm/tests/cupy/utils.py @@ -0,0 +1,8 @@ +import cupy +from dask_glm.utils import sigmoid + +def cupy_make_y(X, beta=cupy.array([1.5, -3])): + n, p = X.shape + z0 = X.dot(beta) + y = cupy.random.random(z0.shape) < sigmoid(z0) + return y diff --git a/dask_glm/utils.py 
b/dask_glm/utils.py index a65706e..ea2ed53 100644 --- a/dask_glm/utils.py +++ b/dask_glm/utils.py @@ -5,7 +5,6 @@ import dask.array as da import numpy as np -import cupy from functools import wraps from multipledispatch import dispatch @@ -160,12 +159,6 @@ def make_y(X, beta=np.array([1.5, -3]), chunks=2): y = da.random.random(z0.shape, chunks=z0.chunks) < sigmoid(z0) return y -def cupy_make_y(X, beta=cupy.array([1.5, -3])): - n, p = X.shape - z0 = X.dot(beta) - y = cupy.random.random(z0.shape) < sigmoid(z0) - return y - def mean_squared_error(y_true, y_pred): return ((y_true - y_pred) ** 2).mean() From f4a15b14e14a7acf0cc84dc6b8e45189132172f1 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Mon, 4 Mar 2019 17:16:38 +0100 Subject: [PATCH 08/12] Replace dask_glm.utils dispatchers by the NumPy function Functions replaced: * abs() * exp() * log1p() * sign() --- dask_glm/datasets.py | 5 ++- dask_glm/estimators.py | 5 +-- dask_glm/families.py | 13 +++---- dask_glm/tests/test_utils.py | 4 +-- dask_glm/utils.py | 69 ++---------------------------------- 5 files changed, 16 insertions(+), 80 deletions(-) diff --git a/dask_glm/datasets.py b/dask_glm/datasets.py index e89dd4f..f4c986e 100644 --- a/dask_glm/datasets.py +++ b/dask_glm/datasets.py @@ -1,6 +1,5 @@ import numpy as np import dask.array as da -from dask_glm.utils import exp def make_classification(n_samples=1000, n_features=100, n_informative=2, scale=1.0, @@ -40,7 +39,7 @@ def make_classification(n_samples=1000, n_features=100, n_informative=2, scale=1 informative_idx = np.random.choice(n_features, n_informative) beta = (np.random.random(n_features) - 1) * scale z0 = X[:, informative_idx].dot(beta[informative_idx]) - y = da.random.random(z0.shape, chunks=(chunksize,)) < 1 / (1 + da.exp(-z0)) + y = da.random.random(z0.shape, chunks=(chunksize,)) < 1 / (1 + np.exp(-z0)) return X, y @@ -122,6 +121,6 @@ def make_poisson(n_samples=1000, n_features=100, n_informative=2, scale=1.0, informative_idx = np.random.choice(n_features, n_informative) beta = (np.random.random(n_features) - 1) * scale z0 = X[:, informative_idx].dot(beta[informative_idx]) - rate = exp(z0) + rate = np.exp(z0) y = da.random.poisson(rate, size=1, chunks=(chunksize,)) return X, y diff --git a/dask_glm/estimators.py b/dask_glm/estimators.py index 435f7e6..fa4215a 100644 --- a/dask_glm/estimators.py +++ b/dask_glm/estimators.py @@ -1,12 +1,13 @@ """ Models following scikit-learn's estimator API. """ +import numpy as np from sklearn.base import BaseEstimator from . import algorithms from . 
import families from .utils import ( - sigmoid, dot, add_intercept, mean_squared_error, accuracy_score, exp, + sigmoid, dot, add_intercept, mean_squared_error, accuracy_score, poisson_deviance ) @@ -227,7 +228,7 @@ class PoissonRegression(_GLM): def predict(self, X): X_ = self._maybe_add_intercept(X) - return exp(dot(X_, self._coef)) + return np.exp(dot(X_, self._coef)) def get_deviance(self, X, y): return poisson_deviance(y, self.predict(X)) diff --git a/dask_glm/families.py b/dask_glm/families.py index d03f0ff..a609938 100644 --- a/dask_glm/families.py +++ b/dask_glm/families.py @@ -1,6 +1,7 @@ from __future__ import absolute_import, division, print_function -from dask_glm.utils import dot, exp, log1p, sigmoid +from dask_glm.utils import dot, sigmoid +import numpy as np class Logistic(object): @@ -20,8 +21,8 @@ def loglike(Xbeta, y): Xbeta : array, shape (n_samples, n_features) y : array, shape (n_samples) """ - enXbeta = exp(-Xbeta) - return (Xbeta + log1p(enXbeta)).sum() - dot(y, Xbeta) + enXbeta = np.exp(-Xbeta) + return (Xbeta + np.log1p(enXbeta)).sum() - dot(y, Xbeta) @staticmethod def pointwise_loss(beta, X, y): @@ -92,7 +93,7 @@ class Poisson(object): """ @staticmethod def loglike(Xbeta, y): - eXbeta = exp(Xbeta) + eXbeta = np.exp(Xbeta) yXbeta = y * Xbeta return (eXbeta - yXbeta).sum() @@ -110,11 +111,11 @@ def pointwise_gradient(beta, X, y): @staticmethod def gradient(Xbeta, X, y): - eXbeta = exp(Xbeta) + eXbeta = np.exp(Xbeta) return dot(X.T, eXbeta - y) @staticmethod def hessian(Xbeta, X): - eXbeta = exp(Xbeta) + eXbeta = np.exp(Xbeta) x_diag_eXbeta = eXbeta[:, None] * X return dot(X.T, x_diag_eXbeta) diff --git a/dask_glm/tests/test_utils.py b/dask_glm/tests/test_utils.py index 813b2fd..9105c0e 100644 --- a/dask_glm/tests/test_utils.py +++ b/dask_glm/tests/test_utils.py @@ -75,6 +75,6 @@ def test_sparse(): from sparse.utils import assert_eq x = sparse.COO({(0, 0): 1, (1, 2): 2, (2, 1): 3}) y = x.todense() - assert utils.sum(x) == utils.sum(x.todense()) - for func in [utils.sigmoid, utils.sum, utils.exp]: + assert np.sum(x) == np.sum(x.todense()) + for func in [utils.sigmoid, np.sum, np.exp]: assert_eq(func(x), func(y)) diff --git a/dask_glm/utils.py b/dask_glm/utils.py index ea2ed53..c588941 100644 --- a/dask_glm/utils.py +++ b/dask_glm/utils.py @@ -34,72 +34,7 @@ def normalize_inputs(X, y, *args, **kwargs): def sigmoid(x): """Sigmoid function of x.""" - return 1 / (1 + exp(-x)) - - -@dispatch(object) -def exp(A): - return np.exp(A) - - -@dispatch(float) -def exp(A): - return np.exp(A) - - -@dispatch(np.ndarray) -def exp(A): - return np.exp(A) - - -@dispatch(da.Array) -def exp(A): - return da.exp(A) - - -@dispatch(object) -def absolute(A): - return abs(A) - - -@dispatch(np.ndarray) -def absolute(A): - return np.absolute(A) - - -@dispatch(da.Array) -def absolute(A): - return da.absolute(A) - - -@dispatch(object) -def sign(A): - return np.sign(A) - - -@dispatch(np.ndarray) -def sign(A): - return np.sign(A) - - -@dispatch(da.Array) -def sign(A): - return da.sign(A) - - -@dispatch(object) -def log1p(A): - return np.log1p(A) - - -@dispatch(np.ndarray) -def log1p(A): - return np.log1p(A) - - -@dispatch(da.Array) -def log1p(A): - return da.log1p(A) + return 1 / (1 + np.exp(-x)) @dispatch(object, object) @@ -168,7 +103,7 @@ def accuracy_score(y_true, y_pred): def poisson_deviance(y_true, y_pred): - return 2 * (y_true * log1p(y_true / y_pred) - (y_true - y_pred)).sum() + return 2 * (y_true * np.log1p(y_true / y_pred) - (y_true - y_pred)).sum() try: From 
c1ad677e563643a7f801ead1e657ee4a57618e10 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Mon, 4 Mar 2019 19:10:23 +0100 Subject: [PATCH 09/12] Remove dask_glm.utils.sum dispatcher --- dask_glm/utils.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/dask_glm/utils.py b/dask_glm/utils.py index c588941..add96a1 100644 --- a/dask_glm/utils.py +++ b/dask_glm/utils.py @@ -65,11 +65,6 @@ def dot(A, B): return da.dot(A, B) -@dispatch(object) -def sum(A): - return A.sum() - - @dispatch(object) def add_intercept(X): return np.concatenate([X, np.ones_like(X, shape=(X.shape[0], 1))], axis=1) From 9127afa0be2bae95a4128114c58ab6c99f005657 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Tue, 5 Mar 2019 17:39:01 +0100 Subject: [PATCH 10/12] Add CuPy datasets --- dask_glm/cupy/__init__.py | 0 dask_glm/cupy/datasets.py | 114 ++++++++++++++++++++++++++++++++++++++ setup.py | 2 +- 3 files changed, 115 insertions(+), 1 deletion(-) create mode 100644 dask_glm/cupy/__init__.py create mode 100644 dask_glm/cupy/datasets.py diff --git a/dask_glm/cupy/__init__.py b/dask_glm/cupy/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/dask_glm/cupy/datasets.py b/dask_glm/cupy/datasets.py new file mode 100644 index 0000000..e3c38d4 --- /dev/null +++ b/dask_glm/cupy/datasets.py @@ -0,0 +1,114 @@ +import numpy as np +import cupy + + +def make_classification(n_samples=1000, n_features=100, n_informative=2, scale=1.0): + """ + Generate a dummy dataset for classification tasks. + + Parameters + ---------- + n_samples : int + number of rows in the output array + n_features : int + number of columns (features) in the output array + n_informative : int + number of features that are correlated with the outcome + scale : float + Scale the true coefficient array by this + + Returns + ------- + X : dask.array, size ``(n_samples, n_features)`` + y : dask.array, size ``(n_samples,)`` + boolean-valued array + + Examples + -------- + >>> X, y = make_classification() + >>> X.dtype, X.shape + (dtype('float64'), (1000, 100)) + >>> y.dtype, y.shape + (dtype('bool'), (1000,)) + """ + X = cupy.random.normal(0, 1, size=(n_samples, n_features)) + informative_idx = np.random.choice(n_features, n_informative) + beta = (cupy.random.random(n_features) - 1) * scale + z0 = X[:, informative_idx].dot(beta[informative_idx]) + y = cupy.random.random(z0.shape) < 1 / (1 + np.exp(-z0)) + return X, y + + +def make_regression(n_samples=1000, n_features=100, n_informative=2, scale=1.0): + """ + Generate a dummy dataset for regression tasks. 
+ + Parameters + ---------- + n_samples : int + number of rows in the output array + n_features : int + number of columns (features) in the output array + n_informative : int + number of features that are correlated with the outcome + scale : float + Scale the true coefficient array by this + + Returns + ------- + X : dask.array, size ``(n_samples, n_features)`` + y : dask.array, size ``(n_samples,)`` + real-valued array + + Examples + -------- + >>> X, y = make_regression() + >>> X.dtype, X.shape + (dtype('float64'), (1000, 100)) + >>> y.dtype, y.shape + (dtype('float64'), (1000,)) + """ + X = cupy.random.normal(0, 1, size=(n_samples, n_features)) + informative_idx = cupy.random.choice(n_features, n_informative) + beta = (cupy.random.random(n_features) - 1) * scale + z0 = X[:, informative_idx].dot(beta[informative_idx]) + y = cupy.random.random(z0.shape) + return X, y + + +def make_poisson(n_samples=1000, n_features=100, n_informative=2, scale=1.0): + """ + Generate a dummy dataset for modeling count data. + + Parameters + ---------- + n_samples : int + number of rows in the output array + n_features : int + number of columns (features) in the output array + n_informative : int + number of features that are correlated with the outcome + scale : float + Scale the true coefficient array by this + + Returns + ------- + X : dask.array, size ``(n_samples, n_features)`` + y : dask.array, size ``(n_samples,)`` + array of non-negative integer-valued data + + Examples + -------- + >>> X, y = make_poisson() + >>> X.dtype, X.shape + (dtype('float64'), (1000, 100)) + >>> y.dtype, y.shape + (dtype('int64'), (1000,)) + """ + X = cupy.random.normal(0, 1, size=(n_samples, n_features)) + informative_idx = cupy.random.choice(n_features, n_informative) + beta = (cupy.random.random(n_features) - 1) * scale + z0 = X[:, informative_idx].dot(beta[informative_idx]) + rate = np.exp(z0) + y = cupy.random.poisson(rate) + return X, y diff --git a/setup.py b/setup.py index 7a7870f..3bd2efb 100755 --- a/setup.py +++ b/setup.py @@ -11,7 +11,7 @@ maintainer_email='mrocklin@gmail.com', license='BSD', keywords='dask,glm', - packages=['dask_glm'], + packages=['dask_glm', 'dask_glm.cupy'], long_description=(open('README.rst').read() if exists('README.rst') else ''), install_requires=list(open('requirements.txt').read().strip().split('\n')), From 69813a044173d7addb9513631739d8fe707f8211 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Tue, 5 Mar 2019 21:27:41 +0100 Subject: [PATCH 11/12] Update estimators documentation with CuPy example --- docs/estimators.rst | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/docs/estimators.rst b/docs/estimators.rst index 5d4c011..f20606a 100644 --- a/docs/estimators.rst +++ b/docs/estimators.rst @@ -35,3 +35,33 @@ and can score known observations with the ``.score`` method. array([False, False, False, True, ... True, False, True, True], dtype=bool) See the :ref:`api-reference` for more. + + +CuPy +---- + +Similarly to Dask Array, we can use CuPy arrays to fit and predict data on a +CUDA-capable GPU. From the example given in the previous sections, we only need +a couple of trivial changes. First, we load the CuPy datasets: + +.. code-block:: python + + >>> from dask_glm.cupy.datasets import make_classification + +Second, as CuPy does not defer computation, we remove the ``.compute()`` call +after the predictor: + +.. code-block:: python + + >>> lr.predict(X) + +The full code should look like: + +.. 
code-block:: python + + >>> from dask_glm.estimators import LogisticRegression + >>> from dask_glm.cupy.datasets import make_classification + >>> X, y = make_classification() + >>> lr = LogisticRegression() + >>> lr.fit(X, y) + >>> lr.predict(X) From 86e45b9ae9af5e1ef98f548bc04370ef9e86a9af Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Tue, 5 Mar 2019 21:38:27 +0100 Subject: [PATCH 12/12] Add benchmark notebooks for Dask and CuPy solvers --- docs/examples/BenchmarkSolversCuPy.ipynb | 81 ++++++++++++++++++++++++ docs/examples/BenchmarkSolversDask.ipynb | 81 ++++++++++++++++++++++++ 2 files changed, 162 insertions(+) create mode 100644 docs/examples/BenchmarkSolversCuPy.ipynb create mode 100644 docs/examples/BenchmarkSolversDask.ipynb diff --git a/docs/examples/BenchmarkSolversCuPy.ipynb b/docs/examples/BenchmarkSolversCuPy.ipynb new file mode 100644 index 0000000..e04c856 --- /dev/null +++ b/docs/examples/BenchmarkSolversCuPy.ipynb @@ -0,0 +1,81 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from dask_glm.estimators import LogisticRegression\n", + "from dask_glm.cupy.datasets import make_classification\n", + "import time\n", + "\n", + "X, y = make_classification()" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "solvers = ['admm', 'lbfgs', 'newton', 'proximal_grad', 'gradient_descent']" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Solver admm took 263.742 ms to fit and 1.146 ms to predict\n", + "Solver lbfgs took 13.935 ms to fit and 0.260 ms to predict\n", + "Solver newton took 22.480 ms to fit and 0.248 ms to predict\n", + "Solver proximal_grad took 11.700 ms to fit and 0.257 ms to predict\n", + "Solver gradient_descent took 14.580 ms to fit and 0.257 ms to predict\n" + ] + } + ], + "source": [ + "for s in solvers:\n", + " lr = LogisticRegression(solver=s)\n", + "\n", + " start_fit = time.time()\n", + " lr.fit(X, y)\n", + " end_fit = time.time()\n", + " lr.predict(X)\n", + " end_predict = time.time()\n", + "\n", + " fit_time = (end_fit - start_fit) * 1000.0\n", + " predict_time = (end_predict - end_fit) * 1000.0\n", + "\n", + " print(\"Solver %s took %0.3f ms to fit and %0.3f ms to predict\" % (s, fit_time,\n", + " predict_time))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.2" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/docs/examples/BenchmarkSolversDask.ipynb b/docs/examples/BenchmarkSolversDask.ipynb new file mode 100644 index 0000000..88ec91b --- /dev/null +++ b/docs/examples/BenchmarkSolversDask.ipynb @@ -0,0 +1,81 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from dask_glm.estimators import LogisticRegression\n", + "from dask_glm.datasets import make_classification\n", + "import time\n", + "\n", + "X, y = make_classification()" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "solvers = ['admm', 'lbfgs', 'newton', 'proximal_grad', 
'gradient_descent']" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Solver admm took 22444.307 ms to fit and 41.299 ms to predict\n", + "Solver lbfgs took 1561.047 ms to fit and 41.063 ms to predict\n", + "Solver newton took 785.150 ms to fit and 42.822 ms to predict\n", + "Solver proximal_grad took 1902.620 ms to fit and 45.349 ms to predict\n", + "Solver gradient_descent took 2483.603 ms to fit and 42.605 ms to predict\n" + ] + } + ], + "source": [ + "for s in solvers:\n", + " lr = LogisticRegression(solver=s)\n", + "\n", + " start_fit = time.time()\n", + " lr.fit(X, y)\n", + " end_fit = time.time()\n", + " lr.predict(X).compute()\n", + " end_predict = time.time()\n", + "\n", + " fit_time = (end_fit - start_fit) * 1000.0\n", + " predict_time = (end_predict - end_fit) * 1000.0\n", + "\n", + " print(\"Solver %s took %0.3f ms to fit and %0.3f ms to predict\" % (s, fit_time,\n", + " predict_time))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.2" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}
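
Note on the dispatch mechanism the series relies on: the first patch replaces calls such as np.zeros(p) with np.zeros_like(X, shape=(p,)) so that the created array takes the backend of X. Below is a minimal illustrative sketch (not part of the patches) assuming a NumPy release with the __array_function__ protocol and the shape= argument to the *_like creators (https://github.com/numpy/numpy/pull/13046) and a CuPy build including https://github.com/cupy/cupy/pull/2079:

    import numpy as np
    import cupy

    X_cpu = np.random.random((10, 3))
    X_gpu = cupy.random.random((10, 3))

    # np.zeros_like dispatches on the type of its argument, so the same call
    # returns a NumPy array for NumPy input and a CuPy array for CuPy input;
    # this keeps the solvers backend-agnostic without branching on array type.
    beta_cpu = np.zeros_like(X_cpu, shape=(3,))  # numpy.ndarray of zeros
    beta_gpu = np.zeros_like(X_gpu, shape=(3,))  # cupy.ndarray of zeros

    # SciPy's fmin_l_bfgs_b only understands NumPy arrays, which is why the
    # patches convert inputs with normalize_to_array() before calling it and
    # copy the result back into an array like X afterwards.
    beta_host = cupy.asnumpy(beta_gpu)  # explicit device-to-host copy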