diff --git a/pymc3/step_methods/hmc/base_hmc.py b/pymc3/step_methods/hmc/base_hmc.py index 0c594b09e8..da768ffd52 100644 --- a/pymc3/step_methods/hmc/base_hmc.py +++ b/pymc3/step_methods/hmc/base_hmc.py @@ -44,7 +44,7 @@ def __init__(self, vars=None, scaling=None, step_scale=0.25, is_cov=False, if isinstance(scaling, dict): scaling = guess_scaling(Point(scaling, model=model), model=model, vars=vars) - n = scaling.shape[0] + n = model.dict_to_array(model.test_point).shape[0] self.step_size = step_scale / (n ** 0.25) self.potential = quad_potential(scaling, is_cov, as_cov=False) diff --git a/pymc3/step_methods/hmc/quadpotential.py b/pymc3/step_methods/hmc/quadpotential.py index 835e44174c..b4d62f6b24 100644 --- a/pymc3/step_methods/hmc/quadpotential.py +++ b/pymc3/step_methods/hmc/quadpotential.py @@ -1,15 +1,19 @@ from numpy import dot from numpy.random import normal import scipy.linalg +from scipy import linalg from theano.tensor import slinalg +import theano.tensor as tt +import theano from scipy.sparse import issparse +import itertools from pymc3.theanof import floatX import numpy as np __all__ = ['quad_potential', 'ElemWiseQuadPotential', 'QuadPotential', - 'QuadPotential_Inv', 'isquadpotential'] + 'QuadPotential_Inv', 'isquadpotential', 'QuadPotentialSample'] def quad_potential(C, is_cov, as_cov): @@ -18,7 +22,8 @@ def quad_potential(C, is_cov, as_cov): ---------- C : arraylike, 0 <= ndim <= 2 scaling matrix for the potential - vector treated as diagonal matrix + vector treated as diagonal matrix. + If C is already a potential, it is returned unchanged. is_cov : Boolean whether C is provided as a covariance matrix or hessian as_cov : Boolean @@ -29,6 +34,8 @@ def quad_potential(C, is_cov, as_cov): ------- q : Quadpotential """ + if isquadpotential(C): + return C if issparse(C): if not chol_available: @@ -135,6 +142,150 @@ def energy(self, x): __call__ = random + +def _solve_lanczos(V, alphas, betas, estimator=None): + e = np.zeros(len(alphas)) + e[0] = 1 + T = [alphas, betas] + # TODO Is there a way to avoid computing all eigenvectors? + eig, eigv = linalg.eig_banded(T, lower=True) + if estimator is None: + if np.any(eig < 0): + raise ValueError("Matrix is not positive definite") + eig = 1 / np.sqrt(eig) + else: + eig = estimator(eig) + f_T = eigv @ (eig * np.linalg.solve(eigv, e)) + return np.dot(np.transpose(V), f_T) + + +def _sample_mvn_lanczos(matprot, z, epsilon, maxiter=None, + verbose=False, estimator=None): + """Sample from a multivariate normal using matrix-free lanczos. + + Given a matrix :math:`A`, sample from :math:`N(0, A^{-1})`, using + only matrix vector products :math:`Ax`. This is done by approximating + :math:`A^{-1/2}z` for a :math:`z \sim N(0, I)` using a Lanczos process + as described in [1]. + + Parameters + ---------- + matprot : function(x) -> Ax + A function that computes the matrix-vector product :math:`Ax`. + z : ndarray + A standard normal distributed vector. + epsilon : float + Stop iteration, if the relative change of the estimate is below + `epsilon`. + maxiter : int, optional + Maximum number of iterations. + estimator : function, optional + A function that maps the eigenvalues of the krylov subspace to + what is supposed to be estimated. By default this is + `lambda eig: 1 / np.sqrt(eig)`. If you want to sample from + :math:`N(0, A)` instead of from :math:`N(0, A^{-1/2})`, you can + set this to `lambda eig: np.sqrt(eig)`. + + References + ---------- + [1] Chow, E., and Y. Saad. “Preconditioned Krylov Subspace Methods for + Sampling Multivariate Gaussian Distributions.” SIAM Journal on + Scientific Computing 36, no. 2 (January 1, 2014): A588–608. + doi:10.1137/130920587. + """ + n = len(z) + z_norm = np.linalg.norm(z) + z = z / z_norm + q = z.copy() + beta = 1 + + alphas = [] + betas = [] + V = [] + + for n in itertools.count(): + if maxiter is not None and n > maxiter: + raise ArithmeticError("Maximum number of iterations reached.") + v = q / beta + if n > 0: + q = matprot(v) - beta * V[-1] + else: + q = matprot(v) + + alpha = v @ q + q = q - alpha * v + beta = np.linalg.norm(q) + + alphas.append(alpha) + betas.append(beta) + V.append(v) + + y_new = _solve_lanczos(V, alphas, betas, estimator) + + if beta == 0: + return z_norm * y_new + + if n > 0: + error = np.linalg.norm(y_new - y_old) / np.linalg.norm(y_new) + if np.isnan(error): + raise ArithmeticError("Current error is nan.") + if verbose and n % 100 == 0: + print(error) + if error < epsilon: + return z_norm * y_new + y_old = y_new + + +class QuadPotentialSample: + def __init__(self, samples, lam=0.01, epsilon=1e-8, maxiter=None): + """Use posterior samples to estimate the covariance. + + Parameters + ---------- + samples : ndarray + An array of posterior samples with shape `(s, n)`, where + `s` is the number of samples and `n` the number of parameters. + lam : float + Regularization parameter. Use :math:`(1 - \lambda)\Sigma + + \lambda D` as covariance, where `\Sigma` is the sample covariance + and D is the diagonal matrix containing the sample variances. + epsilon : float, optional + Error tolerance used in the convergence criterion of the + lanczos process. + maxiter : int, optional + The maximum number of iterations in the lanczos process. + """ + self.k, self.n = samples.shape + self.lam = lam + self.samples = samples.astype(np.float64).copy() + self.samples[:] -= self.samples.mean(axis=0) + self.stds = self.samples.std(axis=0) + self.vars = self.stds ** 2 + self.epsilon = epsilon + self.maxiter = maxiter + + def velocity(self, x): + samples = theano.shared(floatX(np.copy(self.samples, 'C'))) + D = theano.shared(floatX(self.vars)) + Ax = tt.dot(samples, x) + Ax = tt.dot(samples.T, Ax) / (self.k - 1) + return floatX(1 - self.lam) * Ax + floatX(self.lam) * D * x + + def matvec(self, x): + Ax = self.samples.T.dot(self.samples.dot(x)) / (self.k - 1) + return (1 - self.lam) * Ax + self.lam * x * self.vars + + def energy(self, x): + return floatX(0.5) * x.dot(self.velocity(x)) + + def random(self): + b = normal(size=self.n) + # Use the diagonal matrix as preconditioner + matvec = lambda x: self.matvec(x / self.stds) / self.stds + sample = _sample_mvn_lanczos(matvec, b, self.epsilon, self.maxiter) + return floatX(sample / self.stds) + + try: import sksparse.cholmod as cholmod chol_available = True diff --git a/pymc3/tests/test_quadpotential.py b/pymc3/tests/test_quadpotential.py index 9a7a5da052..d8e5c158db 100644 --- a/pymc3/tests/test_quadpotential.py +++ b/pymc3/tests/test_quadpotential.py @@ -3,6 +3,7 @@ import scipy.sparse import theano.tensor as tt import theano +import numpy.testing as npt from pymc3.step_methods.hmc import quadpotential @@ -64,23 +65,35 @@ def test_equal_diag(): x_ = np.random.randn(5) x = tt.vector() x.tag.test_value = x_ + + samples = np.random.randn(100000, 5) * np.sqrt(diag) + pots = [ - quadpotential.quad_potential(diag, False, False), - quadpotential.quad_potential(1. / diag, True, False), - quadpotential.quad_potential(np.diag(diag), False, False), - quadpotential.quad_potential(np.diag(1. / diag), True, False), + quadpotential.quad_potential(diag, True, False), + quadpotential.quad_potential(1. / diag, False, False), + quadpotential.quad_potential(np.diag(diag), True, False), + quadpotential.quad_potential(np.diag(1. / diag), False, False), ] if quadpotential.chol_available: - diag_ = scipy.sparse.csc_matrix(np.diag(1. / diag)) + diag_ = scipy.sparse.csc_matrix(np.diag(diag)) pots.append(quadpotential.quad_potential(diag_, True, False)) - v = np.diag(1. / diag).dot(x_) - e = x_.dot(np.diag(1. / diag).dot(x_)) / 2 - for pot in pots: + pots.append(quadpotential.QuadPotentialSample(samples, 0)) + + v = np.diag(diag).dot(x_) + e = x_.dot(np.diag(diag).dot(x_)) / 2 + for pot in pots[:-1]: + v_function = theano.function([x], pot.velocity(x)) + e_function = theano.function([x], pot.energy(x)) + npt.assert_allclose(v_function(x_), v) + npt.assert_allclose(e_function(x_), e) + + # QuadPotentialSample is less accurate + for pot in pots[-1:]: v_function = theano.function([x], pot.velocity(x)) e_function = theano.function([x], pot.energy(x)) - assert np.allclose(v_function(x_), v) - assert np.allclose(e_function(x_), e) + npt.assert_allclose(v_function(x_), v, rtol=0.2) + npt.assert_allclose(e_function(x_), e, rtol=0.2) def test_equal_dense(): @@ -94,35 +107,51 @@ def test_equal_dense(): x_ = np.random.randn(5) x = tt.vector() x.tag.test_value = x_ + + samples = np.random.multivariate_normal(np.zeros_like(x_), cov, 1000) + pots = [ - quadpotential.quad_potential(cov, False, False), - quadpotential.quad_potential(inv, True, False), + quadpotential.quad_potential(cov, True, False), + quadpotential.quad_potential(inv, False, False), ] if quadpotential.chol_available: - pots.append(quadpotential.quad_potential(cov, False, False)) + pots.append(quadpotential.quad_potential(cov, True, False)) + + pots.append(quadpotential.QuadPotentialSample(samples, 0)) - v = np.linalg.solve(cov, x_) + v = np.dot(cov, x_) e = 0.5 * x_.dot(v) - for pot in pots: + for pot in pots[:-1]: v_function = theano.function([x], pot.velocity(x)) e_function = theano.function([x], pot.energy(x)) - assert np.allclose(v_function(x_), v) - assert np.allclose(e_function(x_), e) + npt.assert_allclose(v_function(x_), v) + npt.assert_allclose(e_function(x_), e) + + for pot in pots[:-1]: + v_function = theano.function([x], pot.velocity(x)) + e_function = theano.function([x], pot.energy(x)) + npt.assert_allclose(v_function(x_), v, rtol=0.5) + npt.assert_allclose(e_function(x_), e, rtol=0.5) def test_random_diag(): d = np.arange(10) + 1 np.random.seed(42) + + samples = np.random.randn(1000, 10) * np.sqrt(d) + pots = [ quadpotential.quad_potential(d, True, False), quadpotential.quad_potential(1./d, False, False), quadpotential.quad_potential(np.diag(d), True, False), quadpotential.quad_potential(np.diag(1./d), False, False), + quadpotential.QuadPotentialSample(samples, 0), ] if quadpotential.chol_available: d_ = scipy.sparse.csc_matrix(np.diag(d)) pot = quadpotential.quad_potential(d, True, False) pots.append(pot) + for pot in pots: vals = np.array([pot.random() for _ in range(1000)]) assert np.allclose(vals.std(0), np.sqrt(1./d), atol=0.1) @@ -137,13 +166,17 @@ def test_random_dense(): inv = np.linalg.inv(cov) assert np.allclose(inv.dot(cov), np.eye(5)) + samples = np.random.multivariate_normal(np.zeros(5), cov, 1000) + pots = [ quadpotential.QuadPotential(cov), quadpotential.QuadPotential_Inv(inv), + quadpotential.QuadPotentialSample(samples, 0), ] if quadpotential.chol_available: pot = quadpotential.QuadPotential_Sparse(scipy.sparse.csc_matrix(cov)) pots.append(pot) + for pot in pots: cov_ = np.cov(np.array([pot.random() for _ in range(1000)]).T) assert np.allclose(cov_, inv, atol=0.1)