Implement whitened parametrisation #71

Merged: 33 commits, Nov 16, 2021

Changes from 21 commits

Commits (33)
4d4c182 Implement whitened parametrisation (willtebbutt, Nov 5, 2021)
ec8d49d Bump patch (willtebbutt, Nov 5, 2021)
b0e69d2 Improve docs (willtebbutt, Nov 5, 2021)
a11f40e Apply suggestions from code review (willtebbutt, Nov 5, 2021)
145d7f9 Update docs (willtebbutt, Nov 5, 2021)
be44580 SVGP -> SparseVariationalApproximation (willtebbutt, Nov 5, 2021)
523e8ec Merge branch 'wct/whitened-inference' of https://github.com/JuliaGaus… (willtebbutt, Nov 5, 2021)
065041e Fix docstring typo (willtebbutt, Nov 5, 2021)
093776e Update docs (willtebbutt, Nov 8, 2021)
f6adf96 Refactor to use type parameter (willtebbutt, Nov 8, 2021)
8cef7ba Test kl_term (willtebbutt, Nov 8, 2021)
34c5e7c Merge in changes (willtebbutt, Nov 8, 2021)
f1abc2e Run all tests (willtebbutt, Nov 8, 2021)
c688025 Clarify tests (willtebbutt, Nov 8, 2021)
cbdae25 Apply suggestions from code review (willtebbutt, Nov 8, 2021)
7bfa962 Stabilise numerics in sparse_variational tests (willtebbutt, Nov 8, 2021)
e70a176 Merge in master (willtebbutt, Nov 8, 2021)
3965248 Stabilise tests (willtebbutt, Nov 8, 2021)
46fb9ec Improve docs (willtebbutt, Nov 8, 2021)
b2a25fb Add Gorinova reference (willtebbutt, Nov 8, 2021)
9deb019 Add whitening transformation ref (willtebbutt, Nov 8, 2021)
bec6554 Merge in master (willtebbutt, Nov 12, 2021)
f0d0ea5 Fix tests (willtebbutt, Nov 12, 2021)
ab1fc8e Use American English :( (willtebbutt, Nov 12, 2021)
80c3c13 Update test/sparse_variational.jl (willtebbutt, Nov 12, 2021)
9c7d440 Typos (willtebbutt, Nov 12, 2021)
9268786 Apply Theo's suggestions (willtebbutt, Nov 13, 2021)
1c3ea5b Update src/sparse_variational.jl (willtebbutt, Nov 14, 2021)
65e382d Fix for docs (theogf, Nov 15, 2021)
4ae4534 Fix rest of the docs (theogf, Nov 15, 2021)
13bcc08 Apply Ti's formatting suggestions (willtebbutt, Nov 16, 2021)
c1babe5 Add Paciorek reference (willtebbutt, Nov 16, 2021)
237a56e Bump patch (willtebbutt, Nov 16, 2021)
2 changes: 1 addition & 1 deletion Project.toml
@@ -1,7 +1,7 @@
name = "ApproximateGPs"
uuid = "298c2ebc-0411-48ad-af38-99e88101b606"
authors = ["JuliaGaussianProcesses Team"]
version = "0.2.0"
version = "0.2.1"

[deps]
AbstractGPs = "99985d1d-32ba-4be9-9821-2ec096f28918"
32 changes: 28 additions & 4 deletions docs/src/userguide.md
@@ -28,22 +28,46 @@ To construct a sparse approximation to the exact posterior, we first need to sel
M = 15 # The number of inducing points
z = x[1:M]
```
The inducing inputs `z` imply some latent function values `u = f(z)`, sometimes called pseudo-points. The stochastic variational Gaussian process (SVGP) approximation is defined by a variational distribution `q(u)` over the pseudo-points. In the case of GP regression, the optimal form for `q(u)` is a multivariate Gaussian, which is the only form of `q` currently supported by this package.
The inducing inputs `z` imply some latent function values `u = f(z)`, sometimes called pseudo-points. The `SparseVariationalApproximation` specifies a distribution `q(u)` over the pseudo-points. In the case of GP regression, the optimal form for `q(u)` is a multivariate Gaussian, which is the only form of `q` currently supported by this package.
```julia
using Distributions, LinearAlgebra
q = MvNormal(zeros(length(z)), I)
```
Finally, we pass our `q` along with the inputs `f(z)` to obtain an approximate posterior GP:
```julia
fz = f(z, 1e-6) # 'observe' the process at z with some jitter for numerical stability
approx = SVGP(fz, q) # Instantiate everything needed for the svgp approximation
approx = SparseVariationalApproximation(fz, q) # Instantiate everything needed for the approximation

svgp_posterior = posterior(approx) # Create the approximate posterior
sva_posterior = posterior(approx) # Create the approximate posterior
```

## The Evidence Lower Bound (ELBO)
The approximate posterior constructed above will be a very poor approximation, since `q` was simply chosen to have zero mean and covariance `I`. A measure of the quality of the approximation is given by the ELBO. Optimising this term with respect to the parameters of `q` and the inducing input locations `z` will improve the approximation.
```julia
elbo(SVGP(fz, q), fx, y)
elbo(SparseVariationalApproximation(fz, q), fx, y)
```
A detailed example of how to carry out such optimisation is given in [Regression: Sparse Variational Gaussian Process for Stochastic Optimisation with Flux.jl](@ref). For an example of non-conjugate inference, see [Classification: Sparse Variational Approximation for Non-Conjugate Likelihoods with Optim's L-BFGS](@ref).
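
For a flavour of what this involves, the following sketch tunes just the mean of `q` by plain gradient descent, reusing `f`, `z`, `fx`, and `y` from above. It assumes Zygote.jl for the gradient, and `optimise_mean` is an illustrative helper, not part of the package API; the linked examples show the full approach.
```julia
using Distributions, LinearAlgebra, Zygote

function optimise_mean(m, fz, fx, y; iters=100, stepsize=1e-2)
    for _ in 1:iters
        # Gradient of the negative ELBO with respect to the variational mean.
        g = only(Zygote.gradient(m) do m_
            -elbo(SparseVariationalApproximation(fz, MvNormal(m_, I)), fx, y)
        end)
        m = m - stepsize * g  # plain gradient-descent step
    end
    return m
end

m_opt = optimise_mean(zeros(length(z)), f(z, 1e-6), fx, y)
```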

# Available Parametrisations

Two parametrisations of `q(u)` are presently available: centred and non-centred.
The centred parametrisation expresses `q(u)` directly in terms of its mean and covariance.
The non-centred parametrisation instead parametrises the mean and covariance of
`ε := cholesky(cov(u)).U' \ (u - mean(u))`.
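To make this concrete, here is a minimal sketch of the transformation (reusing `fz` from above; purely illustrative):
```julia
using LinearAlgebra

C = cholesky(Symmetric(cov(fz)))  # Cholesky factor of the prior covariance at z
u = rand(fz)                      # a draw from the prior at the inducing inputs
ε = C.U' \ (u - mean(fz))         # whitened coordinates: ε ~ N(0, I) under the prior
```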

The choice of parametrisation can have a substantial impact on the time it takes for ELBO
optimisation to converge, and which parametrisation is better in a particular situation is
not generally obvious.
That being said, the non-centred parametrisation often performs better, so it is the default,
and it is what is used in all of the examples above.

If you require a particular parametrisation, simply use the 3-argument version of the
approximation constructor:
```julia
SparseVariationalApproximation(Centred(), fz, q)
SparseVariationalApproximation(NonCentred(), fz, q)
```
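
The two-argument constructor used earlier in this guide is shorthand for the non-centred form, so the following two objects are equivalent:
```julia
approx_noncentred = SparseVariationalApproximation(NonCentred(), fz, q)
approx_default = SparseVariationalApproximation(fz, q)  # same thing
```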

For a discussion of these two parametrisations, see e.g. [^Gorinova].

[^Gorinova]: Gorinova, Maria, Dave Moore, and Matthew Hoffman. [Automatic Reparameterisation of Probabilistic Programs](http://proceedings.mlr.press/v119/gorinova20a). ICML 2020.
15 changes: 12 additions & 3 deletions src/ApproximateGPs.jl
@@ -13,9 +13,18 @@ using ChainRulesCore
using FillArrays
using KLDivergences

using AbstractGPs: AbstractGP, FiniteGP, LatentFiniteGP, ApproxPosteriorGP, At_A, diag_At_A

export SparseVariationalApproximation
using AbstractGPs:
AbstractGP,
FiniteGP,
LatentFiniteGP,
ApproxPosteriorGP,
At_A,
diag_At_A,
Xt_A_X,
Xt_A_Y,
diag_Xt_A_X

export SparseVariationalApproximation, Centred, NonCentred
export DefaultQuadrature, Analytic, GaussHermite, MonteCarlo

include("utils.jl")
27 changes: 22 additions & 5 deletions src/elbo.jl
@@ -1,5 +1,11 @@
"""
elbo(svgp::SparseVariationalApproximation, fx::FiniteGP, y::AbstractVector{<:Real}; num_data=length(y), quadrature=DefaultQuadrature())
elbo(
sva::SparseVariationalApproximation,
fx::FiniteGP,
y::AbstractVector{<:Real};
num_data=length(y),
quadrature=DefaultQuadrature(),
)

Compute the Evidence Lower BOund from [1] for the process `f = fx.f ==
sva.fz.f` where `y` are observations of `fx`, pseudo-inputs are given by `z =
@@ -39,7 +45,13 @@ function AbstractGPs.elbo(
end

"""
elbo(svgp, ::SparseVariationalApproximation, lfx::LatentFiniteGP, y::AbstractVector; num_data=length(y), quadrature=DefaultQuadrature())
elbo(
sva::SparseVariationalApproximation,
lfx::LatentFiniteGP,
y::AbstractVector;
num_data=length(y),
quadrature=DefaultQuadrature(),
)

Compute the ELBO for a LatentGP with a possibly non-conjugate likelihood.
"""
@@ -68,9 +80,14 @@ function _elbo(
q_f = marginals(post(fx.x))
variational_exp = expected_loglik(quadrature, y, q_f, lik)

kl_term = KL(sva.q, sva.fz)

n_batch = length(y)
scale = num_data / n_batch
return sum(variational_exp) * scale - kl_term
return sum(variational_exp) * scale - kl_term(sva, post)
end

kl_term(sva::SparseVariationalApproximation{Centred}, post) = KL(sva.q, sva.fz)

function kl_term(sva::SparseVariationalApproximation{NonCentred}, post)
m_ε = mean(sva.q)
return (tr(cov(sva.q)) + m_ε'm_ε - length(m_ε) - logdet(post.data.C_ε)) / 2
end
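
The `NonCentred` method above is the closed-form KL divergence from `q(ε)` to the standard normal, `KL(q(ε) || N(0, I)) = (tr(Σ) + m'm - k - logdet(Σ)) / 2`. A quick standalone sanity check of that identity (a sketch assuming KLDivergences.jl's analytic `KL` for multivariate normals, the same function this PR uses in the `Centred` case):
```julia
using Distributions, KLDivergences, LinearAlgebra

k = 4
m_ε = randn(k)
A = randn(k, k)
Σ_ε = Matrix(Symmetric(A * A' + I))  # a random positive-definite covariance
q = MvNormal(m_ε, Σ_ε)

# Closed form, as in kl_term above (logdet of the Cholesky equals logdet(Σ_ε)).
kl_closed_form = (tr(Σ_ε) + m_ε' * m_ε - k - logdet(cholesky(Symmetric(Σ_ε)))) / 2

kl_closed_form ≈ KL(q, MvNormal(zeros(k), I))  # true, up to floating-point error
```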
174 changes: 160 additions & 14 deletions src/sparse_variational.jl
@@ -1,16 +1,72 @@
raw"""
Centred()

Used in conjunction with `SparseVariationalApproximation`.
States that the `q` field of [`SparseVariationalApproximation`](@ref) is to be interpreted
directly as the approximate posterior over the pseudo-points.

This is also known as the "unwhitened" parametrisation [1].

See also [`NonCentred`](@ref).

[1] - https://en.wikipedia.org/wiki/Whitening_transformation
"""
SparseVariationalApproximation(fz::FiniteGP, q::AbstractMvNormal)
struct Centred end

raw"""
NonCentred()

Used in conjunction with `SparseVariationalApproximation`.
States that the `q` field of [`SparseVariationalApproximation`](@ref) is to be interpreted
as the approximate posterior over `cholesky(cov(u)).L \ (u - mean(u))`, where `u` are the
pseudo-points.

This is also known as the "whitened" parametrisation [1].

Packages the prior over the pseudo-points, `fz`, and the approximate posterior at the
pseudo-points, `q`, together into a single object.
See also [`Centred`](@ref).

[1] - https://en.wikipedia.org/wiki/Whitening_transformation
"""
struct SparseVariationalApproximation{Tfz<:FiniteGP,Tq<:AbstractMvNormal}
struct NonCentred end

struct SparseVariationalApproximation{Parametrisation,Tfz<:FiniteGP,Tq<:AbstractMvNormal}
fz::Tfz
q::Tq
end

raw"""
posterior(sva::SparseVariationalApproximation)
SparseVariationalApproximation(::Parametrisation, fz::FiniteGP, q::AbstractMvNormal)

Produce a `SparseVariationalApproximation{Parametrisation}`, which packages the prior over
the pseudo-points, `fz`, and the approximate posterior at the pseudo-points, `q`, together
into a single object.

The `Parametrisation` determines the precise manner in which `q` and `fz` are interpreted.
Existing parametrisations include [`Centred`](@ref) and [`NonCentred`](@ref).
"""
function SparseVariationalApproximation(
::Parametrisation, fz::Tfz, q::Tq
) where {Parametrisation,Tfz<:FiniteGP,Tq<:AbstractMvNormal}
return SparseVariationalApproximation{Parametrisation,Tfz,Tq}(fz, q)
end

"""
SparseVariationalApproximation(fz::FiniteGP, q::AbstractMvNormal)

Packages the prior over the pseudo-points `fz`, and the approximate posterior at the
pseudo-points, which is `mean(fz) + cholesky(cov(fz)).U' * ε`, `ε ∼ q`.

Shorthand for
```julia
SparseVariationalApproximation(NonCentred(), fz, q)
```
"""
function SparseVariationalApproximation(fz::FiniteGP, q::AbstractMvNormal)
return SparseVariationalApproximation(NonCentred(), fz, q)
end

raw"""
posterior(sva::SparseVariationalApproximation{Centred})

Compute the approximate posterior [1] over the process `f =
sva.fz.f`, given inducing inputs `z = sva.fz.x` and a variational
@@ -27,7 +83,7 @@ which can be found in closed form.
variational Gaussian process classification." Artificial Intelligence and
Statistics. PMLR, 2015.
"""
function AbstractGPs.posterior(sva::SparseVariationalApproximation)
function AbstractGPs.posterior(sva::SparseVariationalApproximation{Centred})
q, fz = sva.q, sva.fz
m, S = mean(q), _chol_cov(q)
Kuu = _chol_cov(fz)
@@ -38,41 +94,41 @@ function AbstractGPs.posterior(sva::SparseVariationalApproximation)
end

function AbstractGPs.posterior(
sva::SparseVariationalApproximation, fx::FiniteGP, ::AbstractVector
sva::SparseVariationalApproximation, fx::FiniteGP, ::AbstractVector{<:Real}
)
@assert sva.fz.f === fx.f
return posterior(sva)
end

#
# Code below this point just implements the Internal AbstractGPs API.
# Various methods implementing the Internal AbstractGPs API.
# See AbstractGPs.jl API docs for more info.
#

function Statistics.mean(
f::ApproxPosteriorGP{<:SparseVariationalApproximation}, x::AbstractVector
f::ApproxPosteriorGP{<:SparseVariationalApproximation{Centred}}, x::AbstractVector
)
return mean(f.prior, x) + cov(f.prior, x, inducing_points(f)) * f.data.α
end

function Statistics.cov(
f::ApproxPosteriorGP{<:SparseVariationalApproximation}, x::AbstractVector
f::ApproxPosteriorGP{<:SparseVariationalApproximation{Centred}}, x::AbstractVector
)
Cux = cov(f.prior, inducing_points(f), x)
D = f.data.Kuu.L \ Cux
return cov(f.prior, x) - At_A(D) + At_A(f.data.B' * D)
end

function Statistics.var(
f::ApproxPosteriorGP{<:SparseVariationalApproximation}, x::AbstractVector
f::ApproxPosteriorGP{<:SparseVariationalApproximation{Centred}}, x::AbstractVector
)
Cux = cov(f.prior, inducing_points(f), x)
D = f.data.Kuu.L \ Cux
return var(f.prior, x) - diag_At_A(D) + diag_At_A(f.data.B' * D)
end

function Statistics.cov(
f::ApproxPosteriorGP{<:SparseVariationalApproximation},
f::ApproxPosteriorGP{<:SparseVariationalApproximation{Centred}},
x::AbstractVector,
y::AbstractVector,
)
@@ -85,7 +141,7 @@ function Statistics.cov(
end

function StatsBase.mean_and_cov(
f::ApproxPosteriorGP{<:SparseVariationalApproximation}, x::AbstractVector
f::ApproxPosteriorGP{<:SparseVariationalApproximation{Centred}}, x::AbstractVector
)
Cux = cov(f.prior, inducing_points(f), x)
D = f.data.Kuu.L \ Cux
@@ -95,7 +151,7 @@ function StatsBase.mean_and_var(
end

function StatsBase.mean_and_var(
f::ApproxPosteriorGP{<:SparseVariationalApproximation}, x::AbstractVector
f::ApproxPosteriorGP{<:SparseVariationalApproximation{Centred}}, x::AbstractVector
)
Cux = cov(f.prior, inducing_points(f), x)
D = f.data.Kuu.L \ Cux
@@ -104,6 +160,96 @@ function StatsBase.mean_and_var(
return μ, Σ_diag
end

#
# NonCentred parametrisation.
#

raw"""
posterior(sva::SparseVariationalApproximation{NonCentred})

Compute the approximate posterior [1] over the process `f =
sva.fz.f`, given inducing inputs `z = sva.fz.x` and a variational
distribution over inducing points `sva.q` (which represents ``q(ε)``
where `ε = cholesky(cov(fz)).U' \ (f(z) - mean(f(z)))`). The approximate posterior at test
points ``x^*`` where ``f^* = f(x^*)`` is then given by:

```math
q(f^*) = \int p(f^* | ε) q(ε) dε
```
which can be found in closed form.

[1] - Hensman, James, Alexander Matthews, and Zoubin Ghahramani. "Scalable
variational Gaussian process classification." Artificial Intelligence and
Statistics. PMLR, 2015.
"""
function AbstractGPs.posterior(approx::SparseVariationalApproximation{NonCentred})
fz = approx.fz
data = (Cuu=_chol_cov(fz), C_ε=_chol_cov(approx.q))
return ApproxPosteriorGP(approx, fz.f, data)
end
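
Substituting `u = mean(u) + Cuu.U' * ε` into the standard SVGP predictive equations gives the moments the methods below compute. Writing `A = Cuu.U' \ K(z, x*)` (the `_A` helper defined below), a sketch of the result is:
```math
q(f^*) = \mathcal{N}\left(m(x^*) + A^\top m_\varepsilon,\; K(x^*, x^*) - A^\top A + A^\top \Sigma_\varepsilon A\right)
```
where `m_ε = mean(sva.q)` and `Σ_ε = cov(sva.q)`.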

#
# Various methods implementing the Internal AbstractGPs API.
# See AbstractGPs.jl API docs for more info.
#

# Produces a matrix that is consistently referred to as A in this file. A more descriptive
# name is, unfortunately, not obvious. It's just an intermediate quantity that happens to
# get used a lot.
_A(f, x) = f.data.Cuu.U' \ cov(f.prior, inducing_points(f), x)

function Statistics.mean(
f::ApproxPosteriorGP{<:SparseVariationalApproximation{NonCentred}}, x::AbstractVector
)
return mean(f.prior, x) + _A(f, x)' * mean(f.approx.q)
end

function Statistics.cov(
f::ApproxPosteriorGP{<:SparseVariationalApproximation{NonCentred}}, x::AbstractVector
)
A = _A(f, x)
return cov(f.prior, x) - At_A(A) + Xt_A_X(f.data.C_ε, A)
end

function Statistics.var(
f::ApproxPosteriorGP{<:SparseVariationalApproximation{NonCentred}}, x::AbstractVector
)
A = _A(f, x)
return var(f.prior, x) - diag_At_A(A) + diag_Xt_A_X(f.data.C_ε, A)
end

function Statistics.cov(
f::ApproxPosteriorGP{<:SparseVariationalApproximation{NonCentred}},
x::AbstractVector,
y::AbstractVector,
)
Ax = _A(f, x)
Ay = _A(f, y)
return cov(f.prior, x, y) - Ax'Ay + Xt_A_Y(Ax, f.data.C_ε, Ay)
end

function StatsBase.mean_and_cov(
f::ApproxPosteriorGP{<:SparseVariationalApproximation{NonCentred}}, x::AbstractVector
)
A = _A(f, x)
μ = mean(f.prior, x) + A' * mean(f.approx.q)
Σ = cov(f.prior, x) - At_A(A) + Xt_A_X(f.data.C_ε, A)
return μ, Σ
end

function StatsBase.mean_and_var(
f::ApproxPosteriorGP{<:SparseVariationalApproximation{NonCentred}}, x::AbstractVector
)
A = _A(f, x)
μ = mean(f.prior, x) + A' * mean(f.approx.q)
Σ = var(f.prior, x) - diag_At_A(A) + diag_Xt_A_X(f.data.C_ε, A)
return μ, Σ
end

#
# Misc utility.
#

inducing_points(f::ApproxPosteriorGP{<:SparseVariationalApproximation}) = f.approx.fz.x

_chol_cov(q::AbstractMvNormal) = cholesky(Symmetric(cov(q)))
4 changes: 2 additions & 2 deletions test/elbo.jl
@@ -8,9 +8,9 @@
f = GP(kernel)
fx = f(x, 0.1)
fz = f(z)
q_ex = exact_variational_posterior(fz, fx, y)
q_ex = optimal_variational_posterior(fz, fx, y)

sva = SparseVariationalApproximation(fz, q_ex)
sva = SparseVariationalApproximation(Centred(), fz, q_ex)
@test elbo(sva, fx, y) isa Real
@test elbo(sva, fx, y) ≤ logpdf(fx, y)
