diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml
index 6c23a8b9..0b3ca692 100644
--- a/.github/workflows/CI.yml
+++ b/.github/workflows/CI.yml
@@ -10,6 +10,7 @@ jobs:
       fail-fast: false
       matrix:
         version:
+          - '1.6'
           - '1'
         os:
           - ubuntu-latest
diff --git a/Project.toml b/Project.toml
index a51fff30..bd071485 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,10 +1,13 @@
 name = "Pathfinder"
 uuid = "b1d3bc72-d0e7-4279-b92f-7fa5d6d2d454"
 authors = ["Seth Axen and contributors"]
-version = "0.2.3"
+version = "0.3.0-DEV"
 
 [deps]
+AbstractDifferentiation = "c29ec348-61ec-40c8-8164-b8c60e9d9f3d"
 Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
+ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
+GalacticOptim = "a75be94c-b780-496d-a8a9-0878b188d577"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 Optim = "429524aa-4258-5aef-a3af-852621145aeb"
 PDMats = "90014a1f-27ba-587c-ab20-58faa44d9150"
@@ -15,17 +18,21 @@ StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
 StatsFuns = "4c63d2b9-4356-54db-8cca-17b64c39e42c"
 
 [compat]
+AbstractDifferentiation = "0.4"
 Distributions = "0.25"
+ForwardDiff = "0.10"
+GalacticOptim = "2"
 Optim = "1.4"
 PDMats = "0.11"
 PSIS = "0.2"
 StatsBase = "0.33"
 StatsFuns = "0.9"
-julia = "1"
+julia = "1.6"
 
 [extras]
-ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
+NLopt = "76087f3c-5699-56af-9a33-bf431cd00edd"
+ReverseDiff = "37e2e3b7-166d-5795-8a7a-e32c996b4267"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 
 [targets]
-test = ["ForwardDiff", "Test"]
+test = ["NLopt", "ReverseDiff", "Test"]
diff --git a/src/Pathfinder.jl b/src/Pathfinder.jl
index 5b6f70f5..3509ae50 100644
--- a/src/Pathfinder.jl
+++ b/src/Pathfinder.jl
@@ -1,7 +1,11 @@
 module Pathfinder
 
+using AbstractDifferentiation: AD
 using Distributions: Distributions
+# ensure that ForwardDiff is conditionally loaded by GalacticOptim
+using ForwardDiff: ForwardDiff
 using LinearAlgebra
+using GalacticOptim: GalacticOptim
 using Optim: Optim, LineSearches
 using PDMats: PDMats
 using PSIS: PSIS
@@ -21,7 +25,7 @@ const DEFAULT_OPTIMIZER = Optim.LBFGS(;
 )
 
 include("woodbury.jl")
-include("maximize.jl")
+include("optimize.jl")
 include("inverse_hessian.jl")
 include("mvnormal.jl")
 include("elbo.jl")
diff --git a/src/maximize.jl b/src/maximize.jl
deleted file mode 100644
index 43ba75bc..00000000
--- a/src/maximize.jl
+++ /dev/null
@@ -1,21 +0,0 @@
-function maximize_with_trace(f, ∇f, x₀, optimizer; kwargs...)
-    negf(x) = -f(x)
-    g!(y, x) = (y .= .-∇f(x))
-
-    function callback(states)
-        # terminate if optimization encounters NaNs
-        s = states[end]
-        md = s.metadata
-        return isnan(s.value) || any(isnan, md["x"]) || any(isnan, md["g(x)"])
-    end
-    options = Optim.Options(;
-        store_trace=true, extended_trace=true, callback=callback, kwargs...
-    )
-    res = Optim.optimize(negf, g!, x₀, optimizer, options)
-
-    xs = Optim.x_trace(res)::Vector{typeof(Optim.minimizer(res))}
-    fxs = -Optim.f_trace(res)
-    ∇fxs = map(tr -> -tr.metadata["g(x)"], Optim.trace(res))::typeof(xs)
-
-    return xs, fxs, ∇fxs
-end
diff --git a/src/multipath.jl b/src/multipath.jl
index 3615ffeb..37c8d019 100644
--- a/src/multipath.jl
+++ b/src/multipath.jl
@@ -1,7 +1,7 @@
 """
     multipathfinder(
         logp,
-        ∇logp,
+        [∇logp,]
        θ₀s::AbstractVector{AbstractVector{<:Real}},
         ndraws::Int;
         kwargs...
@@ -27,14 +27,17 @@ resulting draws better approximate draws from the target distribution ``p`` inst
 
 # Arguments
 - `logp`: a callable that computes the log-density of the target distribution.
-- `∇logp`: a callable that computes the gradient of `logp`.
-- `θ₀s`: vector of length `nruns` of initial points of length `dim` from which each
-    single-path Pathfinder run will begin
-- `ndraws`: number of approximate draws to return
+- `∇logp`: a callable that computes the gradient of `logp`. If not provided, `logp` is
+    automatically differentiated using the backend specified in `ad_backend`.
+- `θ₀s::AbstractVector{AbstractVector{<:Real}}`: vector of length `nruns` of initial points
+    of length `dim` from which each single-path Pathfinder run will begin
+- `ndraws::Int`: number of approximate draws to return
 
 # Keywords
+- `ad_backend=AD.ForwardDiffBackend()`: AbstractDifferentiation.jl AD backend.
 - `ndraws_per_run::Int=5`: The number of draws to take for each component before resampling.
 - `importance::Bool=true`: Perform Pareto smoothed importance resampling of draws.
+- `kwargs...` : Remaining keywords are forwarded to [`pathfinder`](@ref).
 
 # Returns
 - `q::Distributions.MixtureModel`: Uniformly weighted mixture of ELBO-maximizing
@@ -42,11 +45,38 @@ resulting draws better approximate draws from the target distribution ``p`` inst
 - `ϕ::AbstractMatrix{<:Real}`: approximate draws from target distribution with size
     `(dim, ndraws)`
 - `component_inds::Vector{Int}`: Indices ``k`` of components in ``q`` from which each column
-in `ϕ` was drawn.
+    in `ϕ` was drawn.
 """
+function multipathfinder(logp, θ₀s, ndraws; ad_backend=AD.ForwardDiffBackend(), kwargs...)
+    optim_fun = build_optim_function(logp; ad_backend)
+    return multipathfinder(optim_fun, θ₀s, ndraws; kwargs...)
+end
 function multipathfinder(
-    logp,
-    ∇logp,
+    logp, ∇logp, θ₀s, ndraws; ad_backend=AD.ForwardDiffBackend(), kwargs...
+)
+    optim_fun = build_optim_function(logp, ∇logp; ad_backend)
+    return multipathfinder(optim_fun, θ₀s, ndraws; kwargs...)
+end
+
+"""
+    multipathfinder(
+        f::GalacticOptim.OptimizationFunction,
+        θ₀s::AbstractVector{AbstractVector{<:Real}},
+        ndraws::Int;
+        kwargs...,
+    )
+
+Filter samples from a mixture of multivariate normal distributions fit using `pathfinder`.
+
+`f` is a user-created optimization function that represents the negative log density with
+its gradient and must have the necessary features (e.g. a Hessian function or specified
+automatic differentiation type) for the chosen optimization algorithm. For details, see
+[GalacticOptim.jl: OptimizationFunction](https://galacticoptim.sciml.ai/stable/API/optimization_function/).
+
+See [`multipathfinder`](@ref) for a description of remaining arguments.
+"""
+function multipathfinder(
+    optim_fun::GalacticOptim.OptimizationFunction,
     θ₀s,
     ndraws;
     ndraws_per_run::Int=5,
@@ -54,14 +84,18 @@ function multipathfinder(
     importance::Bool=true,
     kwargs...,
 )
+    if optim_fun.grad === nothing || optim_fun.grad isa Bool
+        throw(ArgumentError("optimization function must define a gradient function."))
+    end
     if ndraws > ndraws_per_run * length(θ₀s)
         @warn "More draws requested than total number of draws across replicas. Draws will not be unique."
     end
 
+    logp(x) = -optim_fun.f(x, nothing)
     # run pathfinder independently from each starting point
     # TODO: allow to be parallelized
     res = map(θ₀s) do θ₀
-        return pathfinder(logp, ∇logp, θ₀, ndraws_per_run; rng=rng, kwargs...)
+        return pathfinder(optim_fun, θ₀, ndraws_per_run; rng, kwargs...)
     end
     qs = reduce(vcat, first.(res))
     ϕs = reduce(hcat, getindex.(res, 2))
diff --git a/src/optimize.jl b/src/optimize.jl
new file mode 100644
index 00000000..10bedd71
--- /dev/null
+++ b/src/optimize.jl
@@ -0,0 +1,61 @@
+function build_optim_function(f; ad_backend=AD.ForwardDiffBackend())
+    ∇f(x) = only(AD.gradient(ad_backend, f, x))
+    return build_optim_function(f, ∇f; ad_backend)
+end
+function build_optim_function(f, ∇f; ad_backend=AD.ForwardDiffBackend())
+    # because we need explicit access to grad, we generate these ourselves instead of using
+    # GalacticOptim's auto-AD feature.
+    # TODO: switch to caching API if available, see
+    # https://github.com/JuliaDiff/AbstractDifferentiation.jl/issues/41
+    function grad(res, x, p...)
+        ∇fx = ∇f(x)
+        @. res = -∇fx
+        return res
+    end
+    function hess(res, x, p...)
+        H = only(AD.hessian(ad_backend, f, x))
+        @. res = -H
+        return res
+    end
+    function hv(res, x, v, p...)
+        Hv = only(AD.lazy_hessian(ad_backend, f, x) * v)
+        @. res = -Hv
+        return res
+    end
+    return GalacticOptim.OptimizationFunction((x, p...) -> -f(x); grad, hess, hv)
+end
+
+function build_optim_problem(optim_fun, x₀; kwargs...)
+    return GalacticOptim.OptimizationProblem(optim_fun, x₀, nothing; kwargs...)
+end
+
+function optimize_with_trace(prob, optimizer)
+    u0 = prob.u0
+    fun = prob.f
+    grad! = fun.grad
+    function ∇f(x)
+        ∇fx = similar(x)
+        grad!(∇fx, x, nothing)
+        rmul!(∇fx, -1)
+        return ∇fx
+    end
+    # caches for the trace of x, f(x), and ∇f(x)
+    xs = typeof(u0)[]
+    fxs = typeof(fun.f(u0, nothing))[]
+    ∇fxs = typeof(similar(u0))[]
+    function callback(x, nfx, args...)
+        # NOTE: GalacticOptim doesn't have an interface for accessing the gradient trace,
+        # so we need to recompute it ourselves
+        # see https://github.com/SciML/GalacticOptim.jl/issues/149
+        ∇fx = ∇f(x)
+        # terminate if optimization encounters NaNs
+        (isnan(nfx) || any(isnan, x) || any(isnan, ∇fx)) && return true
+        # some backends mutate x, so we must copy it
+        push!(xs, copy(x))
+        push!(fxs, -nfx)
+        push!(∇fxs, ∇fx)
+        return false
+    end
+    GalacticOptim.solve(prob, optimizer; cb=callback)
+    return xs, fxs, ∇fxs
+end
diff --git a/src/singlepath.jl b/src/singlepath.jl
index 3ef43fc9..f910ad3b 100644
--- a/src/singlepath.jl
+++ b/src/singlepath.jl
@@ -1,49 +1,112 @@
 """
-    pathfinder(logp, ∇logp, θ₀::AbstractVector{<:Real}, ndraws::Int; kwargs...)
+    pathfinder(logp[, ∇logp], θ₀::AbstractVector{<:Real}, ndraws::Int; kwargs...)
 
-Find the best multivariate normal approximation encountered while optimizing `logp`.
+Find the best multivariate normal approximation encountered while maximizing `logp`.
 
-The multivariate normal approximation returned is the one that maximizes the evidence lower
-bound (ELBO), or equivalently, minimizes the KL divergence between
+From an optimization trajectory, Pathfinder constructs a sequence of (multivariate normal)
+approximations to the distribution specified by `logp`. The approximation that maximizes the
+evidence lower bound (ELBO), or equivalently, minimizes the KL divergence between the
+approximation and the true distribution, is returned.
+
+The covariance of the multivariate normal distribution is an inverse Hessian approximation
+constructed using at most the previous `history_length` steps.
 
 # Arguments
 - `logp`: a callable that computes the log-density of the target distribution.
-- `∇logp`: a callable that computes the gradient of `logp`.
+- `∇logp`: a callable that computes the gradient of `logp`. If not provided, `logp` is
+    automatically differentiated using the backend specified in `ad_backend`.
 - `θ₀`: initial point of length `dim` from which to begin optimization
 - `ndraws`: number of approximate draws to return
 
 # Keywords
+- `ad_backend=AD.ForwardDiffBackend()`: AbstractDifferentiation.jl AD backend.
 - `rng::Random.AbstractRNG`: The random number generator to be used for drawing samples
-- `optimizer::Optim.AbstractOptimizer`: Optimizer to be used for constructing trajectory.
-Defaults to `Optim.LBFGS(; m=$DEFAULT_HISTORY_LENGTH, linesearch=$DEFAULT_LINE_SEARCH)`.
+- `optimizer`: Optimizer to be used for constructing trajectory. Can be any optimizer
+    compatible with GalacticOptim, so long as it supports callbacks. Defaults to
+    `Optim.LBFGS(; m=$DEFAULT_HISTORY_LENGTH, linesearch=LineSearches.MoreThuente())`. See
+    the [GalacticOptim.jl documentation](https://galacticoptim.sciml.ai/stable) for details.
 - `history_length::Int=$DEFAULT_HISTORY_LENGTH`: Size of the history used to approximate the
-inverse Hessian. This should only be set when `optimizer` is not an `Optim.LBFGS`.
+    inverse Hessian. This should only be set when `optimizer` is not an `Optim.LBFGS`.
 - `ndraws_elbo::Int=5`: Number of draws used to estimate the ELBO
-- `kwargs...` : Remaining keywords are forwarded to `Optim.Options`.
+- `kwargs...` : Remaining keywords are forwarded to `GalacticOptim.OptimizationProblem`.
 
 # Returns
 - `q::Distributions.MvNormal`: ELBO-maximizing multivariate normal distribution
 - `ϕ::AbstractMatrix{<:Real}`: draws from multivariate normal with size `(dim, ndraws)`
 - `logqϕ::Vector{<:Real}`: log-density of multivariate normal at columns of `ϕ`
 """
+function pathfinder(logp, θ₀, ndraws; ad_backend=AD.ForwardDiffBackend(), kwargs...)
+    optim_fun = build_optim_function(logp; ad_backend)
+    return pathfinder(optim_fun, θ₀, ndraws; kwargs...)
+end
+function pathfinder(logp, ∇logp, θ₀, ndraws; ad_backend=AD.ForwardDiffBackend(), kwargs...)
+    optim_fun = build_optim_function(logp, ∇logp; ad_backend)
+    return pathfinder(optim_fun, θ₀, ndraws; kwargs...)
+end
+
+"""
+    pathfinder(
+        f::GalacticOptim.OptimizationFunction,
+        θ₀::AbstractVector{<:Real},
+        ndraws::Int;
+        kwargs...,
+    )
+
+Find the best multivariate normal approximation encountered while minimizing `f`.
+
+`f` is a user-created optimization function that represents the negative log density with
+its gradient and must have the necessary features (e.g. a Hessian function or specified
+automatic differentiation type) for the chosen optimization algorithm. For details, see
+[GalacticOptim.jl: OptimizationFunction](https://galacticoptim.sciml.ai/stable/API/optimization_function/).
+
+See [`pathfinder`](@ref) for a description of remaining arguments.
+"""
 function pathfinder(
-    logp,
-    ∇logp,
+    optim_fun::GalacticOptim.OptimizationFunction,
     θ₀,
     ndraws;
     rng::Random.AbstractRNG=Random.default_rng(),
-    optimizer::Optim.AbstractOptimizer=DEFAULT_OPTIMIZER,
+    optimizer=DEFAULT_OPTIMIZER,
     history_length::Int=optimizer isa Optim.LBFGS ? optimizer.m : DEFAULT_HISTORY_LENGTH,
     ndraws_elbo::Int=5,
     kwargs...,
 )
+    optim_prob = build_optim_problem(optim_fun, θ₀; kwargs...)
+    return pathfinder(optim_prob, ndraws; rng, optimizer, history_length, ndraws_elbo)
+end
+
+"""
+    pathfinder(prob::GalacticOptim.OptimizationProblem, ndraws::Int; kwargs...)
+
+Find the best multivariate normal approximation encountered while solving `prob`.
+
+`prob` is a user-created optimization problem that represents the negative log density with
+its gradient and an initial position, and must have the necessary features (e.g. a Hessian
+function or specified automatic differentiation type) for the chosen optimization algorithm.
+For details, see
+[GalacticOptim.jl: Defining OptimizationProblems](https://galacticoptim.sciml.ai/stable/API/optimization_problem/).
+
+See [`pathfinder`](@ref) for a description of remaining arguments.
+"""
+function pathfinder(
+    optim_prob::GalacticOptim.OptimizationProblem,
+    ndraws;
+    rng::Random.AbstractRNG=Random.default_rng(),
+    optimizer=DEFAULT_OPTIMIZER,
+    history_length::Int=optimizer isa Optim.LBFGS ? optimizer.m : DEFAULT_HISTORY_LENGTH,
+    ndraws_elbo::Int=5,
+)
+    if optim_prob.f.grad === nothing || optim_prob.f.grad isa Bool
+        throw(ArgumentError("optimization function must define a gradient function."))
+    end
+    logp(x) = -optim_prob.f.f(x, nothing)
     # compute trajectory
-    θs, logpθs, ∇logpθs = maximize_with_trace(logp, ∇logp, θ₀, optimizer; kwargs...)
+    θs, logpθs, ∇logpθs = optimize_with_trace(optim_prob, optimizer)
     L = length(θs) - 1
     @assert L + 1 == length(logpθs) == length(∇logpθs)
 
     # fit mv-normal distributions to trajectory
-    qs = fit_mvnormals(θs, ∇logpθs; history_length=history_length)
+    qs = fit_mvnormals(θs, ∇logpθs; history_length)
 
     # find ELBO-maximizing distribution
     lopt, elbo, ϕ, logqϕ = maximize_elbo(rng, logp, qs[2:end], ndraws_elbo)
diff --git a/test/inverse_hessian.jl b/test/inverse_hessian.jl
index cc58f641..d61d2cf6 100644
--- a/test/inverse_hessian.jl
+++ b/test/inverse_hessian.jl
@@ -48,18 +48,20 @@ end
     n = 10
     history_length = 5
     logp(x) = logp_banana(x)
-    ∇logp(x) = ForwardDiff.gradient(logp, x)
     nocedal_wright_scaling(α, s, y) = fill!(similar(α), dot(y, s) / sum(abs2, y))
 
     θ₀ = 10 * randn(n)
+    ad_backend = AD.ForwardDiffBackend()
+    fun = Pathfinder.build_optim_function(logp; ad_backend)
+    prob = Pathfinder.build_optim_problem(fun, θ₀)
     optimizer = Optim.LBFGS(;
         m=history_length, linesearch=Optim.LineSearches.MoreThuente()
     )
-    θs, logpθs, ∇logpθs = Pathfinder.maximize_with_trace(logp, ∇logp, θ₀, optimizer)
+    θs, logpθs, ∇logpθs = Pathfinder.optimize_with_trace(prob, optimizer)
 
     # run lbfgs_inverse_hessians with the same initialization as Optim.LBFGS
     Hs = Pathfinder.lbfgs_inverse_hessians(
-        θs, ∇logpθs; history_length=history_length, Hinit=nocedal_wright_scaling
+        θs, ∇logpθs; history_length, Hinit=nocedal_wright_scaling
     )
     ss = diff(θs)
     ps = (Hs .* ∇logpθs)[1:(end - 1)]
diff --git a/test/maximize.jl b/test/maximize.jl
deleted file mode 100644
index 59444516..00000000
--- a/test/maximize.jl
+++ /dev/null
@@ -1,25 +0,0 @@
-using ForwardDiff
-using LinearAlgebra
-using Optim
-using Pathfinder
-using Test
-
-include("test_utils.jl")
-
-@testset "maximize_with_trace" begin
-    n = 10
-    P = inv(rand_pd_mat(Float64, n))
-    μ = randn(n)
-    f(x) = -dot(x - μ, P, x - μ) / 2
-    ∇f(x) = ForwardDiff.gradient(f, x)
-    x0 = randn(n)
-
-    @testset "$Topt" for Topt in (Optim.BFGS, Optim.LBFGS, Optim.ConjugateGradient)
-        optimizer = Topt()
-        xs, fxs, ∇fxs = Pathfinder.maximize_with_trace(f, ∇f, x0, optimizer)
-        @test xs[1] ≈ x0
-        @test xs[end] ≈ μ
-        @test fxs ≈ f.(xs)
-        @test ∇fxs ≈ ∇f.(xs)
-    end
-end
diff --git a/test/multipath.jl b/test/multipath.jl
index cb279ed5..885287af 100644
--- a/test/multipath.jl
+++ b/test/multipath.jl
@@ -1,23 +1,28 @@
-using LinearAlgebra
+using AbstractDifferentiation
 using Distributions
 using ForwardDiff
+using LinearAlgebra
+using GalacticOptim
 using Pathfinder
+using ReverseDiff
 using Test
 
 @testset "multi path pathfinder" begin
     @testset "MvNormal" begin
+        rng = MersenneTwister(42)
         n = 10
         nruns = 20
         ndraws = 1000_000
         ndraws_per_run = ndraws ÷ nruns
-        Σ = rand_pd_mat(Float64, n)
-        μ = randn(n)
+        Σ = rand_pd_mat(rng, Float64, n)
+        μ = randn(rng, n)
         d = MvNormal(μ, Σ)
         logp(x) = logpdf(d, x)
         ∇logp(x) = ForwardDiff.gradient(logp, x)
-        x₀s = [rand(Uniform(-2, 2), n) for _ in 1:nruns]
+        x₀s = [rand(rng, Uniform(-2, 2), n) for _ in 1:nruns]
+        rng = MersenneTwister(76)
         q, ϕ, component_ids = multipathfinder(
-            logp, ∇logp, x₀s, ndraws; ndraws_elbo=100, ndraws_per_run=ndraws_per_run
+            logp, ∇logp, x₀s, ndraws; ndraws_elbo=100, ndraws_per_run, rng
         )
         @test q isa MixtureModel
         @test ncomponents(q) == nruns
@@ -40,5 +45,20 @@ using Test
             atol = sqrt(Σ[i, i] * Σ[j, j] / ndraws) * 10 * multiplier
             @test isapprox(Σ_hat[i, j], Σ[i, j], atol=atol)
         end
+
+        rng = MersenneTwister(76)
+        ad_backend = AD.ReverseDiffBackend()
+        q2, ϕ2, component_ids2 = multipathfinder(
+            logp, x₀s, ndraws; ndraws_elbo=100, ndraws_per_run, rng, ad_backend
+        )
+        for (c1, c2) in zip(q.components, q2.components)
+            @test c1 ≈ c2 atol = 1e-6
+        end
+    end
+    @testset "errors if no gradient provided" begin
+        logp(x) = -sum(abs2, x) / 2
+        x0s = [randn(5) for _ in 1:10]
+        fun = GalacticOptim.OptimizationFunction(logp, GalacticOptim.AutoForwardDiff())
+        @test_throws ArgumentError multipathfinder(fun, x0s, 10)
     end
 end
diff --git a/test/mvnormal.jl b/test/mvnormal.jl
index f94b1fe7..87a3f793 100644
--- a/test/mvnormal.jl
+++ b/test/mvnormal.jl
@@ -1,3 +1,4 @@
+using AbstractDifferentiation
 using Distributions
 using ForwardDiff
 using Optim
@@ -11,12 +12,15 @@ include("test_utils.jl")
 @testset "fit_mvnormals" begin
     n = 10
     logp(x) = logp_banana(x)
-    ∇logp(x) = ForwardDiff.gradient(logp, x)
     θ₀ = 10 * randn(n)
+    ad_backend = AD.ForwardDiffBackend()
+    fun = Pathfinder.build_optim_function(logp; ad_backend)
+    prob = Pathfinder.build_optim_problem(fun, θ₀)
     optimizer = Optim.LBFGS()
-    θs, logpθs, ∇logpθs = Pathfinder.maximize_with_trace(logp, ∇logp, θ₀, optimizer)
-    Σs = Pathfinder.lbfgs_inverse_hessians(θs, ∇logpθs; history_length=optimizer.m)
-    dists = @inferred Pathfinder.fit_mvnormals(θs, ∇logpθs; history_length=optimizer.m)
+    history_length = optimizer.m
+    θs, logpθs, ∇logpθs = Pathfinder.optimize_with_trace(prob, optimizer)
+    Σs = Pathfinder.lbfgs_inverse_hessians(θs, ∇logpθs; history_length)
+    dists = @inferred Pathfinder.fit_mvnormals(θs, ∇logpθs; history_length)
     @test dists isa Vector{<:MvNormal{Float64,<:Pathfinder.WoodburyPDMat}}
     @test Σs ≈ getproperty.(dists, :Σ)
     @test θs .+ Σs .* ∇logpθs ≈ getproperty.(dists, :μ)
diff --git a/test/optimize.jl b/test/optimize.jl
new file mode 100644
index 00000000..f3bac899
--- /dev/null
+++ b/test/optimize.jl
@@ -0,0 +1,84 @@
+using AbstractDifferentiation
+using ForwardDiff
+using GalacticOptim
+using LinearAlgebra
+using NLopt
+using Optim
+using Pathfinder
+using Test
+
+include("test_utils.jl")
+
+@testset "build_optim_function" begin
+    n = 20
+    f(x) = logp_banana(x)
+    ∇f(x) = ForwardDiff.gradient(f, x)
+    ad_backend = AD.ForwardDiffBackend()
+    x = randn(n)
+    funs = [
+        "user-provided gradient" => Pathfinder.build_optim_function(f, ∇f; ad_backend),
+        "automatic gradient" => Pathfinder.build_optim_function(f; ad_backend),
+    ]
+    @testset "$name" for (name, fun) in funs
+        @test fun isa GalacticOptim.OptimizationFunction
+        @test fun.f(x) ≈ -f(x)
+        ∇fx = similar(x)
+        fun.grad(∇fx, x, nothing)
+        @test ∇fx ≈ -∇f(x)
+        H = similar(x, n, n)
+        fun.hess(H, x, nothing)
+        @test H ≈ -ForwardDiff.hessian(f, x)
+        Hv = similar(x)
+        v = randn(n)
+        fun.hv(Hv, x, v, nothing)
+        @test Hv ≈ H * v
+    end
+end
+
+@testset "build_optim_problem" begin
+    n = 20
+    f(x) = logp_banana(x)
+    ∇f(x) = ForwardDiff.gradient(f, x)
+    ad_backend = AD.ForwardDiffBackend()
+    x0 = randn(n)
+    fun = Pathfinder.build_optim_function(f; ad_backend)
+    prob = Pathfinder.build_optim_problem(fun, x0)
+    @test prob isa GalacticOptim.OptimizationProblem
+    @test prob.f === fun
+    @test prob.u0 == x0
+    @test prob.p === nothing
+end
+
+@testset "optimize_with_trace" begin
+    n = 10
+    P = inv(rand_pd_mat(Float64, n))
+    μ = randn(n)
+    f(x) = -dot(x - μ, P, x - μ) / 2
+    ∇f(x) = ForwardDiff.gradient(f, x)
+
+    x0 = randn(n)
+    ad_backend = AD.ForwardDiffBackend()
+    fun = Pathfinder.build_optim_function(f; ad_backend)
+    prob = Pathfinder.build_optim_problem(fun, x0)
+
+    optimizers = [
+        Optim.BFGS(), Optim.LBFGS(), Optim.ConjugateGradient(), NLopt.Opt(:LD_LBFGS, n)
+    ]
+    @testset "$(typeof(optimizer))" for optimizer in optimizers
+        xs, fxs, ∇fxs = Pathfinder.optimize_with_trace(prob, optimizer)
+        @test xs[1] ≈ x0
+        @test xs[end] ≈ μ
+        @test fxs ≈ f.(xs)
+        @test ∇fxs ≈ ∇f.(xs) atol = 1e-4
+
+        if !(optimizer isa NLopt.Opt)
+            options = Optim.Options(; store_trace=true, extended_trace=true)
+            res = Optim.optimize(
+                x -> -f(x), (y, x) -> copyto!(y, -∇f(x)), x0, optimizer, options
+            )
+            @test Optim.iterations(res) == length(xs) - 1
+            @test Optim.x_trace(res) ≈ xs
+            @test Optim.minimizer(res) ≈ xs[end]
+        end
+    end
+end
diff --git a/test/runtests.jl b/test/runtests.jl
index 61efbc33..309d8312 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -3,7 +3,7 @@ using Test
 
 @testset "Pathfinder.jl" begin
     include("woodbury.jl")
-    include("maximize.jl")
+    include("optimize.jl")
     include("inverse_hessian.jl")
     include("mvnormal.jl")
     include("elbo.jl")
diff --git a/test/singlepath.jl b/test/singlepath.jl
index 58fdbd8c..6fda9f90 100644
--- a/test/singlepath.jl
+++ b/test/singlepath.jl
@@ -1,6 +1,10 @@
-using LinearAlgebra
+using AbstractDifferentiation
 using Distributions
+using ForwardDiff
+using GalacticOptim
+using LinearAlgebra
 using Pathfinder
+using ReverseDiff
 using Test
 
 @testset "single path pathfinder" begin
@@ -40,7 +44,17 @@ using Test
         ∇logp(x) = -(P * x)
         x₀ = [2.08, 3.77, -1.26, -0.97, -3.91]
         rng = MersenneTwister(38)
-        q, _, _ = pathfinder(logp, ∇logp, x₀, 10; rng=rng, ndraws_elbo=100)
+        ad_backend = AD.ReverseDiffBackend()
+        q, _, _ = @inferred pathfinder(logp, x₀, 10; rng, ndraws_elbo=100, ad_backend)
         @test q.Σ ≈ Σ rtol = 1e-1
     end
+    @testset "errors if no gradient provided" begin
+        logp(x) = -sum(abs2, x) / 2
+        x0 = randn(5)
+        prob = GalacticOptim.OptimizationProblem(logp, x0, nothing)
+        @test_throws ArgumentError pathfinder(prob, 10)
+        fun = GalacticOptim.OptimizationFunction(logp, GalacticOptim.AutoForwardDiff())
+        prob = GalacticOptim.OptimizationProblem(fun, x0, nothing)
+        @test_throws ArgumentError pathfinder(prob, 10)
+    end
 end
diff --git a/test/test_utils.jl b/test/test_utils.jl
index 69eaf468..29dcc4d6 100644
--- a/test/test_utils.jl
+++ b/test/test_utils.jl
@@ -1,12 +1,15 @@
 using LinearAlgebra
 using Pathfinder
+using Random
 
-function rand_pd_mat(T, n)
-    U = qr(randn(T, n, n)).Q
-    return Matrix(Symmetric(U * rand_pd_diag_mat(T, n) * U'))
+function rand_pd_mat(rng, T, n)
+    U = qr(randn(rng, T, n, n)).Q
+    return Matrix(Symmetric(U * rand_pd_diag_mat(rng, T, n) * U'))
 end
+rand_pd_mat(T, n) = rand_pd_mat(Random.default_rng(), T, n)
 
-rand_pd_diag_mat(T, n) = Diagonal(rand(T, n))
+rand_pd_diag_mat(rng, T, n) = Diagonal(rand(rng, T, n))
+rand_pd_diag_mat(T, n) = rand_pd_diag_mat(Random.default_rng(), T, n)
 
 # defined for testing purposes
 function Pathfinder.rand_and_logpdf(rng, dist, ndraws)
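For reference, a minimal usage sketch of the two convenience call signatures introduced in src/singlepath.jl. The toy log-density, dimension, and draw count below are illustrative only and are not part of the patch; `AD` is the alias imported from AbstractDifferentiation, as in src/Pathfinder.jl.

    using AbstractDifferentiation: AD
    using Pathfinder

    logp(x) = -sum(abs2, x) / 2   # toy log-density (standard normal up to a constant)
    θ₀ = randn(5)

    # gradient of `logp` computed automatically with the default ForwardDiff backend
    q, ϕ, logqϕ = pathfinder(logp, θ₀, 100)

    # or supply the gradient by hand and request a specific AD backend for the rest
    ∇logp(x) = -x
    q, ϕ, logqϕ = pathfinder(logp, ∇logp, θ₀, 100; ad_backend=AD.ForwardDiffBackend())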