diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml
index 6c23a8b9..0b3ca692 100644
--- a/.github/workflows/CI.yml
+++ b/.github/workflows/CI.yml
@@ -10,6 +10,7 @@ jobs:
       fail-fast: false
       matrix:
         version:
+          - '1.6'
           - '1'
         os:
           - ubuntu-latest
diff --git a/Project.toml b/Project.toml
index a51fff30..bd071485 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,10 +1,13 @@
 name = "Pathfinder"
 uuid = "b1d3bc72-d0e7-4279-b92f-7fa5d6d2d454"
 authors = ["Seth Axen and contributors"]
-version = "0.2.3"
+version = "0.3.0-DEV"
 
 [deps]
+AbstractDifferentiation = "c29ec348-61ec-40c8-8164-b8c60e9d9f3d"
 Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
+ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
+GalacticOptim = "a75be94c-b780-496d-a8a9-0878b188d577"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 Optim = "429524aa-4258-5aef-a3af-852621145aeb"
 PDMats = "90014a1f-27ba-587c-ab20-58faa44d9150"
@@ -15,17 +18,21 @@ StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
 StatsFuns = "4c63d2b9-4356-54db-8cca-17b64c39e42c"
 
 [compat]
+AbstractDifferentiation = "0.4"
 Distributions = "0.25"
+ForwardDiff = "0.10"
+GalacticOptim = "2"
 Optim = "1.4"
 PDMats = "0.11"
 PSIS = "0.2"
 StatsBase = "0.33"
 StatsFuns = "0.9"
-julia = "1"
+julia = "1.6"
 
 [extras]
-ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
+NLopt = "76087f3c-5699-56af-9a33-bf431cd00edd"
+ReverseDiff = "37e2e3b7-166d-5795-8a7a-e32c996b4267"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 
 [targets]
-test = ["ForwardDiff", "Test"]
+test = ["NLopt", "ReverseDiff", "Test"]
diff --git a/src/Pathfinder.jl b/src/Pathfinder.jl
index 5b6f70f5..3509ae50 100644
--- a/src/Pathfinder.jl
+++ b/src/Pathfinder.jl
@@ -1,7 +1,11 @@
 module Pathfinder
 
+using AbstractDifferentiation: AD
 using Distributions: Distributions
+# ensure that ForwardDiff is conditionally loaded by GalacticOptim
+using ForwardDiff: ForwardDiff
 using LinearAlgebra
+using GalacticOptim: GalacticOptim
 using Optim: Optim, LineSearches
 using PDMats: PDMats
 using PSIS: PSIS
@@ -21,7 +25,7 @@ const DEFAULT_OPTIMIZER = Optim.LBFGS(;
 )
 
 include("woodbury.jl")
-include("maximize.jl")
+include("optimize.jl")
 include("inverse_hessian.jl")
 include("mvnormal.jl")
 include("elbo.jl")
diff --git a/src/maximize.jl b/src/maximize.jl
deleted file mode 100644
index 43ba75bc..00000000
--- a/src/maximize.jl
+++ /dev/null
@@ -1,21 +0,0 @@
-function maximize_with_trace(f, ∇f, x₀, optimizer; kwargs...)
-    negf(x) = -f(x)
-    g!(y, x) = (y .= .-∇f(x))
-
-    function callback(states)
-        # terminate if optimization encounters NaNs
-        s = states[end]
-        md = s.metadata
-        return isnan(s.value) || any(isnan, md["x"]) || any(isnan, md["g(x)"])
-    end
-    options = Optim.Options(;
-        store_trace=true, extended_trace=true, callback=callback, kwargs...
-    )
-    res = Optim.optimize(negf, g!, x₀, optimizer, options)
-
-    xs = Optim.x_trace(res)::Vector{typeof(Optim.minimizer(res))}
-    fxs = -Optim.f_trace(res)
-    ∇fxs = map(tr -> -tr.metadata["g(x)"], Optim.trace(res))::typeof(xs)
-
-    return xs, fxs, ∇fxs
-end
diff --git a/src/multipath.jl b/src/multipath.jl
index 3615ffeb..37c8d019 100644
--- a/src/multipath.jl
+++ b/src/multipath.jl
@@ -1,7 +1,7 @@
 """
     multipathfinder(
         logp,
-        ∇logp,
+        [∇logp,]
        θ₀s::AbstractVector{AbstractVector{<:Real}},
         ndraws::Int;
         kwargs...
@@ -27,14 +27,17 @@ resulting draws better approximate draws from the target distribution ``p`` inst
 
 # Arguments
 - `logp`: a callable that computes the log-density of the target distribution.
-- `∇logp`: a callable that computes the gradient of `logp`.
-- `θ₀s`: vector of length `nruns` of initial points of length `dim` from which each
-    single-path Pathfinder run will begin
-- `ndraws`: number of approximate draws to return
+- `∇logp`: a callable that computes the gradient of `logp`. If not provided, `logp` is
+    automatically differentiated using the backend specified in `ad_backend`.
+- `θ₀s::AbstractVector{AbstractVector{<:Real}}`: vector of length `nruns` of initial points
+    of length `dim` from which each single-path Pathfinder run will begin
+- `ndraws::Int`: number of approximate draws to return
 
 # Keywords
+- `ad_backend=AD.ForwardDiffBackend()`: AbstractDifferentiation.jl AD backend.
 - `ndraws_per_run::Int=5`: The number of draws to take for each component before resampling.
 - `importance::Bool=true`: Perform Pareto smoothed importance resampling of draws.
+- `kwargs...` : Remaining keywords are forwarded to [`pathfinder`](@ref).
 
 # Returns
 - `q::Distributions.MixtureModel`: Uniformly weighted mixture of ELBO-maximizing
@@ -42,11 +45,38 @@ resulting draws better approximate draws from the target distribution ``p`` inst
 - `ϕ::AbstractMatrix{<:Real}`: approximate draws from target distribution with size
     `(dim, ndraws)`
 - `component_inds::Vector{Int}`: Indices ``k`` of components in ``q`` from which each column
-in `ϕ` was drawn.
+    in `ϕ` was drawn.
 """
+function multipathfinder(logp, θ₀s, ndraws; ad_backend=AD.ForwardDiffBackend(), kwargs...)
+    optim_fun = build_optim_function(logp; ad_backend)
+    return multipathfinder(optim_fun, θ₀s, ndraws; kwargs...)
+end
 function multipathfinder(
-    logp,
-    ∇logp,
+    logp, ∇logp, θ₀s, ndraws; ad_backend=AD.ForwardDiffBackend(), kwargs...
+)
+    optim_fun = build_optim_function(logp, ∇logp; ad_backend)
+    return multipathfinder(optim_fun, θ₀s, ndraws; kwargs...)
+end
+
+"""
+    multipathfinder(
+        f::GalacticOptim.OptimizationFunction,
+        θ₀s::AbstractVector{AbstractVector{<:Real}},
+        ndraws::Int;
+        kwargs...,
+    )
+
+Filter samples from a mixture of multivariate normal distributions fit using `pathfinder`.
+
+`f` is a user-created optimization function that represents the negative log density with
+its gradient and must have the necessary features (e.g. a Hessian function or specified
+automatic differentiation type) for the chosen optimization algorithm. For details, see
+[GalacticOptim.jl: OptimizationFunction](https://galacticoptim.sciml.ai/stable/API/optimization_function/).
+
+See [`multipathfinder`](@ref) for a description of remaining arguments.
+"""
+function multipathfinder(
+    optim_fun::GalacticOptim.OptimizationFunction,
     θ₀s,
     ndraws;
     ndraws_per_run::Int=5,
@@ -54,14 +84,18 @@ function multipathfinder(
     importance::Bool=true,
     kwargs...,
 )
+    if optim_fun.grad === nothing || optim_fun.grad isa Bool
+        throw(ArgumentError("optimization function must define a gradient function."))
+    end
     if ndraws > ndraws_per_run * length(θ₀s)
         @warn "More draws requested than total number of draws across replicas. Draws will not be unique."
     end
 
+    logp(x) = -optim_fun.f(x, nothing)
     # run pathfinder independently from each starting point
     # TODO: allow to be parallelized
     res = map(θ₀s) do θ₀
-        return pathfinder(logp, ∇logp, θ₀, ndraws_per_run; rng=rng, kwargs...)
+        return pathfinder(optim_fun, θ₀, ndraws_per_run; rng, kwargs...)
     end
     qs = reduce(vcat, first.(res))
     ϕs = reduce(hcat, getindex.(res, 2))
diff --git a/src/optimize.jl b/src/optimize.jl
new file mode 100644
index 00000000..10bedd71
--- /dev/null
+++ b/src/optimize.jl
@@ -0,0 +1,61 @@
+function build_optim_function(f; ad_backend=AD.ForwardDiffBackend())
+    ∇f(x) = only(AD.gradient(ad_backend, f, x))
+    return build_optim_function(f, ∇f; ad_backend)
+end
+function build_optim_function(f, ∇f; ad_backend=AD.ForwardDiffBackend())
+    # because we need explicit access to grad, we generate these ourselves instead of using
+    # GalacticOptim's auto-AD feature.
+    # TODO: switch to caching API if available, see
+    # https://github.com/JuliaDiff/AbstractDifferentiation.jl/issues/41
+    function grad(res, x, p...)
+        ∇fx = ∇f(x)
+        @. res = -∇fx
+        return res
+    end
+    function hess(res, x, p...)
+        H = only(AD.hessian(ad_backend, f, x))
+        @. res = -H
+        return res
+    end
+    function hv(res, x, v, p...)
+        Hv = only(AD.lazy_hessian(ad_backend, f, x) * v)
+        @. res = -Hv
+        return res
+    end
+    return GalacticOptim.OptimizationFunction((x, p...) -> -f(x); grad, hess, hv)
+end
+
+function build_optim_problem(optim_fun, x₀; kwargs...)
+    return GalacticOptim.OptimizationProblem(optim_fun, x₀, nothing; kwargs...)
+end
+
+function optimize_with_trace(prob, optimizer)
+    u0 = prob.u0
+    fun = prob.f
+    grad! = fun.grad
+    function ∇f(x)
+        ∇fx = similar(x)
+        grad!(∇fx, x, nothing)
+        rmul!(∇fx, -1)
+        return ∇fx
+    end
+    # caches for the trace of x, f(x), and ∇f(x)
+    xs = typeof(u0)[]
+    fxs = typeof(fun.f(u0, nothing))[]
+    ∇fxs = typeof(similar(u0))[]
+    function callback(x, nfx, args...)
+        # NOTE: GalacticOptim doesn't have an interface for accessing the gradient trace,
+        # so we need to recompute it ourselves
+        # see https://github.com/SciML/GalacticOptim.jl/issues/149
+        ∇fx = ∇f(x)
+        # terminate if optimization encounters NaNs
+        (isnan(nfx) || any(isnan, x) || any(isnan, ∇fx)) && return true
+        # some backends mutate x, so we must copy it
+        push!(xs, copy(x))
+        push!(fxs, -nfx)
+        push!(∇fxs, ∇fx)
+        return false
+    end
+    GalacticOptim.solve(prob, optimizer; cb=callback)
+    return xs, fxs, ∇fxs
+end
diff --git a/src/singlepath.jl b/src/singlepath.jl
index 3ef43fc9..f910ad3b 100644
--- a/src/singlepath.jl
+++ b/src/singlepath.jl
@@ -1,49 +1,112 @@
 """
-    pathfinder(logp, ∇logp, θ₀::AbstractVector{<:Real}, ndraws::Int; kwargs...)
+    pathfinder(logp[, ∇logp], θ₀::AbstractVector{<:Real}, ndraws::Int; kwargs...)
 
-Find the best multivariate normal approximation encountered while optimizing `logp`.
+Find the best multivariate normal approximation encountered while maximizing `logp`.
 
-The multivariate normal approximation returned is the one that maximizes the evidence lower
-bound (ELBO), or equivalently, minimizes the KL divergence between
+From an optimization trajectory, Pathfinder constructs a sequence of (multivariate normal)
+approximations to the distribution specified by `logp`. The approximation that maximizes the
+evidence lower bound (ELBO), or equivalently, minimizes the KL divergence between the
+approximation and the true distribution, is returned.
+
+The covariance of the multivariate normal distribution is an inverse Hessian approximation
+constructed using at most the previous `history_length` steps.
 
 # Arguments
 - `logp`: a callable that computes the log-density of the target distribution.
-- `∇logp`: a callable that computes the gradient of `logp`.
+- `∇logp`: a callable that computes the gradient of `logp`. If not provided, `logp` is
+    automatically differentiated using the backend specified in `ad_backend`.
 - `θ₀`: initial point of length `dim` from which to begin optimization
 - `ndraws`: number of approximate draws to return
 
 # Keywords
+- `ad_backend=AD.ForwardDiffBackend()`: AbstractDifferentiation.jl AD backend.
 - `rng::Random.AbstractRNG`: The random number generator to be used for drawing samples
-- `optimizer::Optim.AbstractOptimizer`: Optimizer to be used for constructing trajectory.
-Defaults to `Optim.LBFGS(; m=$DEFAULT_HISTORY_LENGTH, linesearch=$DEFAULT_LINE_SEARCH)`.
+- `optimizer`: Optimizer to be used for constructing trajectory. Can be any optimizer
+    compatible with GalacticOptim, so long as it supports callbacks. Defaults to
+    `Optim.LBFGS(; m=$DEFAULT_HISTORY_LENGTH, linesearch=LineSearches.MoreThuente())`. See
+    the [GalacticOptim.jl documentation](https://galacticoptim.sciml.ai/stable) for details.
 - `history_length::Int=$DEFAULT_HISTORY_LENGTH`: Size of the history used to approximate the
-inverse Hessian. This should only be set when `optimizer` is not an `Optim.LBFGS`.
+    inverse Hessian. This should only be set when `optimizer` is not an `Optim.LBFGS`.
 - `ndraws_elbo::Int=5`: Number of draws used to estimate the ELBO
-- `kwargs...` : Remaining keywords are forwarded to `Optim.Options`.
+- `kwargs...` : Remaining keywords are forwarded to `GalacticOptim.OptimizationProblem`.
 
 # Returns
 - `q::Distributions.MvNormal`: ELBO-maximizing multivariate normal distribution
 - `ϕ::AbstractMatrix{<:Real}`: draws from multivariate normal with size `(dim, ndraws)`
 - `logqϕ::Vector{<:Real}`: log-density of multivariate normal at columns of `ϕ`
 """
+function pathfinder(logp, θ₀, ndraws; ad_backend=AD.ForwardDiffBackend(), kwargs...)
+    optim_fun = build_optim_function(logp; ad_backend)
+    return pathfinder(optim_fun, θ₀, ndraws; kwargs...)
+end
+function pathfinder(logp, ∇logp, θ₀, ndraws; ad_backend=AD.ForwardDiffBackend(), kwargs...)
+    optim_fun = build_optim_function(logp, ∇logp; ad_backend)
+    return pathfinder(optim_fun, θ₀, ndraws; kwargs...)
+end
+
+"""
+    pathfinder(
+        f::GalacticOptim.OptimizationFunction,
+        θ₀::AbstractVector{<:Real},
+        ndraws::Int;
+        kwargs...,
+    )
+
+Find the best multivariate normal approximation encountered while minimizing `f`.
+
+`f` is a user-created optimization function that represents the negative log density with
+its gradient and must have the necessary features (e.g. a Hessian function or specified
+automatic differentiation type) for the chosen optimization algorithm. For details, see
+[GalacticOptim.jl: OptimizationFunction](https://galacticoptim.sciml.ai/stable/API/optimization_function/).
+
+See [`pathfinder`](@ref) for a description of remaining arguments.
+"""
 function pathfinder(
-    logp,
-    ∇logp,
+    optim_fun::GalacticOptim.OptimizationFunction,
     θ₀,
     ndraws;
     rng::Random.AbstractRNG=Random.default_rng(),
-    optimizer::Optim.AbstractOptimizer=DEFAULT_OPTIMIZER,
+    optimizer=DEFAULT_OPTIMIZER,
     history_length::Int=optimizer isa Optim.LBFGS ? optimizer.m : DEFAULT_HISTORY_LENGTH,
     ndraws_elbo::Int=5,
     kwargs...,
 )
+    optim_prob = build_optim_problem(optim_fun, θ₀; kwargs...)
+    return pathfinder(optim_prob, ndraws; rng, optimizer, history_length, ndraws_elbo)
+end
+
+"""
+    pathfinder(prob::GalacticOptim.OptimizationProblem, ndraws::Int; kwargs...)
+
+Find the best multivariate normal approximation encountered while solving `prob`.
+
+`prob` is a user-created optimization problem that represents the negative log density with
+its gradient and an initial position, and must have the necessary features (e.g. a Hessian
+function or specified automatic differentiation type) for the chosen optimization algorithm.
+For details, see
+[GalacticOptim.jl: Defining OptimizationProblems](https://galacticoptim.sciml.ai/stable/API/optimization_problem/).
+
+See [`pathfinder`](@ref) for a description of remaining arguments.
+"""
+function pathfinder(
+    optim_prob::GalacticOptim.OptimizationProblem,
+    ndraws;
+    rng::Random.AbstractRNG=Random.default_rng(),
+    optimizer=DEFAULT_OPTIMIZER,
+    history_length::Int=optimizer isa Optim.LBFGS ? optimizer.m : DEFAULT_HISTORY_LENGTH,
+    ndraws_elbo::Int=5,
+)
+    if optim_prob.f.grad === nothing || optim_prob.f.grad isa Bool
+        throw(ArgumentError("optimization function must define a gradient function."))
+    end
+    logp(x) = -optim_prob.f.f(x, nothing)
     # compute trajectory
-    θs, logpθs, ∇logpθs = maximize_with_trace(logp, ∇logp, θ₀, optimizer; kwargs...)
+    θs, logpθs, ∇logpθs = optimize_with_trace(optim_prob, optimizer)
     L = length(θs) - 1
     @assert L + 1 == length(logpθs) == length(∇logpθs)
 
     # fit mv-normal distributions to trajectory
-    qs = fit_mvnormals(θs, ∇logpθs; history_length=history_length)
+    qs = fit_mvnormals(θs, ∇logpθs; history_length)
 
     # find ELBO-maximizing distribution
     lopt, elbo, ϕ, logqϕ = maximize_elbo(rng, logp, qs[2:end], ndraws_elbo)
diff --git a/test/inverse_hessian.jl b/test/inverse_hessian.jl
index cc58f641..d61d2cf6 100644
--- a/test/inverse_hessian.jl
+++ b/test/inverse_hessian.jl
@@ -48,18 +48,20 @@ end
     n = 10
     history_length = 5
     logp(x) = logp_banana(x)
-    ∇logp(x) = ForwardDiff.gradient(logp, x)
     nocedal_wright_scaling(α, s, y) = fill!(similar(α), dot(y, s) / sum(abs2, y))
 
     θ₀ = 10 * randn(n)
+    ad_backend = AD.ForwardDiffBackend()
+    fun = Pathfinder.build_optim_function(logp; ad_backend)
+    prob = Pathfinder.build_optim_problem(fun, θ₀)
     optimizer = Optim.LBFGS(;
         m=history_length, linesearch=Optim.LineSearches.MoreThuente()
     )
-    θs, logpθs, ∇logpθs = Pathfinder.maximize_with_trace(logp, ∇logp, θ₀, optimizer)
+    θs, logpθs, ∇logpθs = Pathfinder.optimize_with_trace(prob, optimizer)
 
     # run lbfgs_inverse_hessians with the same initialization as Optim.LBFGS
     Hs = Pathfinder.lbfgs_inverse_hessians(
-        θs, ∇logpθs; history_length=history_length, Hinit=nocedal_wright_scaling
+        θs, ∇logpθs; history_length, Hinit=nocedal_wright_scaling
     )
     ss = diff(θs)
     ps = (Hs .* ∇logpθs)[1:(end - 1)]
diff --git a/test/maximize.jl b/test/maximize.jl
deleted file mode 100644
index 59444516..00000000
--- a/test/maximize.jl
+++ /dev/null
@@ -1,25 +0,0 @@
-using ForwardDiff
-using LinearAlgebra
-using Optim
-using Pathfinder
-using Test
-
-include("test_utils.jl")
-
-@testset "maximize_with_trace" begin
-    n = 10
-    P = inv(rand_pd_mat(Float64, n))
-    μ = randn(n)
-    f(x) = -dot(x - μ, P, x - μ) / 2
-    ∇f(x) = ForwardDiff.gradient(f, x)
-    x0 = randn(n)
-
-    @testset "$Topt" for Topt in (Optim.BFGS, Optim.LBFGS, Optim.ConjugateGradient)
-        optimizer = Topt()
-        xs, fxs, ∇fxs = Pathfinder.maximize_with_trace(f, ∇f, x0, optimizer)
-        @test xs[1] ≈ x0
-        @test xs[end] ≈ μ
-        @test fxs ≈ f.(xs)
-        @test ∇fxs ≈ ∇f.(xs)
-    end
-end
diff --git a/test/multipath.jl b/test/multipath.jl
index cb279ed5..885287af 100644
--- a/test/multipath.jl
+++ b/test/multipath.jl
@@ -1,23 +1,28 @@
-using LinearAlgebra
+using AbstractDifferentiation
 using Distributions
 using ForwardDiff
+using LinearAlgebra
+using GalacticOptim
 using Pathfinder
+using ReverseDiff
 using Test
 
 @testset "multi path pathfinder" begin
     @testset "MvNormal" begin
+        rng = MersenneTwister(42)
         n = 10
         nruns = 20
         ndraws = 1000_000
         ndraws_per_run = ndraws ÷ nruns
-        Σ = rand_pd_mat(Float64, n)
-        μ = randn(n)
+        Σ = rand_pd_mat(rng, Float64, n)
+        μ = randn(rng, n)
         d = MvNormal(μ, Σ)
         logp(x) = logpdf(d, x)
         ∇logp(x) = ForwardDiff.gradient(logp, x)
-        x₀s = [rand(Uniform(-2, 2), n) for _ in 1:nruns]
+        x₀s = [rand(rng, Uniform(-2, 2), n) for _ in 1:nruns]
+        rng = MersenneTwister(76)
         q, ϕ, component_ids = multipathfinder(
-            logp, ∇logp, x₀s, ndraws; ndraws_elbo=100, ndraws_per_run=ndraws_per_run
+            logp, ∇logp, x₀s, ndraws; ndraws_elbo=100, ndraws_per_run, rng
         )
         @test q isa MixtureModel
         @test ncomponents(q) == nruns
@@ -40,5 +45,20 @@ using Test
             atol = sqrt(Σ[i, i] * Σ[j, j] / ndraws) * 10 * multiplier
             @test isapprox(Σ_hat[i, j], Σ[i, j], atol=atol)
         end
+
+        rng = MersenneTwister(76)
+        ad_backend = AD.ReverseDiffBackend()
+        q2, ϕ2, component_ids2 = multipathfinder(
+            logp, x₀s, ndraws; ndraws_elbo=100, ndraws_per_run, rng, ad_backend
+        )
+        for (c1, c2) in zip(q.components, q2.components)
+            @test c1 ≈ c2 atol = 1e-6
+        end
+    end
+    @testset "errors if no gradient provided" begin
+        logp(x) = -sum(abs2, x) / 2
+        x0s = [randn(5) for _ in 1:10]
+        fun = GalacticOptim.OptimizationFunction(logp, GalacticOptim.AutoForwardDiff())
+        @test_throws ArgumentError multipathfinder(fun, x0s, 10)
     end
 end
diff --git a/test/mvnormal.jl b/test/mvnormal.jl
index f94b1fe7..87a3f793 100644
--- a/test/mvnormal.jl
+++ b/test/mvnormal.jl
@@ -1,3 +1,4 @@
+using AbstractDifferentiation
 using Distributions
 using ForwardDiff
 using Optim
@@ -11,12 +12,15 @@ include("test_utils.jl")
 @testset "fit_mvnormals" begin
     n = 10
     logp(x) = logp_banana(x)
-    ∇logp(x) = ForwardDiff.gradient(logp, x)
     θ₀ = 10 * randn(n)
+    ad_backend = AD.ForwardDiffBackend()
+    fun = Pathfinder.build_optim_function(logp; ad_backend)
+    prob = Pathfinder.build_optim_problem(fun, θ₀)
     optimizer = Optim.LBFGS()
-    θs, logpθs, ∇logpθs = Pathfinder.maximize_with_trace(logp, ∇logp, θ₀, optimizer)
-    Σs = Pathfinder.lbfgs_inverse_hessians(θs, ∇logpθs; history_length=optimizer.m)
-    dists = @inferred Pathfinder.fit_mvnormals(θs, ∇logpθs; history_length=optimizer.m)
+    history_length = optimizer.m
+    θs, logpθs, ∇logpθs = Pathfinder.optimize_with_trace(prob, optimizer)
+    Σs = Pathfinder.lbfgs_inverse_hessians(θs, ∇logpθs; history_length)
+    dists = @inferred Pathfinder.fit_mvnormals(θs, ∇logpθs; history_length)
     @test dists isa Vector{<:MvNormal{Float64,<:Pathfinder.WoodburyPDMat}}
     @test Σs ≈ getproperty.(dists, :Σ)
     @test θs .+ Σs .* ∇logpθs ≈ getproperty.(dists, :μ)
diff --git a/test/optimize.jl b/test/optimize.jl
new file mode 100644
index 00000000..f3bac899
--- /dev/null
+++ b/test/optimize.jl
@@ -0,0 +1,84 @@
+using AbstractDifferentiation
+using ForwardDiff
+using GalacticOptim
+using LinearAlgebra
+using NLopt
+using Optim
+using Pathfinder
+using Test
+
+include("test_utils.jl")
+
+@testset "build_optim_function" begin
+    n = 20
+    f(x) = logp_banana(x)
+    ∇f(x) = ForwardDiff.gradient(f, x)
+    ad_backend = AD.ForwardDiffBackend()
+    x = randn(n)
+    funs = [
+        "user-provided gradient" => Pathfinder.build_optim_function(f, ∇f; ad_backend),
+        "automatic gradient" => Pathfinder.build_optim_function(f; ad_backend),
+    ]
+    @testset "$name" for (name, fun) in funs
+        @test fun isa GalacticOptim.OptimizationFunction
+        @test fun.f(x) ≈ -f(x)
+        ∇fx = similar(x)
+        fun.grad(∇fx, x, nothing)
+        @test ∇fx ≈ -∇f(x)
+        H = similar(x, n, n)
+        fun.hess(H, x, nothing)
+        @test H ≈ -ForwardDiff.hessian(f, x)
+        Hv = similar(x)
+        v = randn(n)
+        fun.hv(Hv, x, v, nothing)
+        @test Hv ≈ H * v
+    end
+end
+
+@testset "build_optim_problem" begin
+    n = 20
+    f(x) = logp_banana(x)
+    ∇f(x) = ForwardDiff.gradient(f, x)
+    ad_backend = AD.ForwardDiffBackend()
+    x0 = randn(n)
+    fun = Pathfinder.build_optim_function(f; ad_backend)
+    prob = Pathfinder.build_optim_problem(fun, x0)
+    @test prob isa GalacticOptim.OptimizationProblem
+    @test prob.f === fun
+    @test prob.u0 == x0
+    @test prob.p === nothing
+end
+
+@testset "optimize_with_trace" begin
+    n = 10
+    P = inv(rand_pd_mat(Float64, n))
+    μ = randn(n)
+    f(x) = -dot(x - μ, P, x - μ) / 2
+    ∇f(x) = ForwardDiff.gradient(f, x)
+
+    x0 = randn(n)
+    ad_backend = AD.ForwardDiffBackend()
+    fun = Pathfinder.build_optim_function(f; ad_backend)
+    prob = Pathfinder.build_optim_problem(fun, x0)
+
+    optimizers = [
+        Optim.BFGS(), Optim.LBFGS(), Optim.ConjugateGradient(), NLopt.Opt(:LD_LBFGS, n)
+    ]
+    @testset "$(typeof(optimizer))" for optimizer in optimizers
+        xs, fxs, ∇fxs = Pathfinder.optimize_with_trace(prob, optimizer)
+        @test xs[1] ≈ x0
+        @test xs[end] ≈ μ
+        @test fxs ≈ f.(xs)
+        @test ∇fxs ≈ ∇f.(xs) atol = 1e-4
+
+        if !(optimizer isa NLopt.Opt)
+            options = Optim.Options(; store_trace=true, extended_trace=true)
+            res = Optim.optimize(
+                x -> -f(x), (y, x) -> copyto!(y, -∇f(x)), x0, optimizer, options
+            )
+            @test Optim.iterations(res) == length(xs) - 1
+            @test Optim.x_trace(res) ≈ xs
+            @test Optim.minimizer(res) ≈ xs[end]
+        end
+    end
+end
diff --git a/test/runtests.jl b/test/runtests.jl
index 61efbc33..309d8312 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -3,7 +3,7 @@ using Test
 
 @testset "Pathfinder.jl" begin
     include("woodbury.jl")
-    include("maximize.jl")
+    include("optimize.jl")
     include("inverse_hessian.jl")
     include("mvnormal.jl")
     include("elbo.jl")
diff --git a/test/singlepath.jl b/test/singlepath.jl
index 58fdbd8c..6fda9f90 100644
--- a/test/singlepath.jl
+++ b/test/singlepath.jl
@@ -1,6 +1,10 @@
-using LinearAlgebra
+using AbstractDifferentiation
 using Distributions
+using ForwardDiff
+using GalacticOptim
+using LinearAlgebra
 using Pathfinder
+using ReverseDiff
 using Test
 
 @testset "single path pathfinder" begin
@@ -40,7 +44,17 @@ using Test
         ∇logp(x) = -(P * x)
         x₀ = [2.08, 3.77, -1.26, -0.97, -3.91]
         rng = MersenneTwister(38)
-        q, _, _ = pathfinder(logp, ∇logp, x₀, 10; rng=rng, ndraws_elbo=100)
+        ad_backend = AD.ReverseDiffBackend()
+        q, _, _ = @inferred pathfinder(logp, x₀, 10; rng, ndraws_elbo=100, ad_backend)
         @test q.Σ ≈ Σ rtol = 1e-1
     end
+    @testset "errors if no gradient provided" begin
+        logp(x) = -sum(abs2, x) / 2
+        x0 = randn(5)
+        prob = GalacticOptim.OptimizationProblem(logp, x0, nothing)
+        @test_throws ArgumentError pathfinder(prob, 10)
+        fun = GalacticOptim.OptimizationFunction(logp, GalacticOptim.AutoForwardDiff())
+        prob = GalacticOptim.OptimizationProblem(fun, x0, nothing)
+        @test_throws ArgumentError pathfinder(prob, 10)
+    end
 end
diff --git a/test/test_utils.jl b/test/test_utils.jl
index 69eaf468..29dcc4d6 100644
--- a/test/test_utils.jl
+++ b/test/test_utils.jl
@@ -1,12 +1,15 @@
 using LinearAlgebra
 using Pathfinder
+using Random
 
-function rand_pd_mat(T, n)
-    U = qr(randn(T, n, n)).Q
-    return Matrix(Symmetric(U * rand_pd_diag_mat(T, n) * U'))
+function rand_pd_mat(rng, T, n)
+    U = qr(randn(rng, T, n, n)).Q
+    return Matrix(Symmetric(U * rand_pd_diag_mat(rng, T, n) * U'))
 end
+rand_pd_mat(T, n) = rand_pd_mat(Random.default_rng(), T, n)
 
-rand_pd_diag_mat(T, n) = Diagonal(rand(T, n))
+rand_pd_diag_mat(rng, T, n) = Diagonal(rand(rng, T, n))
+rand_pd_diag_mat(T, n) = rand_pd_diag_mat(Random.default_rng(), T, n)
 
 # defined for testing purposes
 function Pathfinder.rand_and_logpdf(rng, dist, ndraws)
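For reference, a minimal usage sketch of the two convenience call signatures introduced in src/singlepath.jl. The toy log-density, dimension, and draw count below are illustrative only and are not part of the patch; `AD` is the alias imported from AbstractDifferentiation, as in src/Pathfinder.jl.

    using AbstractDifferentiation: AD
    using Pathfinder

    logp(x) = -sum(abs2, x) / 2   # toy log-density (standard normal up to a constant)
    θ₀ = randn(5)

    # gradient of `logp` computed automatically with the default ForwardDiff backend
    q, ϕ, logqϕ = pathfinder(logp, θ₀, 100)

    # or supply the gradient by hand and request a specific AD backend for the rest
    ∇logp(x) = -x
    q, ϕ, logqϕ = pathfinder(logp, ∇logp, θ₀, 100; ad_backend=AD.ForwardDiffBackend())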