From f0a409191627029561847476b5093c97118a11ee Mon Sep 17 00:00:00 2001 From: odunbar Date: Mon, 5 Jun 2023 03:51:50 -0700 Subject: [PATCH] update to MIN ekp requirement of 1.1 add MT option to predict, and add schedulers in optimizer options, as well as early termination bugfix get_logpdf -> logpdf add scaling param for default prior remove unused function call allow scaling of prior New Lorenz example setup, compat with new timestepping learning compat with new scaling inc all cases aspect ratio and printing scalings for decent recovery julia format format --- Project.toml | 4 +- .../scalar_optimize_and_plot_RF.jl | 10 +- .../vector_optimize_and_plot_RF.jl | 27 ++++- examples/Lorenz/GModel_common.jl | 71 ++++++----- examples/Lorenz/calibrate.jl | 112 +++++++++++++----- examples/Lorenz/emulate_sample.jl | 27 ++++- src/Emulator.jl | 5 +- src/MarkovChainMonteCarlo.jl | 4 +- src/RandomFeature.jl | 62 ++++++++-- src/ScalarRandomFeature.jl | 59 +++++++-- src/VectorRandomFeature.jl | 62 ++++++++-- 11 files changed, 328 insertions(+), 115 deletions(-) diff --git a/Project.toml b/Project.toml index d462a684c..47cead28d 100644 --- a/Project.toml +++ b/Project.toml @@ -29,14 +29,14 @@ AbstractMCMC = "3.3, 4" AdvancedMH = "0.6, 0.7" Conda = "1.7" Distributions = "0.24, 0.25" +EnsembleKalmanProcesses = "1.1" DocStringExtensions = "0.8, 0.9" -EnsembleKalmanProcesses = "0.8, 0.9, 0.10, 0.11, 0.12, 0.13, 0.14, 1.0" GaussianProcesses = "0.12" MCMCChains = "4.14, 5, 6" PyCall = "1.93" +RandomFeatures = "0.3" ScikitLearn = "0.6, 0.7" StatsBase = "0.33, 0.34" -RandomFeatures = "0.3" julia = "1.6, 1.7, 1.8" [extras] diff --git a/examples/Emulator/RandomFeature/scalar_optimize_and_plot_RF.jl b/examples/Emulator/RandomFeature/scalar_optimize_and_plot_RF.jl index d202f0391..b379ff689 100644 --- a/examples/Emulator/RandomFeature/scalar_optimize_and_plot_RF.jl +++ b/examples/Emulator/RandomFeature/scalar_optimize_and_plot_RF.jl @@ -126,8 +126,14 @@ end # setup random features n_features = 180 - -srfi = ScalarRandomFeatureInterface(n_features, p, diagonalize_input = diagonalize_input) +optimizer_options = Dict("n_iteration" => 20, "prior_in_scale" => 0.1, "verbose" => true) #"Max" iterations (may do less) + +srfi = ScalarRandomFeatureInterface( + n_features, + p, + diagonalize_input = diagonalize_input, + optimizer_options = optimizer_options, +) emulator = Emulator(srfi, iopairs, obs_noise_cov = Σ, normalize_inputs = true) println("build RF with $n training points and $(n_features) random features.") diff --git a/examples/Emulator/RandomFeature/vector_optimize_and_plot_RF.jl b/examples/Emulator/RandomFeature/vector_optimize_and_plot_RF.jl index 05e4f348f..08d045c44 100644 --- a/examples/Emulator/RandomFeature/vector_optimize_and_plot_RF.jl +++ b/examples/Emulator/RandomFeature/vector_optimize_and_plot_RF.jl @@ -133,18 +133,37 @@ function main() # setup random features n_features = 200 + if case ∈ ["svd-diag", "svd-nondiag"] + optimizer_options = + Dict("n_iteration" => 20, "prior_in_scale" => 0.01, "prior_out_scale" => 1.0, "verbose" => true) #"Max" iterations (may do less) + else # without svd + optimizer_options = + Dict("n_iteration" => 20, "prior_in_scale" => 0.01, "prior_out_scale" => 0.01, "verbose" => true) #"Max" iterations (may do less) + end if case == "svd-diag" - vrfi = VectorRandomFeatureInterface(n_features, p, d, diagonalize_output = true) + vrfi = VectorRandomFeatureInterface( + n_features, + p, + d, + diagonalize_output = true, + optimizer_options = optimizer_options, + ) emulator = Emulator(vrfi, 
iopairs, obs_noise_cov = Σ, normalize_inputs = true) elseif case == "svd-nondiag" - vrfi = VectorRandomFeatureInterface(n_features, p, d) + vrfi = VectorRandomFeatureInterface(n_features, p, d, optimizer_options = optimizer_options) emulator = Emulator(vrfi, iopairs, obs_noise_cov = Σ, normalize_inputs = true) elseif case == "nosvd-diag" - vrfi = VectorRandomFeatureInterface(n_features, p, d, diagonalize_output = true) + vrfi = VectorRandomFeatureInterface( + n_features, + p, + d, + diagonalize_output = true, + optimizer_options = optimizer_options, + ) emulator = Emulator(vrfi, iopairs, obs_noise_cov = Σ, normalize_inputs = true, decorrelate = false) elseif case == "nosvd-nondiag" - vrfi = VectorRandomFeatureInterface(n_features, p, d) + vrfi = VectorRandomFeatureInterface(n_features, p, d, optimizer_options = optimizer_options) emulator = Emulator(vrfi, iopairs, obs_noise_cov = Σ, normalize_inputs = true, decorrelate = false) end println("build RF with $n training points and $(n_features) random features.") diff --git a/examples/Lorenz/GModel_common.jl b/examples/Lorenz/GModel_common.jl index 2f5993f98..d3a99de5c 100644 --- a/examples/Lorenz/GModel_common.jl +++ b/examples/Lorenz/GModel_common.jl @@ -128,22 +128,25 @@ end # tend: End of simulation Float64(1), nstep: function lorenz_solve(settings::LSettings, params::LParams) # Initialize - nstep = Int32(ceil(settings.tend / settings.dt)) + nstep = Int32(ceil((settings.tend - settings.t_start) / settings.dt)) xn = zeros(Float64, settings.N, nstep) t = zeros(Float64, nstep) # Initial perturbation X = fill(Float64(0.0), settings.N) X = X + settings.Fp + Xbuffer = zeros(4, settings.N) # March forward in time for j in 1:nstep - t[j] = settings.dt * j + t[j] = settings.t_start + settings.dt * j + #use view to update a slice + # RK4! 
modifies first and last arguments if settings.dynamics == 1 - X = RK4(X, settings.dt, settings.N, params.F) + RK4!(X, settings.dt, settings.N, params.F, Xbuffer) elseif settings.dynamics == 2 F_local = params.F + params.A * sin(params.ω * t[j]) - X = RK4(X, settings.dt, settings.N, F_local) + RK4!(X, settings.dt, settings.N, F_local, Xbuffer) end - xn[:, j] = X + xn[:, j] += X end # Output return xn, t @@ -151,33 +154,38 @@ function lorenz_solve(settings::LSettings, params::LParams) end # Lorenz 96 system -# f = dx/dt +# f = dx/dt overwriting first argument # Inputs: x: state, N: longitude steps, F: forcing -function f(x, N, F) - f = zeros(Float64, N) +function f!(xnew, x, N, F) + # Loop over N positions for i in 3:(N - 1) - f[i] = -x[i - 2] * x[i - 1] + x[i - 1] * x[i + 1] - x[i] + F + xnew[i] = -x[i - 2] * x[i - 1] + x[i - 1] * x[i + 1] - x[i] + F end # Periodic boundary conditions - f[1] = -x[N - 1] * x[N] + x[N] * x[2] - x[1] + F - f[2] = -x[N] * x[1] + x[1] * x[3] - x[2] + F - f[N] = -x[N - 2] * x[N - 1] + x[N - 1] * x[1] - x[N] + F + xnew[1] = -x[N - 1] * x[N] + x[N] * x[2] - x[1] + F + xnew[2] = -x[N] * x[1] + x[1] * x[3] - x[2] + F + xnew[N] = -x[N - 2] * x[N - 1] + x[N - 1] * x[1] - x[N] + F # Output - return f + return nothing end -# RK4 solve -function RK4(xold, dt, N, F) - # Predictor steps - k1 = f(xold, N, F) - k2 = f(xold + k1 * dt / 2.0, N, F) - k3 = f(xold + k2 * dt / 2.0, N, F) - k4 = f(xold + k3 * dt, N, F) +# RK4 solve, overwriting first argument +function RK4!(xold, dt, N, F, buffer) + #buffer is 4 x N zeros + @assert size(buffer) == (4, N) + + # Predictor steps updates the final argument + # must use view to update a slice in-place + f!(view(buffer, 1, :), xold, N, F) #k1 + f!(view(buffer, 2, :), xold + buffer[1, :] * dt / 2.0, N, F) #k2 + f!(view(buffer, 3, :), xold + buffer[2, :] * dt / 2.0, N, F) #k3 + f!(view(buffer, 4, :), xold + buffer[3, :] * dt, N, F) #k4 # Step - xnew = xold + (dt / 6.0) * (k1 + 2.0 * k2 + 2.0 * k3 + k4) - # Output - return xnew + #xnew = xold + (dt / 6.0) * (k1 + 2.0 * k2 + 2.0 * k3 + k4) + xold .+= (dt / 6.0) * (buffer[1, :] + 2.0 * buffer[2, :] + 2.0 * buffer[3, :] + buffer[4, :]) + + return nothing end function regression(X, y) @@ -216,7 +224,8 @@ end ############################### function stats(settings, xn, t) # Define averaging indices range - indices = findall(x -> (x > settings.t_start) && (x < settings.t_start + settings.T), t) + indices = findall(x -> (x > settings.t_start) && (x < settings.tend), t) + # Define statistics of interest if settings.stats_type == 1 # Mean # Average in time and over longitude @@ -317,25 +326,15 @@ function stats(settings, xn, t) var_slide = zeros(N) t_slide = zeros(nt, N) for i in 1:N - var_slide[i] = var(vcat(xn[:, indices[((i - 1) * nt + 1):(i * nt)]]...)) + var_slide[i] = var(xn[:, indices[((i - 1) * nt + 1):(i * nt)]]) t_slide[:, i] = t[((i - 1) * nt + 1):(i * nt)] end - # Polynomial fit over a batch of sliding windows - Tfit = Int64(settings.Tfit) + Tfit = Int64(settings.Tfit) # (a multiple of Ts) NT = Int64(N / Tfit) - a1 = zeros(NT) - a2 = zeros(NT) - t_start = zeros(NT) - y = zeros(Tfit, NT) - ty = zeros(Tfit, NT) ym = zeros(NT) for i in 1:NT ind_low = (i - 1) * Tfit + 1 ind_high = i * Tfit - t_start[i] = t_slide[1, ind_low] - ty[:, i] = t_slide[1, ind_low:ind_high] .- t_start[i] - a1[i], a2[i] = regression(ty[:, i], var_slide[ind_low:ind_high]) - y[:, i] = a1[i] .* ty[:, i] .+ a2[i] ym[i] = mean(var_slide[ind_low:ind_high]) end # Combine diff --git a/examples/Lorenz/calibrate.jl 
b/examples/Lorenz/calibrate.jl index b337a653d..a799899a0 100644 --- a/examples/Lorenz/calibrate.jl +++ b/examples/Lorenz/calibrate.jl @@ -18,8 +18,7 @@ const PD = EKP.ParameterDistributions function main() rng_seed = 4137 - Random.seed!(rng_seed) - + rng = Random.MersenneTwister(rng_seed) # Output figure save directory homedir = pwd() println(homedir) @@ -39,11 +38,11 @@ function main() dynamics = 2 # Transient is 2 # Statistics integration length # This has to be less than 360 and 360 must be divisible by Ts_days - Ts_days = 30.0 # Integration length in days + Ts_days = 30.0 # Integration length in days (will be made non-dimensional later) # Stats type, which statistics to construct from the L96 system # 4 is a linear fit over a batch of length Ts_days - # 5 is the mean over a batch of length Ts_days stats_type = 5 + # 5 is the mean over batches of length Ts_days: ### @@ -52,7 +51,7 @@ function main() # Define the parameters that we want to learn F_true = 8.0 # Mean F A_true = 2.5 # Transient F amplitude - ω_true = 2.0 * π / (360.0 / τc) # Frequency of the transient F + ω_true = 2.0 * π / (360.0 / τc) # Frequency of the transient F (non-dim) if dynamics == 2 params_true = [F_true, A_true] @@ -99,6 +98,7 @@ function main() dt = 1 / 64.0 # Start of integration t_start = 800.0 + # We now rescale all the timings etc, to be in nondim-time # Data collection length if dynamics == 1 T = 2.0 @@ -117,10 +117,12 @@ function main() # Use CES to learn ω? ω_fixed = true - # Settings + # Note, the initial conditions are given by Fp, and the integration is run from this + # one can change the ICs of a simulation therefore by changing Fp # Constructs an LSettings structure, see GModel.jl for the descriptions lorenz_settings = GModel.LSettings(dynamics, stats_type, t_start, T, Ts, Tfit, Fp, N, dt, t_start + T, kmax, ω_fixed, ω_true) + println(lorenz_settings) lorenz_params = GModel.LParams(F_true, ω_true, A_true) ### @@ -140,38 +142,52 @@ function main() if var_prescribe == true n_samples = 100 yt = zeros(length(gt), n_samples) - noise_level = 0.05 - Γy = noise_level * convert(Array, Diagonal(gt)) + noise_sd = 0.25 + # Γy = noise_sd^2 * convert(Array, Diagonal(gt)) + Γy = noise_sd^2 * convert(Array, Diagonal(ones(length(gt)) * mean(gt))) μ = zeros(length(gt)) # Add noise for i in 1:n_samples yt[:, i] = gt .+ rand(MvNormal(μ, Γy)) end + println(Γy) + else println("Using truth values to compute covariance") - n_samples = 20 + n_samples = 100 nthreads = Threads.nthreads() yt = zeros(nthreads, length(gt), n_samples) Threads.@threads for i in 1:n_samples - lorenz_settings_local = GModel.LSettings( - dynamics, - stats_type, - t_start + T * (i - 1), - T, - Ts, - Tfit, - Fp, - N, - dt, - t_start + T * (i - 1) + T, - kmax, - ω_fixed, - ω_true, - ) + + #= + lorenz_settings_local = GModel.LSettings( + dynamics, + stats_type, + t_start + T * (i - 1), + T, + Ts, + Tfit, + Fp, + N, + dt, + t_start + T * (i - 1) + T, + kmax, + ω_fixed, + ω_true, + )=# + lorenz_settings_local = lorenz_settings tid = Threads.threadid() yt[tid, :, i] = GModel.run_G_ensemble(params_true, lorenz_settings_local) end yt = dropdims(sum(yt, dims = 1), dims = 1) + + # add a little extra variance for regularization here + noise_sd = 0.1 + Γy_diag = noise_sd .^ 2 * convert(Array, I(size(yt, 1))) + μ = zeros(size(yt, 1)) + # Add noise + yt += rand(MvNormal(μ, Γy_diag), size(yt, 2)) + # Variance of truth data #Γy = convert(Array, Diagonal(dropdims(mean((yt.-mean(yt,dims=1)).^2,dims=1),dims=1))) # Covariance of truth data @@ -181,9 +197,10 @@ function 
main() end + # Construct observation object truth = Observations.Observation(yt, Γy, data_names) - truth_sample = truth.mean + truth_sample = yt[:, end] ### ### Calibrate: Ensemble Kalman Inversion ### @@ -192,23 +209,36 @@ function main() lorenz_settings_G = lorenz_settings # initialize to truth settings # EKP parameters - N_ens = 20 # number of ensemble members - N_iter = 6 # number of EKI iterations + N_ens = 30 # number of ensemble members + N_iter = 20 # number of EKI iterations # initial parameters: N_params x N_ens - initial_params = EKP.construct_initial_ensemble(priors, N_ens; rng_seed = rng_seed) + initial_params = EKP.construct_initial_ensemble(rng, priors, N_ens) - ekiobj = EKP.EnsembleKalmanProcess(initial_params, truth_sample, truth.obs_noise_cov, EKP.Inversion()) + ekiobj = EKP.EnsembleKalmanProcess( + initial_params, + truth_sample, + truth.obs_noise_cov, + EKP.Inversion(), + scheduler = EKP.DataMisfitController(), + verbose = true, + ) # EKI iterations println("EKP inversion error:") err = zeros(N_iter) + final_iter = [N_iter] for i in 1:N_iter params_i = EKP.get_ϕ_final(priors, ekiobj) # the `ϕ` indicates that the `params_i` are in the constrained space g_ens = GModel.run_G_ensemble(params_i, lorenz_settings_G) - EKP.update_ensemble!(ekiobj, g_ens) + terminated = EKP.update_ensemble!(ekiobj, g_ens) + if !isnothing(terminated) + final_iter = i - 1 # final update was previous iteration + break + end err[i] = EKP.get_error(ekiobj)[end] #mean((params_true - mean(params_i,dims=2)).^2) println("Iteration: " * string(i) * ", Error: " * string(err[i])) end + N_iter = final_iter[1] #in case it terminated early # EKI results: Has the ensemble collapsed toward the truth? println("True parameters: ") @@ -237,6 +267,28 @@ function main() "truth_input_constrained", params_true, #constrained here, as these are in a physically constrained space (unlike the u inputs), ) + + # plots in constrained space + gr(dpi = 300, size = (400, 400)) + ϕ_init = EKP.get_ϕ(priors, ekiobj, 1) + p = plot( + title = "EKI iterations", + xlims = extrema(ϕ_init[1, :]), + xaxis = "F", + yaxis = "A", + ylims = extrema(ϕ_init[2, :]), + ) + for i in 1:N_iter + ϕ_i = EKP.get_ϕ(priors, ekiobj, i) + scatter!(p, ϕ_i[1, :], ϕ_i[2, :], color = :grey, label = false) + end + ϕ_fin = EKP.get_ϕ_final(priors, ekiobj) + scatter!(p, ϕ_fin[1, :], ϕ_fin[2, :], color = :magenta, label = false) + + vline!(p, [params_true[1]], linestyle = :dash, linecolor = :red, label = false) + hline!(p, [params_true[2]], linestyle = :dash, linecolor = :red, label = false) + savefig(p, joinpath(figure_save_directory, "calibration.png")) + savefig(p, joinpath(figure_save_directory, "calibration.pdf")) end main() diff --git a/examples/Lorenz/emulate_sample.jl b/examples/Lorenz/emulate_sample.jl index 48a304f6c..7ea16c67c 100644 --- a/examples/Lorenz/emulate_sample.jl +++ b/examples/Lorenz/emulate_sample.jl @@ -47,13 +47,23 @@ function main() #### CHOOSE YOUR CASE: mask = 2:7 - for case in cases[mask] - + # One day we can use good heuristics here + # Currently set so that learnt hyperparameters stays relatively far inside the prior. 
(use "verbose" => true in optimizer overrides to see this info) + prior_scalings = [[0.0, 0.0], [1e-2, 0.0], [1e-1, 0.0], [1e-2, 1e-2], [1e-2, 1e-1], [1e-2, 1e-2], [1e-2, 1e-2]] + for (case, scaling) in zip(cases[mask], prior_scalings[mask]) #case = cases[7] println("case: ", case) - - overrides = Dict("train_fraction" => 0.6, "verbose" => true) + max_iter = 6 # number of EKP iterations to use data from is at most this + overrides = Dict( + "verbose" => true, + "train_fraction" => 0.8, + "n_iteration" => 40, + "scheduler" => DataMisfitController(), + "prior_in_scale" => scaling[1], + "prior_out_scale" => scaling[2], + ) + # we do not want termination, as our priors have relatively little interpretation # Should be loaded: τc = 5.0 @@ -88,6 +98,7 @@ function main() truth_params_constrained = load(data_save_file)["truth_input_constrained"] #true parameters in constrained space truth_params = transform_constrained_to_unconstrained(priors, truth_params_constrained) Γy = ekiobj.obs_noise_cov + println(Γy) n_params = length(truth_params) # "input dim" output_dim = size(Γy, 1) @@ -120,6 +131,7 @@ function main() # do we want to assume that the outputs are decorrelated in the machine-learning problem? diagonalize_output = case ∈ ["RF-vector-svd-diag", "RF-vector-nosvd-diag"] ? true : false n_features = 300 + mlt = VectorRandomFeatureInterface( n_features, n_params, @@ -141,18 +153,21 @@ function main() println(norm_factor) # Get training points from the EKP iteration number in the second input term - N_iter = 6 # note we have 6 iterations stored + N_iter = min(max_iter, length(get_u(ekiobj)) - 1) # number of paired iterations taken from EKP + input_output_pairs = Utilities.get_training_points(ekiobj, N_iter - 1) # 1:N-1 + input_output_pairs_test = Utilities.get_training_points(ekiobj, N_iter:N_iter) # just "next" iteration used for testing # Save data @save joinpath(data_save_directory, "input_output_pairs.jld2") input_output_pairs - standardize = true + standardize = false retained_svd_frac = 1.0 normalized = true # do we want to use SVD to decorrelate outputs decorrelate = case ∈ ["RF-vector-nosvd-diag", "RF-vector-nosvd-nondiag"] ? false : true + emulator = Emulator( mlt, input_output_pairs; diff --git a/src/Emulator.jl b/src/Emulator.jl index 6f4d2572d..4eeb360d1 100644 --- a/src/Emulator.jl +++ b/src/Emulator.jl @@ -40,7 +40,7 @@ end function optimize_hyperparameters!(mlt) throw_define_mlt() end -function predict(mlt, new_inputs) +function predict(mlt, new_inputs; mlt_kwargs...) throw_define_mlt() end @@ -187,6 +187,7 @@ function predict( new_inputs::AbstractMatrix{FT}; transform_to_real = false, vector_rf_unstandardize = true, + mlt_kwargs... ) where {FT <: AbstractFloat} # Check if the size of new_inputs is consistent with the GP model's input # dimension. @@ -200,7 +201,7 @@ function predict( normalized_new_inputs = normalize(emulator, new_inputs) # [2.] predict. Note: ds = decorrelated, standard - ds_outputs, ds_output_var = predict(emulator.machine_learning_tool, normalized_new_inputs) + ds_outputs, ds_output_var = predict(emulator.machine_learning_tool, normalized_new_inputs, mlt_kwargs...) # [3.] 
transform back to real coordinates or remain in decorrelated coordinates if !isa(get_machine_learning_tool(emulator), VectorRandomFeatureInterface) diff --git a/src/MarkovChainMonteCarlo.jl b/src/MarkovChainMonteCarlo.jl index 2e376b32e..6faa986d0 100644 --- a/src/MarkovChainMonteCarlo.jl +++ b/src/MarkovChainMonteCarlo.jl @@ -170,9 +170,9 @@ function EmulatorPosteriorModel( #TODO vector_rf will always unstandardize, but other methods will not, so we require this additional flag. if isa(g_cov[1], Real) - return logpdf(MvNormal(obs_sample, g_cov[1] * I), vec(g)) + get_logpdf(prior, θ) + return logpdf(MvNormal(obs_sample, g_cov[1] * I), vec(g)) + logpdf(prior, θ) else - return logpdf(MvNormal(obs_sample, g_cov[1]), vec(g)) + get_logpdf(prior, θ) + return logpdf(MvNormal(obs_sample, g_cov[1]), vec(g)) + logpdf(prior, θ) end end, diff --git a/src/RandomFeature.jl b/src/RandomFeature.jl index 3465d2c0d..f72cbf2f3 100644 --- a/src/RandomFeature.jl +++ b/src/RandomFeature.jl @@ -8,6 +8,8 @@ using ..Utilities using StableRNGs using Random +export calculate_n_hyperparameters, build_default_prior + abstract type RandomFeatureInterface <: MachineLearningTool end abstract type MultithreadType end @@ -94,51 +96,89 @@ $(DocStringExtensions.TYPEDSIGNATURES) Builds a prior over the hyperparameters (i.e. the cholesky/diagaonal or individaul entries of the input/output covariances). For example, the case where the input covariance ``U = γ_1 * (LL^T + γ_2 I)``, we set priors for the entries of the lower triangular matrix ``L`` as normal, and constant scalings ``γ_i`` as log-normal to retain positivity. + +priors can be scaled by a constant factor by using the in_scale and out_scale keywords """ -function build_default_prior(input_dim::Int, output_dim::Int, diagonalize_input::Bool, diagonalize_output::Bool) +function build_default_prior( + input_dim::Int, + output_dim::Int, + diagonalize_input::Bool, + diagonalize_output::Bool; + in_scale = 1.0, + out_scale = 1.0, +) n_hp_in = n_hyperparameters_cov(input_dim, diagonalize_input) n_hp_out = n_hyperparameters_cov(output_dim, diagonalize_output) + # aspect ratio for mean:sd for positive parameters in constrained_gaussian() + # 10 seems to give a decent range, this isn't very tuneable so the alternative would be to use the basic constructor + pos_asp_ratio = 10 # if dim = 1 , positive constant # elseif diagonalized, positive on diagonal and positive scalings # else chol factor, and positive scalings if input_dim > 1 if diagonalize_input - param_diag = constrained_gaussian("input_prior_diagonal", 1.0, 1.0, 0.0, Inf, repeats = n_hp_in - 2) - param_scaling = constrained_gaussian("input_prior_scaling", 1.0, 1.0, 0.0, Inf, repeats = 2) + param_diag = + constrained_gaussian("input_prior_diagonal", 1.0, pos_asp_ratio, 0.0, Inf, repeats = n_hp_in - 2) + param_scaling = + constrained_gaussian("input_prior_scaling", in_scale, in_scale * pos_asp_ratio, 0.0, Inf, repeats = 2) input_prior = combine_distributions([param_diag, param_scaling]) else param_chol = constrained_gaussian("input_prior_cholesky", 0.0, 1.0, -Inf, Inf, repeats = n_hp_in - 2) - param_scaling = constrained_gaussian("input_prior_scaling", 1.0, 1.0, 0.0, Inf, repeats = 2) + param_scaling = + constrained_gaussian("input_prior_scaling", in_scale, in_scale * pos_asp_ratio, 0.0, Inf, repeats = 2) input_prior = combine_distributions([param_chol, param_scaling]) end else - input_prior = constrained_gaussian("input_prior", 1.0, 1.0, 0.0, Inf) + input_prior = constrained_gaussian("input_prior", in_scale, in_scale, 
0.0, Inf) end if output_dim > 1 if diagonalize_output - param_diag = constrained_gaussian("output_prior_diagonal", 1.0, 1.0, 0.0, Inf, repeats = n_hp_out - 2) - param_scaling = constrained_gaussian("output_prior_scaling", 1.0, 1.0, 0.0, Inf, repeats = 2) + param_diag = + constrained_gaussian("output_prior_diagonal", 1.0, pos_asp_ratio, 0.0, Inf, repeats = n_hp_out - 2) + param_scaling = constrained_gaussian( + "output_prior_scaling", + out_scale, + out_scale * pos_asp_ratio, + 0.0, + Inf, + repeats = 2, + ) output_prior = combine_distributions([param_diag, param_scaling]) else param_chol = constrained_gaussian("output_prior_cholesky", 0.0, 1.0, -Inf, Inf, repeats = n_hp_out - 2) - param_scaling = constrained_gaussian("output_prior_scaling", 1.0, 1.0, 0.0, Inf, repeats = 2) + param_scaling = constrained_gaussian( + "output_prior_scaling", + out_scale, + out_scale * pos_asp_ratio, + 0.0, + Inf, + repeats = 2, + ) output_prior = combine_distributions([param_chol, param_scaling]) end else - output_prior = constrained_gaussian("output_prior", 1.0, 1.0, 0.0, Inf) + output_prior = constrained_gaussian("output_prior", out_scale, 10 * out_scale, 0.0, Inf) end return combine_distributions([input_prior, output_prior]) end -function build_default_prior(input_dim::Int, diagonalize_input::Bool) +function build_default_prior(input_dim::Int, diagonalize_input::Bool, in_scale = 1.0) #scalar case consistent with vector case output_dim = 1 diagonalize_output = false - return build_default_prior(input_dim, output_dim, diagonalize_input, diagonalize_output) + out_scale = 1.0 + return build_default_prior( + input_dim, + output_dim, + diagonalize_input, + diagonalize_output, + in_scale = in_scale, + out_scale = out_scale, + ) end """ diff --git a/src/ScalarRandomFeature.jl b/src/ScalarRandomFeature.jl index dbc26d11b..626c7bed9 100644 --- a/src/ScalarRandomFeature.jl +++ b/src/ScalarRandomFeature.jl @@ -107,9 +107,11 @@ Constructs a `ScalarRandomFeatureInterface <: MachineLearningTool` interface for - `feature_decomposition = "cholesky"` - choice of how to store decompositions of random features, `cholesky` or `svd` available - `optimizer_options = nothing` - Dict of options to pass into EKI optimization of hyperparameters (defaults created in `ScalarRandomFeatureInterface` constructor): - "prior": the prior for the hyperparameter optimization + - "prior_in_scale": use this to tune the input prior scale - "n_ensemble": number of ensemble members - "n_iteration": number of eki iterations - - "cov_sample_multiplier": increase for more samples to estimate covariance matrix in optimization (default 10.0, minimum 1.0) + - "cov_sample_multiplier": increase for more samples to estimate covariance matrix in optimization (default 10.0, minimum 1.0) + - "scheduler": Learning rate Scheduler (a.k.a. EKP timestepper) Default: DataMisfitController - "tikhonov": tikhonov regularization parameter if >0 - "inflation": additive inflation ∈ [0,1] with 0 being no inflation - "train_fraction": e.g. 0.8 (default) means 80:20 train - test split @@ -129,15 +131,15 @@ function ScalarRandomFeatureInterface( rfms = Vector{RF.Methods.RandomFeatureMethod}(undef, 0) fitted_features = Vector{RF.Methods.Fit}(undef, 0) - - n_hp = calculate_n_hyperparameters(input_dim, diagonalize_input) - prior = build_default_prior(input_dim, diagonalize_input) + prior_in_scale = "prior_in_scale" ∈ keys(optimizer_options) ? 
optimizer_options["prior_in_scale"] : 1.0 + prior = build_default_prior(input_dim, diagonalize_input, prior_in_scale) # default optimizer settings optimizer_opts = Dict( "prior" => prior, #the hyperparameter_prior "n_ensemble" => max(ndims(prior) + 1, 10), #number of ensemble "n_iteration" => 5, # number of eki iterations + "scheduler" => EKP.DataMisfitController(), # Adaptive timestepping, "cov_sample_multiplier" => 10.0, # multiplier for samples to estimate covariance in optimization scheme "inflation" => 1e-4, # additive inflation ∈ [0,1] with 0 being no inflation "train_fraction" => 0.8, # 80:20 train - test split @@ -335,10 +337,20 @@ function build_models!( n_ensemble = optimizer_options["n_ensemble"] n_iteration = optimizer_options["n_iteration"] opt_verbose_flag = optimizer_options["verbose"] + scheduler = optimizer_options["scheduler"] + initial_params = construct_initial_ensemble(rng, prior, n_ensemble) min_complexity = log(det(regularization)) data = vcat(get_outputs(io_pairs_opt)[(n_train + 1):end], 0.0, min_complexity) - ekiobj = EKP.EnsembleKalmanProcess(initial_params, data, Γ, Inversion(), rng = rng, verbose = opt_verbose_flag) + ekiobj = EKP.EnsembleKalmanProcess( + initial_params, + data, + Γ, + Inversion(), + scheduler = scheduler, + rng = rng, + verbose = opt_verbose_flag, + ) err = zeros(n_iteration) # [4.] optimize with EKP @@ -362,10 +374,15 @@ function build_models!( ) inflation = optimizer_options["inflation"] if inflation > 0 - EKP.update_ensemble!(ekiobj, g_ens, additive_inflation = true, use_prior_cov = true, s = inflation) # small regularizing inflation + terminated = + EKP.update_ensemble!(ekiobj, g_ens, additive_inflation = true, use_prior_cov = true, s = inflation) # small regularizing inflation else - EKP.update_ensemble!(ekiobj, g_ens) # small regularizing inflation + terminated = EKP.update_ensemble!(ekiobj, g_ens) # small regularizing inflation end + if !isnothing(terminated) + break # if the timestep was terminated due to timestepping condition + end + err[i] = get_error(ekiobj)[end] #mean((params_true - mean(params_i,dims=2)).^2) end @@ -373,6 +390,26 @@ function build_models!( # [5.] extract optimal hyperparameters hp_optimal = get_ϕ_mean_final(prior, ekiobj)[:] + if opt_verbose_flag + names = get_name(prior) + hp_optimal_batch = [hp_optimal[b] for b in batch(prior)] + hp_optimal_range = + [(minimum(hp_optimal_batch[i]), maximum(hp_optimal_batch[i])) for i in 1:length(hp_optimal_batch)] #the min and max of the hparams + prior_conf_interval = [mean(prior) .- 3 * sqrt.(var(prior)), mean(prior) .+ 3 * sqrt.(var(prior))] + pci_constrained = [transform_unconstrained_to_constrained(prior, prior_conf_interval[i]) for i in 1:2] + pcic = [(pci_constrained[1][i], pci_constrained[2][i]) for i in 1:length(pci_constrained[1])] + pcic_batched = [pcic[b][1] for b in batch(prior)] + @info("EKI Optimization result:") + println( + display( + [ + "name" "number of hyperparameters" "optimized value range" "99% prior mass" + names length.(hp_optimal_batch) hp_optimal_range pcic_batched + ], + ), + ) + end + io_pairs_i = PairedDataContainer(input_values, reshape(output_values[i, :], 1, size(output_values, 2))) # Now, fit new RF model with the optimized hyperparameters rfm_i = RFM_from_hyperparameters( @@ -407,14 +444,16 @@ $(DocStringExtensions.TYPEDSIGNATURES) Prediction of data observation (not latent function) at new inputs (passed in as columns in a matrix). That is, we add the observational noise into predictions. 
""" -function predict(srfi::ScalarRandomFeatureInterface, new_inputs::MM) where {MM <: AbstractMatrix} +function predict( + srfi::ScalarRandomFeatureInterface, + new_inputs::MM; + multithread = "ensemble", +) where {MM <: AbstractMatrix} M = length(get_rfms(srfi)) N_samples = size(new_inputs, 2) # Predicts columns of inputs: input_dim × N_samples μ = zeros(M, N_samples) σ2 = zeros(M, N_samples) - optimizer_options = get_optimizer_options(srfi) - multithread = optimizer_options["multithread"] if multithread == "ensemble" tullio_threading = false elseif multithread == "tullio" diff --git a/src/VectorRandomFeature.jl b/src/VectorRandomFeature.jl index 20eb9f2de..126672c51 100644 --- a/src/VectorRandomFeature.jl +++ b/src/VectorRandomFeature.jl @@ -145,15 +145,17 @@ Constructs a `VectorRandomFeatureInterface <: MachineLearningTool` interface for - `rng = Random.GLOBAL_RNG` - random number generator - `feature_decomposition = "cholesky"` - choice of how to store decompositions of random features, `cholesky` or `svd` available - `optimizer_options = nothing` - Dict of options to pass into EKI optimization of hyperparameters (defaults created in `VectorRandomFeatureInterface` constructor): - - "prior": the prior for the hyperparameter optimization + - "prior": the prior for the hyperparameter optimization + - "prior_in_scale"/"prior_out_scale": use these to tune the input/output prior scale. - "n_ensemble": number of ensemble members - "n_iteration": number of eki iterations + - "scheduler": Learning rate Scheduler (a.k.a. EKP timestepper) Default: DataMisfitController - "cov_sample_multiplier": increase for more samples to estimate covariance matrix in optimization (default 10.0, minimum 1.0) - "tikhonov": tikhonov regularization parameter if > 0 - "inflation": additive inflation ∈ [0,1] with 0 being no inflation - "train_fraction": e.g. 0.8 (default) means 80:20 train - test split - "multithread": how to multithread. "ensemble" (default) threads across ensemble members "tullio" threads random feature matrix algebra - - "verbose" => false, verbose optimizer statements + - "verbose" => false, verbose optimizer statements to check convergence, priors and optimal parameters. """ function VectorRandomFeatureInterface( @@ -174,13 +176,22 @@ function VectorRandomFeatureInterface( regularization = Vector{Union{Matrix, UniformScaling, Nothing}}(undef, 0) #Optimization Defaults - n_hp = calculate_n_hyperparameters(input_dim, output_dim, diagonalize_input, diagonalize_output) - prior = build_default_prior(input_dim, output_dim, diagonalize_input, diagonalize_output) + prior_in_scale = "prior_in_scale" ∈ keys(optimizer_options) ? optimizer_options["prior_in_scale"] : 1.0 + prior_out_scale = "prior_out_scale" ∈ keys(optimizer_options) ? 
optimizer_options["prior_out_scale"] : 1.0 + prior = build_default_prior( + input_dim, + output_dim, + diagonalize_input, + diagonalize_output, + in_scale = prior_in_scale, + out_scale = prior_out_scale, + ) optimizer_opts = Dict( - "prior" => prior, #the hyperparameter_prior + "prior" => prior, #the hyperparameter_prior (note scalings have already been applied) "n_ensemble" => max(ndims(prior) + 1, 10), #number of ensemble "n_iteration" => 5, # number of eki iterations + "scheduler" => EKP.DataMisfitController(), # Adaptive timestepping "cov_sample_multiplier" => 10.0, # multiplier for samples to estimate covariance in optimization scheme "tikhonov" => 0, # tikhonov regularization parameter if >0 "inflation" => 1e-4, # additive inflation ∈ [0,1] with 0 being no inflation @@ -448,14 +459,22 @@ function build_models!( n_ensemble = optimizer_options["n_ensemble"] # minimal ensemble size n_hp, n_iteration = optimizer_options["n_iteration"] opt_verbose_flag = optimizer_options["verbose"] - + scheduler = optimizer_options["scheduler"] if !isposdef(Γ) Γ = posdef_correct(Γ) end initial_params = construct_initial_ensemble(rng, prior, n_ensemble) - ekiobj = EKP.EnsembleKalmanProcess(initial_params, data[:], Γ, Inversion(), rng = rng, verbose = opt_verbose_flag) + ekiobj = EKP.EnsembleKalmanProcess( + initial_params, + data[:], + Γ, + Inversion(), + scheduler = scheduler, + rng = rng, + verbose = opt_verbose_flag, + ) err = zeros(n_iteration) # [4.] optimize with EKP @@ -489,10 +508,15 @@ function build_models!( end inflation = optimizer_options["inflation"] if inflation > 0 - EKP.update_ensemble!(ekiobj, g_ens, additive_inflation = true, use_prior_cov = true, s = inflation) # small regularizing inflation + terminated = + EKP.update_ensemble!(ekiobj, g_ens, additive_inflation = true, use_prior_cov = true, s = inflation) # small regularizing inflation else - EKP.update_ensemble!(ekiobj, g_ens) # small regularizing inflation + terminated = EKP.update_ensemble!(ekiobj, g_ens) # small regularizing inflation + end + if !isnothing(terminated) + break # if the timestep was terminated due to timestepping condition end + err[i] = get_error(ekiobj)[end] #mean((params_true - mean(params_i,dims=2)).^2) end @@ -500,7 +524,25 @@ function build_models!( # [5.] extract optimal hyperparameters hp_optimal = get_ϕ_mean_final(prior, ekiobj)[:] # Now, fit new RF model with the optimized hyperparameters - + if opt_verbose_flag + names = get_name(prior) + hp_optimal_batch = [hp_optimal[b] for b in batch(prior)] + hp_optimal_range = + [(minimum(hp_optimal_batch[i]), maximum(hp_optimal_batch[i])) for i in 1:length(hp_optimal_batch)] #the min and max of the hparams + prior_conf_interval = [mean(prior) .- 3 * sqrt.(var(prior)), mean(prior) .+ 3 * sqrt.(var(prior))] + pci_constrained = [transform_unconstrained_to_constrained(prior, prior_conf_interval[i]) for i in 1:2] + pcic = [(pci_constrained[1][i], pci_constrained[2][i]) for i in 1:length(pci_constrained[1])] + pcic_batched = [pcic[b][1] for b in batch(prior)] + @info("EKI Optimization result:") + println( + display( + [ + "name" "number of hyperparameters" "optimized value range" "99% prior mass" + names length.(hp_optimal_batch) hp_optimal_range pcic_batched + ], + ), + ) + end rfm_tmp = RFM_from_hyperparameters( vrfi, rng,
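
Illustrative usage sketch (not part of the patch): the snippet below shows how the new `optimizer_options` keys introduced in this change set (the `"scheduler"` / `DataMisfitController` adaptive timestepping, the `"prior_in_scale"` / `"prior_out_scale"` rescaling of the default hyperparameter prior, and `"verbose"` optimizer reporting) are passed to a `VectorRandomFeatureInterface` and on to an `Emulator`. The constructors, option keys, and keyword names are taken from the diff above; the toy dimensions, training data `x_train`/`y_train`, noise covariance `Σ`, and the specific scale values are hypothetical placeholders.

using LinearAlgebra, Random
using CalibrateEmulateSample.Emulators
using EnsembleKalmanProcesses                  # provides DataMisfitController
using EnsembleKalmanProcesses.DataContainers   # provides PairedDataContainer

p, d, n = 2, 3, 40                             # input dim, output dim, number of training points (toy values)
x_train = randn(p, n)                          # training inputs, stored as columns
y_train = randn(d, n)                          # training outputs, stored as columns
Σ = 0.1 * Matrix(I, d, d)                      # observational noise covariance (toy value)
iopairs = PairedDataContainer(x_train, y_train, data_are_columns = true)

n_features = 200
optimizer_options = Dict(
    "n_iteration" => 20,                       # "max" EKI iterations (may terminate early)
    "scheduler" => DataMisfitController(),     # adaptive timestepping, enables early termination
    "prior_in_scale" => 1e-2,                  # rescale the default input-covariance prior
    "prior_out_scale" => 1e-2,                 # rescale the default output-covariance prior
    "verbose" => true,                         # print EKI errors and the prior/optimum summary table
)

# hyperparameters are optimized with EKI when the Emulator is built
vrfi = VectorRandomFeatureInterface(n_features, p, d, optimizer_options = optimizer_options)
emulator = Emulator(vrfi, iopairs, obs_noise_cov = Σ, normalize_inputs = true)

# predict now forwards ML-tool keyword arguments; returns predictive mean and (co)variance
μ, σ2 = Emulators.predict(emulator, randn(p, 5), transform_to_real = true)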
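
A second sketch, for the in-place Lorenz 96 timestepping introduced in GModel_common.jl: `f!` writes the tendency into its first argument and `RK4!` overwrites the state using a preallocated 4 × N stage buffer, avoiding the per-step allocations of the old `RK4`/`f` pair. It assumes `RK4!` and `f!` from GModel_common.jl are in scope (e.g. via `include`); the values of `N`, `F`, `dt`, the initial perturbation, and the step count are hypothetical.

# include("GModel_common.jl")        # brings RK4! and f! into scope (path assumed)
N, F, dt = 36, 8.0, 1 / 64.0         # longitude steps, forcing, timestep (toy values)
X = zeros(N); X[1] = 0.01            # small initial perturbation, analogous to Fp
buffer = zeros(4, N)                 # rows hold the RK4 stages k1..k4
for step in 1:1000
    RK4!(X, dt, N, F, buffer)        # advances X in place by one RK4 step via f!
end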