From f0a409191627029561847476b5093c97118a11ee Mon Sep 17 00:00:00 2001 From: odunbar Date: Mon, 5 Jun 2023 03:51:50 -0700 Subject: [PATCH] update to MIN ekp requirement of 1.1 add MT option to predict, and add schedulers in optimizer options, as well as early termination bugfix get_logpdf -> logpdf add scaling param for default prior remove unused function call allow scaling of prior New Lorenz example setup, compat with new timestepping learning compat with new scaling inc all cases aspect ratio and printing scalings for decent recovery julia format format --- Project.toml | 4 +- .../scalar_optimize_and_plot_RF.jl | 10 +- .../vector_optimize_and_plot_RF.jl | 27 ++++- examples/Lorenz/GModel_common.jl | 71 ++++++----- examples/Lorenz/calibrate.jl | 112 +++++++++++++----- examples/Lorenz/emulate_sample.jl | 27 ++++- src/Emulator.jl | 5 +- src/MarkovChainMonteCarlo.jl | 4 +- src/RandomFeature.jl | 62 ++++++++-- src/ScalarRandomFeature.jl | 59 +++++++-- src/VectorRandomFeature.jl | 62 ++++++++-- 11 files changed, 328 insertions(+), 115 deletions(-) diff --git a/Project.toml b/Project.toml index d462a684c..47cead28d 100644 --- a/Project.toml +++ b/Project.toml @@ -29,14 +29,14 @@ AbstractMCMC = "3.3, 4" AdvancedMH = "0.6, 0.7" Conda = "1.7" Distributions = "0.24, 0.25" +EnsembleKalmanProcesses = "1.1" DocStringExtensions = "0.8, 0.9" -EnsembleKalmanProcesses = "0.8, 0.9, 0.10, 0.11, 0.12, 0.13, 0.14, 1.0" GaussianProcesses = "0.12" MCMCChains = "4.14, 5, 6" PyCall = "1.93" +RandomFeatures = "0.3" ScikitLearn = "0.6, 0.7" StatsBase = "0.33, 0.34" -RandomFeatures = "0.3" julia = "1.6, 1.7, 1.8" [extras] diff --git a/examples/Emulator/RandomFeature/scalar_optimize_and_plot_RF.jl b/examples/Emulator/RandomFeature/scalar_optimize_and_plot_RF.jl index d202f0391..b379ff689 100644 --- a/examples/Emulator/RandomFeature/scalar_optimize_and_plot_RF.jl +++ b/examples/Emulator/RandomFeature/scalar_optimize_and_plot_RF.jl @@ -126,8 +126,14 @@ end # setup random features n_features = 180 - -srfi = ScalarRandomFeatureInterface(n_features, p, diagonalize_input = diagonalize_input) +optimizer_options = Dict("n_iteration" => 20, "prior_in_scale" => 0.1, "verbose" => true) #"Max" iterations (may do less) + +srfi = ScalarRandomFeatureInterface( + n_features, + p, + diagonalize_input = diagonalize_input, + optimizer_options = optimizer_options, +) emulator = Emulator(srfi, iopairs, obs_noise_cov = Σ, normalize_inputs = true) println("build RF with $n training points and $(n_features) random features.") diff --git a/examples/Emulator/RandomFeature/vector_optimize_and_plot_RF.jl b/examples/Emulator/RandomFeature/vector_optimize_and_plot_RF.jl index 05e4f348f..08d045c44 100644 --- a/examples/Emulator/RandomFeature/vector_optimize_and_plot_RF.jl +++ b/examples/Emulator/RandomFeature/vector_optimize_and_plot_RF.jl @@ -133,18 +133,37 @@ function main() # setup random features n_features = 200 + if case ∈ ["svd-diag", "svd-nondiag"] + optimizer_options = + Dict("n_iteration" => 20, "prior_in_scale" => 0.01, "prior_out_scale" => 1.0, "verbose" => true) #"Max" iterations (may do less) + else # without svd + optimizer_options = + Dict("n_iteration" => 20, "prior_in_scale" => 0.01, "prior_out_scale" => 0.01, "verbose" => true) #"Max" iterations (may do less) + end if case == "svd-diag" - vrfi = VectorRandomFeatureInterface(n_features, p, d, diagonalize_output = true) + vrfi = VectorRandomFeatureInterface( + n_features, + p, + d, + diagonalize_output = true, + optimizer_options = optimizer_options, + ) emulator = Emulator(vrfi, 
iopairs, obs_noise_cov = Σ, normalize_inputs = true) elseif case == "svd-nondiag" - vrfi = VectorRandomFeatureInterface(n_features, p, d) + vrfi = VectorRandomFeatureInterface(n_features, p, d, optimizer_options = optimizer_options) emulator = Emulator(vrfi, iopairs, obs_noise_cov = Σ, normalize_inputs = true) elseif case == "nosvd-diag" - vrfi = VectorRandomFeatureInterface(n_features, p, d, diagonalize_output = true) + vrfi = VectorRandomFeatureInterface( + n_features, + p, + d, + diagonalize_output = true, + optimizer_options = optimizer_options, + ) emulator = Emulator(vrfi, iopairs, obs_noise_cov = Σ, normalize_inputs = true, decorrelate = false) elseif case == "nosvd-nondiag" - vrfi = VectorRandomFeatureInterface(n_features, p, d) + vrfi = VectorRandomFeatureInterface(n_features, p, d, optimizer_options = optimizer_options) emulator = Emulator(vrfi, iopairs, obs_noise_cov = Σ, normalize_inputs = true, decorrelate = false) end println("build RF with $n training points and $(n_features) random features.") diff --git a/examples/Lorenz/GModel_common.jl b/examples/Lorenz/GModel_common.jl index 2f5993f98..d3a99de5c 100644 --- a/examples/Lorenz/GModel_common.jl +++ b/examples/Lorenz/GModel_common.jl @@ -128,22 +128,25 @@ end # tend: End of simulation Float64(1), nstep: function lorenz_solve(settings::LSettings, params::LParams) # Initialize - nstep = Int32(ceil(settings.tend / settings.dt)) + nstep = Int32(ceil((settings.tend - settings.t_start) / settings.dt)) xn = zeros(Float64, settings.N, nstep) t = zeros(Float64, nstep) # Initial perturbation X = fill(Float64(0.0), settings.N) X = X + settings.Fp + Xbuffer = zeros(4, settings.N) # March forward in time for j in 1:nstep - t[j] = settings.dt * j + t[j] = settings.t_start + settings.dt * j + #use view to update a slice + # RK4! 
modifies first and last arguments if settings.dynamics == 1 - X = RK4(X, settings.dt, settings.N, params.F) + RK4!(X, settings.dt, settings.N, params.F, Xbuffer) elseif settings.dynamics == 2 F_local = params.F + params.A * sin(params.ω * t[j]) - X = RK4(X, settings.dt, settings.N, F_local) + RK4!(X, settings.dt, settings.N, F_local, Xbuffer) end - xn[:, j] = X + xn[:, j] += X end # Output return xn, t @@ -151,33 +154,38 @@ function lorenz_solve(settings::LSettings, params::LParams) end # Lorenz 96 system -# f = dx/dt +# f = dx/dt overwriting first argument # Inputs: x: state, N: longitude steps, F: forcing -function f(x, N, F) - f = zeros(Float64, N) +function f!(xnew, x, N, F) + # Loop over N positions for i in 3:(N - 1) - f[i] = -x[i - 2] * x[i - 1] + x[i - 1] * x[i + 1] - x[i] + F + xnew[i] = -x[i - 2] * x[i - 1] + x[i - 1] * x[i + 1] - x[i] + F end # Periodic boundary conditions - f[1] = -x[N - 1] * x[N] + x[N] * x[2] - x[1] + F - f[2] = -x[N] * x[1] + x[1] * x[3] - x[2] + F - f[N] = -x[N - 2] * x[N - 1] + x[N - 1] * x[1] - x[N] + F + xnew[1] = -x[N - 1] * x[N] + x[N] * x[2] - x[1] + F + xnew[2] = -x[N] * x[1] + x[1] * x[3] - x[2] + F + xnew[N] = -x[N - 2] * x[N - 1] + x[N - 1] * x[1] - x[N] + F # Output - return f + return nothing end -# RK4 solve -function RK4(xold, dt, N, F) - # Predictor steps - k1 = f(xold, N, F) - k2 = f(xold + k1 * dt / 2.0, N, F) - k3 = f(xold + k2 * dt / 2.0, N, F) - k4 = f(xold + k3 * dt, N, F) +# RK4 solve, overwriting first argument +function RK4!(xold, dt, N, F, buffer) + #buffer is 4 x N zeros + @assert size(buffer) == (4, N) + + # Predictor steps updates the final argument + # must use view to update a slice in-place + f!(view(buffer, 1, :), xold, N, F) #k1 + f!(view(buffer, 2, :), xold + buffer[1, :] * dt / 2.0, N, F) #k2 + f!(view(buffer, 3, :), xold + buffer[2, :] * dt / 2.0, N, F) #k3 + f!(view(buffer, 4, :), xold + buffer[3, :] * dt, N, F) #k4 # Step - xnew = xold + (dt / 6.0) * (k1 + 2.0 * k2 + 2.0 * k3 + k4) - # Output - return xnew + #xnew = xold + (dt / 6.0) * (k1 + 2.0 * k2 + 2.0 * k3 + k4) + xold .+= (dt / 6.0) * (buffer[1, :] + 2.0 * buffer[2, :] + 2.0 * buffer[3, :] + buffer[4, :]) + + return nothing end function regression(X, y) @@ -216,7 +224,8 @@ end ############################### function stats(settings, xn, t) # Define averaging indices range - indices = findall(x -> (x > settings.t_start) && (x < settings.t_start + settings.T), t) + indices = findall(x -> (x > settings.t_start) && (x < settings.tend), t) + # Define statistics of interest if settings.stats_type == 1 # Mean # Average in time and over longitude @@ -317,25 +326,15 @@ function stats(settings, xn, t) var_slide = zeros(N) t_slide = zeros(nt, N) for i in 1:N - var_slide[i] = var(vcat(xn[:, indices[((i - 1) * nt + 1):(i * nt)]]...)) + var_slide[i] = var(xn[:, indices[((i - 1) * nt + 1):(i * nt)]]) t_slide[:, i] = t[((i - 1) * nt + 1):(i * nt)] end - # Polynomial fit over a batch of sliding windows - Tfit = Int64(settings.Tfit) + Tfit = Int64(settings.Tfit) # (a multiple of Ts) NT = Int64(N / Tfit) - a1 = zeros(NT) - a2 = zeros(NT) - t_start = zeros(NT) - y = zeros(Tfit, NT) - ty = zeros(Tfit, NT) ym = zeros(NT) for i in 1:NT ind_low = (i - 1) * Tfit + 1 ind_high = i * Tfit - t_start[i] = t_slide[1, ind_low] - ty[:, i] = t_slide[1, ind_low:ind_high] .- t_start[i] - a1[i], a2[i] = regression(ty[:, i], var_slide[ind_low:ind_high]) - y[:, i] = a1[i] .* ty[:, i] .+ a2[i] ym[i] = mean(var_slide[ind_low:ind_high]) end # Combine diff --git a/examples/Lorenz/calibrate.jl 
b/examples/Lorenz/calibrate.jl index b337a653d..a799899a0 100644 --- a/examples/Lorenz/calibrate.jl +++ b/examples/Lorenz/calibrate.jl @@ -18,8 +18,7 @@ const PD = EKP.ParameterDistributions function main() rng_seed = 4137 - Random.seed!(rng_seed) - + rng = Random.MersenneTwister(rng_seed) # Output figure save directory homedir = pwd() println(homedir) @@ -39,11 +38,11 @@ function main() dynamics = 2 # Transient is 2 # Statistics integration length # This has to be less than 360 and 360 must be divisible by Ts_days - Ts_days = 30.0 # Integration length in days + Ts_days = 30.0 # Integration length in days (will be made non-dimensional later) # Stats type, which statistics to construct from the L96 system # 4 is a linear fit over a batch of length Ts_days - # 5 is the mean over a batch of length Ts_days stats_type = 5 + # 5 is the mean over batches of length Ts_days: ### @@ -52,7 +51,7 @@ function main() # Define the parameters that we want to learn F_true = 8.0 # Mean F A_true = 2.5 # Transient F amplitude - ω_true = 2.0 * π / (360.0 / τc) # Frequency of the transient F + ω_true = 2.0 * π / (360.0 / τc) # Frequency of the transient F (non-dim) if dynamics == 2 params_true = [F_true, A_true] @@ -99,6 +98,7 @@ function main() dt = 1 / 64.0 # Start of integration t_start = 800.0 + # We now rescale all the timings etc, to be in nondim-time # Data collection length if dynamics == 1 T = 2.0 @@ -117,10 +117,12 @@ function main() # Use CES to learn ω? ω_fixed = true - # Settings + # Note, the initial conditions are given by Fp, and the integration is run from this + # one can change the ICs of a simulation therefore by changing Fp # Constructs an LSettings structure, see GModel.jl for the descriptions lorenz_settings = GModel.LSettings(dynamics, stats_type, t_start, T, Ts, Tfit, Fp, N, dt, t_start + T, kmax, ω_fixed, ω_true) + println(lorenz_settings) lorenz_params = GModel.LParams(F_true, ω_true, A_true) ### @@ -140,38 +142,52 @@ function main() if var_prescribe == true n_samples = 100 yt = zeros(length(gt), n_samples) - noise_level = 0.05 - Γy = noise_level * convert(Array, Diagonal(gt)) + noise_sd = 0.25 + # Γy = noise_sd^2 * convert(Array, Diagonal(gt)) + Γy = noise_sd^2 * convert(Array, Diagonal(ones(length(gt)) * mean(gt))) μ = zeros(length(gt)) # Add noise for i in 1:n_samples yt[:, i] = gt .+ rand(MvNormal(μ, Γy)) end + println(Γy) + else println("Using truth values to compute covariance") - n_samples = 20 + n_samples = 100 nthreads = Threads.nthreads() yt = zeros(nthreads, length(gt), n_samples) Threads.@threads for i in 1:n_samples - lorenz_settings_local = GModel.LSettings( - dynamics, - stats_type, - t_start + T * (i - 1), - T, - Ts, - Tfit, - Fp, - N, - dt, - t_start + T * (i - 1) + T, - kmax, - ω_fixed, - ω_true, - ) + + #= + lorenz_settings_local = GModel.LSettings( + dynamics, + stats_type, + t_start + T * (i - 1), + T, + Ts, + Tfit, + Fp, + N, + dt, + t_start + T * (i - 1) + T, + kmax, + ω_fixed, + ω_true, + )=# + lorenz_settings_local = lorenz_settings tid = Threads.threadid() yt[tid, :, i] = GModel.run_G_ensemble(params_true, lorenz_settings_local) end yt = dropdims(sum(yt, dims = 1), dims = 1) + + # add a little extra variance for regularization here + noise_sd = 0.1 + Γy_diag = noise_sd .^ 2 * convert(Array, I(size(yt, 1))) + μ = zeros(size(yt, 1)) + # Add noise + yt += rand(MvNormal(μ, Γy_diag), size(yt, 2)) + # Variance of truth data #Γy = convert(Array, Diagonal(dropdims(mean((yt.-mean(yt,dims=1)).^2,dims=1),dims=1))) # Covariance of truth data @@ -181,9 +197,10 @@ function 
main() end + # Construct observation object truth = Observations.Observation(yt, Γy, data_names) - truth_sample = truth.mean + truth_sample = yt[:, end] ### ### Calibrate: Ensemble Kalman Inversion ### @@ -192,23 +209,36 @@ function main() lorenz_settings_G = lorenz_settings # initialize to truth settings # EKP parameters - N_ens = 20 # number of ensemble members - N_iter = 6 # number of EKI iterations + N_ens = 30 # number of ensemble members + N_iter = 20 # number of EKI iterations # initial parameters: N_params x N_ens - initial_params = EKP.construct_initial_ensemble(priors, N_ens; rng_seed = rng_seed) + initial_params = EKP.construct_initial_ensemble(rng, priors, N_ens) - ekiobj = EKP.EnsembleKalmanProcess(initial_params, truth_sample, truth.obs_noise_cov, EKP.Inversion()) + ekiobj = EKP.EnsembleKalmanProcess( + initial_params, + truth_sample, + truth.obs_noise_cov, + EKP.Inversion(), + scheduler = EKP.DataMisfitController(), + verbose = true, + ) # EKI iterations println("EKP inversion error:") err = zeros(N_iter) + final_iter = [N_iter] for i in 1:N_iter params_i = EKP.get_ϕ_final(priors, ekiobj) # the `ϕ` indicates that the `params_i` are in the constrained space g_ens = GModel.run_G_ensemble(params_i, lorenz_settings_G) - EKP.update_ensemble!(ekiobj, g_ens) + terminated = EKP.update_ensemble!(ekiobj, g_ens) + if !isnothing(terminated) + final_iter = i - 1 # final update was previous iteration + break + end err[i] = EKP.get_error(ekiobj)[end] #mean((params_true - mean(params_i,dims=2)).^2) println("Iteration: " * string(i) * ", Error: " * string(err[i])) end + N_iter = final_iter[1] #in case it terminated early # EKI results: Has the ensemble collapsed toward the truth? println("True parameters: ") @@ -237,6 +267,28 @@ function main() "truth_input_constrained", params_true, #constrained here, as these are in a physically constrained space (unlike the u inputs), ) + + # plots in constrained space + gr(dpi = 300, size = (400, 400)) + ϕ_init = EKP.get_ϕ(priors, ekiobj, 1) + p = plot( + title = "EKI iterations", + xlims = extrema(ϕ_init[1, :]), + xaxis = "F", + yaxis = "A", + ylims = extrema(ϕ_init[2, :]), + ) + for i in 1:N_iter + ϕ_i = EKP.get_ϕ(priors, ekiobj, i) + scatter!(p, ϕ_i[1, :], ϕ_i[2, :], color = :grey, label = false) + end + ϕ_fin = EKP.get_ϕ_final(priors, ekiobj) + scatter!(p, ϕ_fin[1, :], ϕ_fin[2, :], color = :magenta, label = false) + + vline!(p, [params_true[1]], linestyle = :dash, linecolor = :red, label = false) + hline!(p, [params_true[2]], linestyle = :dash, linecolor = :red, label = false) + savefig(p, joinpath(figure_save_directory, "calibration.png")) + savefig(p, joinpath(figure_save_directory, "calibration.pdf")) end main() diff --git a/examples/Lorenz/emulate_sample.jl b/examples/Lorenz/emulate_sample.jl index 48a304f6c..7ea16c67c 100644 --- a/examples/Lorenz/emulate_sample.jl +++ b/examples/Lorenz/emulate_sample.jl @@ -47,13 +47,23 @@ function main() #### CHOOSE YOUR CASE: mask = 2:7 - for case in cases[mask] - + # One day we can use good heuristics here + # Currently set so that learnt hyperparameters stays relatively far inside the prior. 
(use "verbose" => true in optimizer overrides to see this info) + prior_scalings = [[0.0, 0.0], [1e-2, 0.0], [1e-1, 0.0], [1e-2, 1e-2], [1e-2, 1e-1], [1e-2, 1e-2], [1e-2, 1e-2]] + for (case, scaling) in zip(cases[mask], prior_scalings[mask]) #case = cases[7] println("case: ", case) - - overrides = Dict("train_fraction" => 0.6, "verbose" => true) + max_iter = 6 # number of EKP iterations to use data from is at most this + overrides = Dict( + "verbose" => true, + "train_fraction" => 0.8, + "n_iteration" => 40, + "scheduler" => DataMisfitController(), + "prior_in_scale" => scaling[1], + "prior_out_scale" => scaling[2], + ) + # we do not want termination, as our priors have relatively little interpretation # Should be loaded: τc = 5.0 @@ -88,6 +98,7 @@ function main() truth_params_constrained = load(data_save_file)["truth_input_constrained"] #true parameters in constrained space truth_params = transform_constrained_to_unconstrained(priors, truth_params_constrained) Γy = ekiobj.obs_noise_cov + println(Γy) n_params = length(truth_params) # "input dim" output_dim = size(Γy, 1) @@ -120,6 +131,7 @@ function main() # do we want to assume that the outputs are decorrelated in the machine-learning problem? diagonalize_output = case ∈ ["RF-vector-svd-diag", "RF-vector-nosvd-diag"] ? true : false n_features = 300 + mlt = VectorRandomFeatureInterface( n_features, n_params, @@ -141,18 +153,21 @@ function main() println(norm_factor) # Get training points from the EKP iteration number in the second input term - N_iter = 6 # note we have 6 iterations stored + N_iter = min(max_iter, length(get_u(ekiobj)) - 1) # number of paired iterations taken from EKP + input_output_pairs = Utilities.get_training_points(ekiobj, N_iter - 1) # 1:N-1 + input_output_pairs_test = Utilities.get_training_points(ekiobj, N_iter:N_iter) # just "next" iteration used for testing # Save data @save joinpath(data_save_directory, "input_output_pairs.jld2") input_output_pairs - standardize = true + standardize = false retained_svd_frac = 1.0 normalized = true # do we want to use SVD to decorrelate outputs decorrelate = case ∈ ["RF-vector-nosvd-diag", "RF-vector-nosvd-nondiag"] ? false : true + emulator = Emulator( mlt, input_output_pairs; diff --git a/src/Emulator.jl b/src/Emulator.jl index 6f4d2572d..4eeb360d1 100644 --- a/src/Emulator.jl +++ b/src/Emulator.jl @@ -40,7 +40,7 @@ end function optimize_hyperparameters!(mlt) throw_define_mlt() end -function predict(mlt, new_inputs) +function predict(mlt, new_inputs; mlt_kwargs...) throw_define_mlt() end @@ -187,6 +187,7 @@ function predict( new_inputs::AbstractMatrix{FT}; transform_to_real = false, vector_rf_unstandardize = true, + mlt_kwargs... ) where {FT <: AbstractFloat} # Check if the size of new_inputs is consistent with the GP model's input # dimension. @@ -200,7 +201,7 @@ function predict( normalized_new_inputs = normalize(emulator, new_inputs) # [2.] predict. Note: ds = decorrelated, standard - ds_outputs, ds_output_var = predict(emulator.machine_learning_tool, normalized_new_inputs) + ds_outputs, ds_output_var = predict(emulator.machine_learning_tool, normalized_new_inputs, mlt_kwargs...) # [3.] 
transform back to real coordinates or remain in decorrelated coordinates if !isa(get_machine_learning_tool(emulator), VectorRandomFeatureInterface) diff --git a/src/MarkovChainMonteCarlo.jl b/src/MarkovChainMonteCarlo.jl index 2e376b32e..6faa986d0 100644 --- a/src/MarkovChainMonteCarlo.jl +++ b/src/MarkovChainMonteCarlo.jl @@ -170,9 +170,9 @@ function EmulatorPosteriorModel( #TODO vector_rf will always unstandardize, but other methods will not, so we require this additional flag. if isa(g_cov[1], Real) - return logpdf(MvNormal(obs_sample, g_cov[1] * I), vec(g)) + get_logpdf(prior, θ) + return logpdf(MvNormal(obs_sample, g_cov[1] * I), vec(g)) + logpdf(prior, θ) else - return logpdf(MvNormal(obs_sample, g_cov[1]), vec(g)) + get_logpdf(prior, θ) + return logpdf(MvNormal(obs_sample, g_cov[1]), vec(g)) + logpdf(prior, θ) end end, diff --git a/src/RandomFeature.jl b/src/RandomFeature.jl index 3465d2c0d..f72cbf2f3 100644 --- a/src/RandomFeature.jl +++ b/src/RandomFeature.jl @@ -8,6 +8,8 @@ using ..Utilities using StableRNGs using Random +export calculate_n_hyperparameters, build_default_prior + abstract type RandomFeatureInterface <: MachineLearningTool end abstract type MultithreadType end @@ -94,51 +96,89 @@ $(DocStringExtensions.TYPEDSIGNATURES) Builds a prior over the hyperparameters (i.e. the cholesky/diagaonal or individaul entries of the input/output covariances). For example, the case where the input covariance ``U = γ_1 * (LL^T + γ_2 I)``, we set priors for the entries of the lower triangular matrix ``L`` as normal, and constant scalings ``γ_i`` as log-normal to retain positivity. + +priors can be scaled by a constant factor by using the in_scale and out_scale keywords """ -function build_default_prior(input_dim::Int, output_dim::Int, diagonalize_input::Bool, diagonalize_output::Bool) +function build_default_prior( + input_dim::Int, + output_dim::Int, + diagonalize_input::Bool, + diagonalize_output::Bool; + in_scale = 1.0, + out_scale = 1.0, +) n_hp_in = n_hyperparameters_cov(input_dim, diagonalize_input) n_hp_out = n_hyperparameters_cov(output_dim, diagonalize_output) + # aspect ratio for mean:sd for positive parameters in constrained_gaussian() + # 10 seems to give a decent range, this isn't very tuneable so the alternative would be to use the basic constructor + pos_asp_ratio = 10 # if dim = 1 , positive constant # elseif diagonalized, positive on diagonal and positive scalings # else chol factor, and positive scalings if input_dim > 1 if diagonalize_input - param_diag = constrained_gaussian("input_prior_diagonal", 1.0, 1.0, 0.0, Inf, repeats = n_hp_in - 2) - param_scaling = constrained_gaussian("input_prior_scaling", 1.0, 1.0, 0.0, Inf, repeats = 2) + param_diag = + constrained_gaussian("input_prior_diagonal", 1.0, pos_asp_ratio, 0.0, Inf, repeats = n_hp_in - 2) + param_scaling = + constrained_gaussian("input_prior_scaling", in_scale, in_scale * pos_asp_ratio, 0.0, Inf, repeats = 2) input_prior = combine_distributions([param_diag, param_scaling]) else param_chol = constrained_gaussian("input_prior_cholesky", 0.0, 1.0, -Inf, Inf, repeats = n_hp_in - 2) - param_scaling = constrained_gaussian("input_prior_scaling", 1.0, 1.0, 0.0, Inf, repeats = 2) + param_scaling = + constrained_gaussian("input_prior_scaling", in_scale, in_scale * pos_asp_ratio, 0.0, Inf, repeats = 2) input_prior = combine_distributions([param_chol, param_scaling]) end else - input_prior = constrained_gaussian("input_prior", 1.0, 1.0, 0.0, Inf) + input_prior = constrained_gaussian("input_prior", in_scale, in_scale, 
0.0, Inf) end if output_dim > 1 if diagonalize_output - param_diag = constrained_gaussian("output_prior_diagonal", 1.0, 1.0, 0.0, Inf, repeats = n_hp_out - 2) - param_scaling = constrained_gaussian("output_prior_scaling", 1.0, 1.0, 0.0, Inf, repeats = 2) + param_diag = + constrained_gaussian("output_prior_diagonal", 1.0, pos_asp_ratio, 0.0, Inf, repeats = n_hp_out - 2) + param_scaling = constrained_gaussian( + "output_prior_scaling", + out_scale, + out_scale * pos_asp_ratio, + 0.0, + Inf, + repeats = 2, + ) output_prior = combine_distributions([param_diag, param_scaling]) else param_chol = constrained_gaussian("output_prior_cholesky", 0.0, 1.0, -Inf, Inf, repeats = n_hp_out - 2) - param_scaling = constrained_gaussian("output_prior_scaling", 1.0, 1.0, 0.0, Inf, repeats = 2) + param_scaling = constrained_gaussian( + "output_prior_scaling", + out_scale, + out_scale * pos_asp_ratio, + 0.0, + Inf, + repeats = 2, + ) output_prior = combine_distributions([param_chol, param_scaling]) end else - output_prior = constrained_gaussian("output_prior", 1.0, 1.0, 0.0, Inf) + output_prior = constrained_gaussian("output_prior", out_scale, 10 * out_scale, 0.0, Inf) end return combine_distributions([input_prior, output_prior]) end -function build_default_prior(input_dim::Int, diagonalize_input::Bool) +function build_default_prior(input_dim::Int, diagonalize_input::Bool, in_scale = 1.0) #scalar case consistent with vector case output_dim = 1 diagonalize_output = false - return build_default_prior(input_dim, output_dim, diagonalize_input, diagonalize_output) + out_scale = 1.0 + return build_default_prior( + input_dim, + output_dim, + diagonalize_input, + diagonalize_output, + in_scale = in_scale, + out_scale = out_scale, + ) end """ diff --git a/src/ScalarRandomFeature.jl b/src/ScalarRandomFeature.jl index dbc26d11b..626c7bed9 100644 --- a/src/ScalarRandomFeature.jl +++ b/src/ScalarRandomFeature.jl @@ -107,9 +107,11 @@ Constructs a `ScalarRandomFeatureInterface <: MachineLearningTool` interface for - `feature_decomposition = "cholesky"` - choice of how to store decompositions of random features, `cholesky` or `svd` available - `optimizer_options = nothing` - Dict of options to pass into EKI optimization of hyperparameters (defaults created in `ScalarRandomFeatureInterface` constructor): - "prior": the prior for the hyperparameter optimization + - "prior_in_scale": use this to tune the input prior scale - "n_ensemble": number of ensemble members - "n_iteration": number of eki iterations - - "cov_sample_multiplier": increase for more samples to estimate covariance matrix in optimization (default 10.0, minimum 1.0) + - "cov_sample_multiplier": increase for more samples to estimate covariance matrix in optimization (default 10.0, minimum 1.0) + - "scheduler": Learning rate Scheduler (a.k.a. EKP timestepper) Default: DataMisfitController - "tikhonov": tikhonov regularization parameter if >0 - "inflation": additive inflation ∈ [0,1] with 0 being no inflation - "train_fraction": e.g. 0.8 (default) means 80:20 train - test split @@ -129,15 +131,15 @@ function ScalarRandomFeatureInterface( rfms = Vector{RF.Methods.RandomFeatureMethod}(undef, 0) fitted_features = Vector{RF.Methods.Fit}(undef, 0) - - n_hp = calculate_n_hyperparameters(input_dim, diagonalize_input) - prior = build_default_prior(input_dim, diagonalize_input) + prior_in_scale = "prior_in_scale" ∈ keys(optimizer_options) ? 
optimizer_options["prior_in_scale"] : 1.0 + prior = build_default_prior(input_dim, diagonalize_input, prior_in_scale) # default optimizer settings optimizer_opts = Dict( "prior" => prior, #the hyperparameter_prior "n_ensemble" => max(ndims(prior) + 1, 10), #number of ensemble "n_iteration" => 5, # number of eki iterations + "scheduler" => EKP.DataMisfitController(), # Adaptive timestepping, "cov_sample_multiplier" => 10.0, # multiplier for samples to estimate covariance in optimization scheme "inflation" => 1e-4, # additive inflation ∈ [0,1] with 0 being no inflation "train_fraction" => 0.8, # 80:20 train - test split @@ -335,10 +337,20 @@ function build_models!( n_ensemble = optimizer_options["n_ensemble"] n_iteration = optimizer_options["n_iteration"] opt_verbose_flag = optimizer_options["verbose"] + scheduler = optimizer_options["scheduler"] + initial_params = construct_initial_ensemble(rng, prior, n_ensemble) min_complexity = log(det(regularization)) data = vcat(get_outputs(io_pairs_opt)[(n_train + 1):end], 0.0, min_complexity) - ekiobj = EKP.EnsembleKalmanProcess(initial_params, data, Γ, Inversion(), rng = rng, verbose = opt_verbose_flag) + ekiobj = EKP.EnsembleKalmanProcess( + initial_params, + data, + Γ, + Inversion(), + scheduler = scheduler, + rng = rng, + verbose = opt_verbose_flag, + ) err = zeros(n_iteration) # [4.] optimize with EKP @@ -362,10 +374,15 @@ function build_models!( ) inflation = optimizer_options["inflation"] if inflation > 0 - EKP.update_ensemble!(ekiobj, g_ens, additive_inflation = true, use_prior_cov = true, s = inflation) # small regularizing inflation + terminated = + EKP.update_ensemble!(ekiobj, g_ens, additive_inflation = true, use_prior_cov = true, s = inflation) # small regularizing inflation else - EKP.update_ensemble!(ekiobj, g_ens) # small regularizing inflation + terminated = EKP.update_ensemble!(ekiobj, g_ens) # small regularizing inflation end + if !isnothing(terminated) + break # if the timestep was terminated due to timestepping condition + end + err[i] = get_error(ekiobj)[end] #mean((params_true - mean(params_i,dims=2)).^2) end @@ -373,6 +390,26 @@ function build_models!( # [5.] extract optimal hyperparameters hp_optimal = get_ϕ_mean_final(prior, ekiobj)[:] + if opt_verbose_flag + names = get_name(prior) + hp_optimal_batch = [hp_optimal[b] for b in batch(prior)] + hp_optimal_range = + [(minimum(hp_optimal_batch[i]), maximum(hp_optimal_batch[i])) for i in 1:length(hp_optimal_batch)] #the min and max of the hparams + prior_conf_interval = [mean(prior) .- 3 * sqrt.(var(prior)), mean(prior) .+ 3 * sqrt.(var(prior))] + pci_constrained = [transform_unconstrained_to_constrained(prior, prior_conf_interval[i]) for i in 1:2] + pcic = [(pci_constrained[1][i], pci_constrained[2][i]) for i in 1:length(pci_constrained[1])] + pcic_batched = [pcic[b][1] for b in batch(prior)] + @info("EKI Optimization result:") + println( + display( + [ + "name" "number of hyperparameters" "optimized value range" "99% prior mass" + names length.(hp_optimal_batch) hp_optimal_range pcic_batched + ], + ), + ) + end + io_pairs_i = PairedDataContainer(input_values, reshape(output_values[i, :], 1, size(output_values, 2))) # Now, fit new RF model with the optimized hyperparameters rfm_i = RFM_from_hyperparameters( @@ -407,14 +444,16 @@ $(DocStringExtensions.TYPEDSIGNATURES) Prediction of data observation (not latent function) at new inputs (passed in as columns in a matrix). That is, we add the observational noise into predictions. 
""" -function predict(srfi::ScalarRandomFeatureInterface, new_inputs::MM) where {MM <: AbstractMatrix} +function predict( + srfi::ScalarRandomFeatureInterface, + new_inputs::MM; + multithread = "ensemble", +) where {MM <: AbstractMatrix} M = length(get_rfms(srfi)) N_samples = size(new_inputs, 2) # Predicts columns of inputs: input_dim × N_samples μ = zeros(M, N_samples) σ2 = zeros(M, N_samples) - optimizer_options = get_optimizer_options(srfi) - multithread = optimizer_options["multithread"] if multithread == "ensemble" tullio_threading = false elseif multithread == "tullio" diff --git a/src/VectorRandomFeature.jl b/src/VectorRandomFeature.jl index 20eb9f2de..126672c51 100644 --- a/src/VectorRandomFeature.jl +++ b/src/VectorRandomFeature.jl @@ -145,15 +145,17 @@ Constructs a `VectorRandomFeatureInterface <: MachineLearningTool` interface for - `rng = Random.GLOBAL_RNG` - random number generator - `feature_decomposition = "cholesky"` - choice of how to store decompositions of random features, `cholesky` or `svd` available - `optimizer_options = nothing` - Dict of options to pass into EKI optimization of hyperparameters (defaults created in `VectorRandomFeatureInterface` constructor): - - "prior": the prior for the hyperparameter optimization + - "prior": the prior for the hyperparameter optimization + - "prior_in_scale"/"prior_out_scale": use these to tune the input/output prior scale. - "n_ensemble": number of ensemble members - "n_iteration": number of eki iterations + - "scheduler": Learning rate Scheduler (a.k.a. EKP timestepper) Default: DataMisfitController - "cov_sample_multiplier": increase for more samples to estimate covariance matrix in optimization (default 10.0, minimum 1.0) - "tikhonov": tikhonov regularization parameter if > 0 - "inflation": additive inflation ∈ [0,1] with 0 being no inflation - "train_fraction": e.g. 0.8 (default) means 80:20 train - test split - "multithread": how to multithread. "ensemble" (default) threads across ensemble members "tullio" threads random feature matrix algebra - - "verbose" => false, verbose optimizer statements + - "verbose" => false, verbose optimizer statements to check convergence, priors and optimal parameters. """ function VectorRandomFeatureInterface( @@ -174,13 +176,22 @@ function VectorRandomFeatureInterface( regularization = Vector{Union{Matrix, UniformScaling, Nothing}}(undef, 0) #Optimization Defaults - n_hp = calculate_n_hyperparameters(input_dim, output_dim, diagonalize_input, diagonalize_output) - prior = build_default_prior(input_dim, output_dim, diagonalize_input, diagonalize_output) + prior_in_scale = "prior_in_scale" ∈ keys(optimizer_options) ? optimizer_options["prior_in_scale"] : 1.0 + prior_out_scale = "prior_out_scale" ∈ keys(optimizer_options) ? 
optimizer_options["prior_out_scale"] : 1.0 + prior = build_default_prior( + input_dim, + output_dim, + diagonalize_input, + diagonalize_output, + in_scale = prior_in_scale, + out_scale = prior_out_scale, + ) optimizer_opts = Dict( - "prior" => prior, #the hyperparameter_prior + "prior" => prior, #the hyperparameter_prior (note scalings have already been applied) "n_ensemble" => max(ndims(prior) + 1, 10), #number of ensemble "n_iteration" => 5, # number of eki iterations + "scheduler" => EKP.DataMisfitController(), # Adaptive timestepping "cov_sample_multiplier" => 10.0, # multiplier for samples to estimate covariance in optimization scheme "tikhonov" => 0, # tikhonov regularization parameter if >0 "inflation" => 1e-4, # additive inflation ∈ [0,1] with 0 being no inflation @@ -448,14 +459,22 @@ function build_models!( n_ensemble = optimizer_options["n_ensemble"] # minimal ensemble size n_hp, n_iteration = optimizer_options["n_iteration"] opt_verbose_flag = optimizer_options["verbose"] - + scheduler = optimizer_options["scheduler"] if !isposdef(Γ) Γ = posdef_correct(Γ) end initial_params = construct_initial_ensemble(rng, prior, n_ensemble) - ekiobj = EKP.EnsembleKalmanProcess(initial_params, data[:], Γ, Inversion(), rng = rng, verbose = opt_verbose_flag) + ekiobj = EKP.EnsembleKalmanProcess( + initial_params, + data[:], + Γ, + Inversion(), + scheduler = scheduler, + rng = rng, + verbose = opt_verbose_flag, + ) err = zeros(n_iteration) # [4.] optimize with EKP @@ -489,10 +508,15 @@ function build_models!( end inflation = optimizer_options["inflation"] if inflation > 0 - EKP.update_ensemble!(ekiobj, g_ens, additive_inflation = true, use_prior_cov = true, s = inflation) # small regularizing inflation + terminated = + EKP.update_ensemble!(ekiobj, g_ens, additive_inflation = true, use_prior_cov = true, s = inflation) # small regularizing inflation else - EKP.update_ensemble!(ekiobj, g_ens) # small regularizing inflation + terminated = EKP.update_ensemble!(ekiobj, g_ens) # small regularizing inflation + end + if !isnothing(terminated) + break # if the timestep was terminated due to timestepping condition end + err[i] = get_error(ekiobj)[end] #mean((params_true - mean(params_i,dims=2)).^2) end @@ -500,7 +524,25 @@ function build_models!( # [5.] extract optimal hyperparameters hp_optimal = get_ϕ_mean_final(prior, ekiobj)[:] # Now, fit new RF model with the optimized hyperparameters - + if opt_verbose_flag + names = get_name(prior) + hp_optimal_batch = [hp_optimal[b] for b in batch(prior)] + hp_optimal_range = + [(minimum(hp_optimal_batch[i]), maximum(hp_optimal_batch[i])) for i in 1:length(hp_optimal_batch)] #the min and max of the hparams + prior_conf_interval = [mean(prior) .- 3 * sqrt.(var(prior)), mean(prior) .+ 3 * sqrt.(var(prior))] + pci_constrained = [transform_unconstrained_to_constrained(prior, prior_conf_interval[i]) for i in 1:2] + pcic = [(pci_constrained[1][i], pci_constrained[2][i]) for i in 1:length(pci_constrained[1])] + pcic_batched = [pcic[b][1] for b in batch(prior)] + @info("EKI Optimization result:") + println( + display( + [ + "name" "number of hyperparameters" "optimized value range" "99% prior mass" + names length.(hp_optimal_batch) hp_optimal_range pcic_batched + ], + ), + ) + end rfm_tmp = RFM_from_hyperparameters( vrfi, rng,
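
Illustrative usage sketch (not part of the patch): the snippet below shows how the new `optimizer_options` keys introduced in this change set (the `"scheduler"` / `DataMisfitController` adaptive timestepping, the `"prior_in_scale"` / `"prior_out_scale"` rescaling of the default hyperparameter prior, and `"verbose"` optimizer reporting) are passed to a `VectorRandomFeatureInterface` and on to an `Emulator`. The constructors, option keys, and keyword names are taken from the diff above; the toy dimensions, training data `x_train`/`y_train`, noise covariance `Σ`, and the specific scale values are hypothetical placeholders.

using LinearAlgebra, Random
using CalibrateEmulateSample.Emulators
using EnsembleKalmanProcesses                  # provides DataMisfitController
using EnsembleKalmanProcesses.DataContainers   # provides PairedDataContainer

p, d, n = 2, 3, 40                             # input dim, output dim, number of training points (toy values)
x_train = randn(p, n)                          # training inputs, stored as columns
y_train = randn(d, n)                          # training outputs, stored as columns
Σ = 0.1 * Matrix(I, d, d)                      # observational noise covariance (toy value)
iopairs = PairedDataContainer(x_train, y_train, data_are_columns = true)

n_features = 200
optimizer_options = Dict(
    "n_iteration" => 20,                       # "max" EKI iterations (may terminate early)
    "scheduler" => DataMisfitController(),     # adaptive timestepping, enables early termination
    "prior_in_scale" => 1e-2,                  # rescale the default input-covariance prior
    "prior_out_scale" => 1e-2,                 # rescale the default output-covariance prior
    "verbose" => true,                         # print EKI errors and the prior/optimum summary table
)

# hyperparameters are optimized with EKI when the Emulator is built
vrfi = VectorRandomFeatureInterface(n_features, p, d, optimizer_options = optimizer_options)
emulator = Emulator(vrfi, iopairs, obs_noise_cov = Σ, normalize_inputs = true)

# predict now forwards ML-tool keyword arguments; returns predictive mean and (co)variance
μ, σ2 = Emulators.predict(emulator, randn(p, 5), transform_to_real = true)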
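
A second sketch, for the in-place Lorenz 96 timestepping introduced in GModel_common.jl: `f!` writes the tendency into its first argument and `RK4!` overwrites the state using a preallocated 4 × N stage buffer, avoiding the per-step allocations of the old `RK4`/`f` pair. It assumes `RK4!` and `f!` from GModel_common.jl are in scope (e.g. via `include`); the values of `N`, `F`, `dt`, the initial perturbation, and the step count are hypothetical.

# include("GModel_common.jl")        # brings RK4! and f! into scope (path assumed)
N, F, dt = 36, 8.0, 1 / 64.0         # longitude steps, forcing, timestep (toy values)
X = zeros(N); X[1] = 0.01            # small initial perturbation, analogous to Fp
buffer = zeros(4, N)                 # rows hold the RK4 stages k1..k4
for step in 1:1000
    RK4!(X, dt, N, F, buffer)        # advances X in place by one RK4 step via f!
end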