Merge #111

111: Regularization & standardization r=mhowlan3 a=mhowlan3 1. Log-posterior calculation in MCMC 2. SVD regularization via truncation 3. Re-normalization (standardization) of output space Co-authored-by: mhowlan3 <mike.howland13@gmail.com>
CliMA · Apr 16, 2021 · 8723d4a · 8723d4a
2 parents 871b166 + abc2dec
commit 8723d4a
Show file tree

Hide file tree

Showing 12 changed files with 375 additions and 436 deletions.
diff --git a/.gitignore b/.gitignore
@@ -14,3 +14,5 @@ docs/site/
 *.jl.mem
 *.vscode*
 *.DS_Store
+*.jld2
+output*
diff --git a/docs/make.jl b/docs/make.jl
@@ -12,11 +12,13 @@ api = ["CalibrateEmulateSample" => [
 				 "Utilities" => "API/Utilities.md"
          ],
 ]
-
+
+examples = ["Lorenz example" => "examples/lorenz_example.md"]
 
 pages = [
     "Home" => "index.md",
     "Installation instructions" => "installation_instructions.md",
+    "Examples" => examples,
     "API" => api,
 ]
 

diff --git a/docs/src/examples/lorenz_example.md b/docs/src/examples/lorenz_example.md
@@ -0,0 +1,26 @@
+# Lorenz 96 example
+
+We provide the following template for how the tools may be applied.
+
+For small examples typically have 2 files.
+
+- `GModel.jl` Contains the forward map. The inputs should be the so-called free parameters we are interested in learning, and the output should be the measured data
+- The example script which contains the inverse problem setup and solve
+
+## The structure of the example script
+First we create the data and the setting for the model
+1. Set up the forward model.
+2. Construct/load the truth data. Store this data conveniently in the `Observations.Obs` object
+
+Then we set up the inverse problem
+3. Define the prior distributions. Use the `ParameterDistribution` object
+4. Decide on which `process` tool you would like to use (we recommend you begin with `Invesion()`). Then initialize this with the relevant constructor
+5. initialize the `EnsembleKalmanProcess` object
+
+Then we solve the inverse problem, in a loop perform the following for as many iterations as required:
+7. Obtain the current parameter ensemble
+8. Transform them from the unbounded computational space to the physical space
+9. call the forward map on the ensemble of parameters, producing an ensemble of measured data
+10. call the `update_ensemble!` function to generate a new parameter ensemble based on the new data
+
+One can then obtain the solution, dependent on the `process` type.
diff --git a/examples/Lorenz/GModel.jl b/examples/Lorenz/GModel.jl
@@ -5,7 +5,8 @@ using DocStringExtensions
 using Random
 using Distributions
 using LinearAlgebra
-using DifferentialEquations
+#using DifferentialEquations
+#using OrdinaryDiffEq
 using FFTW
 
 export run_G

diff --git a/examples/Lorenz/Lorenz_example.jl b/examples/Lorenz/Lorenz_example.jl
@@ -23,9 +23,27 @@ using CalibrateEmulateSample.ParameterDistributionStorage
 using CalibrateEmulateSample.DataStorage
 using CalibrateEmulateSample.Observations
 
-rng_seed = 4137
+#rng_seed = 4137
+#rng_seed = 413798
+rng_seed = 2413798
 Random.seed!(rng_seed)
 
+
+function get_standardizing_factors(data::Array{FT,2}) where {FT}
+    # Input: data size: N_data x N_ensembles
+    # Ensemble median of the data
+    norm_factor = median(data,dims=2)
+    return norm_factor
+end
+
+function get_standardizing_factors(data::Array{FT,1}) where {FT}
+    # Input: data size: N_data*N_ensembles (splatted)
+    # Ensemble median of the data
+    norm_factor = median(data)
+    return norm_factor
+end
+
+
 # Output figure save directory
 example_directory = @__DIR__
 println(example_directory)
@@ -45,7 +63,7 @@ end
 dynamics = 2 # Transient is 2
 # Statistics integration length
 # This has to be less than 360 and 360 must be divisible by Ts_days
-Ts_days = 90. # Integration length in days
+Ts_days = 30. # Integration length in days
 # Stats type, which statistics to construct from the L96 system
 # 4 is a linear fit over a batch of length Ts_days
 # 5 is the mean over a batch of length Ts_days
@@ -89,8 +107,9 @@ end
 #logmean_A, logstd_A = logmean_and_logstd(A_true, 0.2*A_true)
 
 if dynamics == 2
-	prior_means = [F_true+1.0, A_true+0.5]
-	prior_stds = [2.0, 0.5*A_true]
+	#prior_means = [F_true+0.5, A_true+0.5]
+	prior_means = [F_true, A_true]
+	prior_stds = [3.0, 0.5*A_true]
 	d1 = Parameterized(Normal(prior_means[1], prior_stds[1]))
 	d2 = Parameterized(Normal(prior_means[2], prior_stds[2]))
 	prior_distns = [d1, d2]
@@ -124,11 +143,12 @@ dt = 1/64.
 # Start of integration
 t_start = 800.
 # Data collection length
-if dynamics==1
-    T = 2.
-else
-    T = 360. / τc
-end
+#if dynamics==1
+#    T = 2.
+#else
+#    T = 360. / τc
+#end
+T = 360. / τc
 # Batch length
 Ts = 5. / τc # Nondimensionalize by L96 timescale
 # Integration length
@@ -162,6 +182,7 @@ lorenz_params = GModel.LParams(F_true, ω_true, A_true)
 gt = dropdims(GModel.run_G_ensemble(params_true, lorenz_settings), dims=2)
 
 # Compute internal variability covariance
+n_samples = 50 
 if var_prescribe==true
     n_samples = 100
     yt = zeros(length(gt),n_samples)
@@ -174,7 +195,6 @@ if var_prescribe==true
     end
 else
     println("Using truth values to compute covariance")
-    n_samples = 20 
     yt = zeros(length(gt), n_samples)
     for i in 1:n_samples
 	    lorenz_settings_local = GModel.LSettings(dynamics, stats_type, t_start+T*(i-1), 
@@ -188,12 +208,19 @@ else
     Γy = cov(yt, dims=2)
 
     println(Γy)
+    println(size(Γy))
+    println(rank(Γy))
 end
 
 
 # Construct observation object
 truth = Observations.Obs(yt, Γy, data_names)
+# Truth sample for EKP
 truth_sample = truth.mean
+#sample_ind=randperm!(collect(1:n_samples))[1]
+#truth_sample = yt[:,sample_ind]
+#println("Truth sample:")
+#println(sample_ind)
 ###
 ###  Calibrate: Ensemble Kalman Inversion
 ###
@@ -219,6 +246,7 @@ ekiobj = EnsembleKalmanProcessModule.EnsembleKalmanProcess(initial_params,
 # EKI iterations
 println("EKP inversion error:")
 err = zeros(N_iter) 
+err_params = zeros(N_iter) 
 for i in 1:N_iter
     if log_normal==false
         params_i = get_u_final(ekiobj)
@@ -227,8 +255,10 @@ for i in 1:N_iter
     end
     g_ens = GModel.run_G_ensemble(params_i, lorenz_settings_G)
     EnsembleKalmanProcessModule.update_ensemble!(ekiobj, g_ens) 
-    err[i] = get_error(ekiobj)[end] #mean((params_true - mean(params_i,dims=2)).^2)
-    println("Iteration: "*string(i)*", Error: "*string(err[i]))
+    err[i] = get_error(ekiobj)[end] 
+    err_params[i] = mean((params_true - mean(params_i,dims=2)).^2)
+    println("Iteration: "*string(i)*", Error (data): "*string(err[i]))
+    println("Iteration: "*string(i)*", Error (params): "*string(err_params[i]))
 end
 
 # EKI results: Has the ensemble collapsed toward the truth?
@@ -246,37 +276,60 @@ end
 ###  Emulate: Gaussian Process Regression
 ###
 
+# Emulate-sample settings
+standardize = true
+truncate_svd = 0.95
+
 gppackage = GaussianProcessEmulator.GPJL()
 pred_type = GaussianProcessEmulator.YType()
 
+# Standardize the output data
+# Use median over all data since all data are the same type
+yt_norm = vcat(yt...)
+norm_factor = get_standardizing_factors(yt_norm) 
+println(size(norm_factor))
+#norm_factor = vcat(norm_factor...)
+norm_factor = fill(norm_factor, size(truth_sample))
+println("Standardization factors")
+println(norm_factor)
+
 # Get training points from the EKP iteration number in the second input term  
+N_iter = 5;
 input_output_pairs = Utilities.get_training_points(ekiobj, N_iter)
 normalized = true
 
 # Default kernel
 gpobj = GaussianProcessEmulator.GaussianProcess(input_output_pairs, gppackage; 
 			 obs_noise_cov=Γy, normalized=normalized,
-                         noise_learn=false, prediction_type=pred_type)
+                         noise_learn=false, 
+			 truncate_svd=truncate_svd, standardize=standardize,
+			 prediction_type=pred_type,
+			 norm_factor=norm_factor)
 
 # Check how well the Gaussian Process regression predicts on the
 # true parameters
-if log_normal==false
-    y_mean, y_var = GaussianProcessEmulator.predict(gpobj, reshape(params_true, :, 1), 
-                                   transform_to_real=true)
-else
-	y_mean, y_var = GaussianProcessEmulator.predict(gpobj, reshape(log.(params_true), :, 1), 
-                                   transform_to_real=true)
-end
-
-println("GP prediction on true parameters: ")
-println(vec(y_mean))
-println(" GP variance")
-println(diag(y_var[1],0))
-println("true data: ")
-println(truth.mean)
-println("GP MSE: ")
-println(mean((truth.mean - vec(y_mean)).^2))
-
+#if truncate_svd==1.0
+    if log_normal==false
+        y_mean, y_var = GaussianProcessEmulator.predict(gpobj, reshape(params_true, :, 1), 
+                                       transform_to_real=true)
+    else
+    	y_mean, y_var = GaussianProcessEmulator.predict(gpobj, reshape(log.(params_true), :, 1), 
+                                       transform_to_real=true)
+    end
+
+    println("GP prediction on true parameters: ")
+    println(vec(y_mean))
+    println(" GP variance")
+    println(diag(y_var[1],0))
+    println("true data: ")
+    if standardize
+        println(truth.mean ./ norm_factor)
+    else
+        println(truth.mean)
+    end
+    println("GP MSE: ")
+    println(mean((truth.mean - vec(y_mean)).^2))
+#end
 ###
 ###  Sample: Markov Chain Monte Carlo
 ###
@@ -287,14 +340,17 @@ println("initial parameters: ", u0)
 
 # MCMC parameters    
 mcmc_alg = "rwm" # random walk Metropolis
+svd_flag = true
 
 # First let's run a short chain to determine a good step size
 burnin = 0
 step = 0.1 # first guess
 max_iter = 2000
 yt_sample = truth_sample
 mcmc_test = MarkovChainMonteCarlo.MCMC(yt_sample, Γy, priors, step, u0, max_iter, 
-                         mcmc_alg, burnin, svdflag=true)
+                         mcmc_alg, burnin;
+                         svdflag=svd_flag, standardize=standardize, truncate_svd=truncate_svd,
+			 norm_factor=norm_factor)
 new_step = MarkovChainMonteCarlo.find_mcmc_step!(mcmc_test, gpobj, max_iter=max_iter)
 
 # Now begin the actual MCMC
@@ -305,7 +361,9 @@ burnin = 2000
 max_iter = 100000
 
 mcmc = MarkovChainMonteCarlo.MCMC(yt_sample, Γy, priors, new_step, u0, max_iter, 
-                    mcmc_alg, burnin, svdflag=true)
+                    mcmc_alg, burnin;
+                    svdflag=svd_flag, standardize=standardize, truncate_svd=truncate_svd,
+		    norm_factor=norm_factor)
 MarkovChainMonteCarlo.sample_posterior!(mcmc, gpobj, max_iter)
 
 println("Posterior size")
@@ -335,11 +393,11 @@ save_directory = figure_save_directory*y_folder
 for idx in 1:n_params
     if idx == 1
         param = "F"
-	xbounds = [7.95, 8.05]
+	xbounds = [F_true-1.0, F_true+1.0]
 	xs = collect(xbounds[1]:(xbounds[2]-xbounds[1])/100:xbounds[2])
     elseif idx == 2
         param = "A"
-	xbounds = [2.45, 2.55]
+	xbounds = [A_true-1.0, A_true+1.0]
 	xs = collect(xbounds[1]:(xbounds[2]-xbounds[1])/100:xbounds[2])
     elseif idx == 3
         param = "ω"