Merge #42
42: Normalize GP input r=charleskawczynski a=bielim

Normalize the inputs used for GP training and prediction.

Normalization is done by centering the inputs (i.e., subtracting their mean) and multiplying them by the square root of the inverse of the input covariance matrix.
Whether or not the GP is trained on normalized inputs can be specified via an optional argument to GPObj ("normalized"), which defaults to true. If the GP has been trained on normalized inputs, the "predict" function automatically applies the same normalization when predicting on new inputs.

The idea is that normalization makes the GP hyperparameters less dependent on the particular problem - e.g., the length scales used in the default kernels should then be reasonable defaults for many problems.
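
For illustration, here is a minimal standalone sketch of the normalization described above (toy random inputs, only Statistics and LinearAlgebra; this is not the package code itself):

```julia
using Statistics, LinearAlgebra

# Toy training inputs, N_samples x N_parameters (the layout GPObj expects)
inputs = randn(100, 3) .* [2.0 0.5 10.0] .+ [1.0 -3.0 50.0]

# The same quantities the GPObj stores (input_mean, sqrt_inv_input_cov);
# Symmetric is used here only to guarantee a real matrix square root
input_mean = reshape(mean(inputs, dims=1), 1, :)                    # 1 x N_parameters
sqrt_inv_input_cov = sqrt(inv(Symmetric(cov(inputs, dims=1))))      # N_parameters x N_parameters

# Center and whiten the training inputs
inputs_norm = (inputs .- input_mean) * sqrt_inv_input_cov

# At prediction time, new inputs get the same transform, using the mean
# and covariance computed from the training inputs
new_inputs = randn(5, 3)
new_inputs_norm = (new_inputs .- input_mean) * sqrt_inv_input_cov

# The normalized training inputs have (approximately) zero mean and
# identity covariance
@assert isapprox(vec(mean(inputs_norm, dims=1)), zeros(3); atol=1e-8)
@assert isapprox(cov(inputs_norm, dims=1), I(3); atol=1e-6)
```

Whitening with the matrix square root (rather than just dividing by per-parameter standard deviations) also removes correlations between parameters, which is why a single set of default length scales has a better chance of working across problems.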

Co-authored-by: Charles Kawczynski <kawczynski.charles@gmail.com>
bors[bot] and charleskawczynski authored Mar 2, 2020
2 parents f85852c + 0d22d17 commit 35d3feb
Showing 3 changed files with 104 additions and 95 deletions.
115 changes: 82 additions & 33 deletions src/GPEmulator.jl
@@ -50,32 +50,47 @@ models.
# Fields
$(DocStringExtensions.FIELDS)
"""
struct GPObj{FT<:AbstractFloat,GPM}
"training points/parameters; N_parameters x N_samples"
struct GPObj{FT<:AbstractFloat, GPM}
"training inputs/parameters; N_samples x N_parameters"
inputs::Array{FT, 2}
"output data (`outputs = G(inputs)`); N_parameters x N_samples"
"training data/targets; N_samples x N_data"
data::Array{FT, 2}
"the Gaussian Process Regression model(s) that are fitted to the given input-output pairs"
"mean of input; 1 x N_parameters"
input_mean::Array{FT, 2}
"square root of the inverse of the input covariance matrix; N_parameters x N_parameters"
sqrt_inv_input_cov::Array{FT, 2}
"the Gaussian Process (GP) Regression model(s) that are fitted to the given input-data pairs"
models::Vector
"Data prediction type"
prediction_type::Union{Nothing,PredictionType}
"whether to fit GP models on normalized inputs ((inputs - input_mean) * sqrt_inv_input_cov)"
normalized::Bool
"prediction type (`y` to predict the data, `f` to predict the latent function)"
prediction_type::Union{Nothing, PredictionType}
end


"""
GPObj(inputs, data, package::GPJL, GPkernel::Union{K, KPy, Nothing}=nothing; prediction_type::PredictionType=YType()) where {K<:Kernel, KPy<:PyObject}
GPObj(inputs, data, package::GPJL; GPkernel::Union{K, KPy, Nothing}=nothing, normalized::Bool=true, prediction_type::PredictionType=YType()) where {K<:Kernel, KPy<:PyObject}
Inputs of size N_samples x N_parameters and data of size N_samples x N_data (inputs are transposed internally when the GP models are constructed)
- `GPkernel` - GaussianProcesses kernel object. If not supplied, a default
kernel is used. The default kernel is the sum of a Squared
Exponential kernel and white noise.
"""
function GPObj(inputs, data, package::GPJL, GPkernel::Union{K, KPy, Nothing}=nothing; prediction_type::PredictionType=YType()) where {K<:Kernel, KPy<:PyObject}
function GPObj(inputs, data, package::GPJL; GPkernel::Union{K, KPy, Nothing}=nothing, normalized::Bool=true, prediction_type::PredictionType=YType()) where {K<:Kernel, KPy<:PyObject}
FT = eltype(data)
models = Any[]
outputs = convert(Array{FT}, data')
inputs = convert(Array{FT}, inputs')

# Make sure inputs and data are arrays of type FT
inputs = convert(Array{FT}, inputs)
data = convert(Array{FT}, data)

input_mean = reshape(mean(inputs, dims=1), 1, :)
sqrt_inv_input_cov = convert(Array{FT}, sqrt(inv(cov(inputs, dims=1))))
if normalized
inputs = (inputs .- input_mean) * sqrt_inv_input_cov
end

# Use a default kernel unless a kernel was supplied to GPObj
if GPkernel==nothing
# Construct kernel:
@@ -91,36 +106,49 @@ function GPObj(inputs, data, package::GPJL, GPkernel::Union{K, KPy, Nothing}=not
GPkernel = rbf + white
end

for i in 1:size(outputs, 1)
for i in 1:size(data, 2)
# Make a copy of GPkernel (because the kernel gets altered in
# every iteration)
GPkernel_i = deepcopy(GPkernel)
# inputs: N_param x N_samples
# outputs: N_data x N_samples
# inputs: N_samples x N_parameters
# data: N_samples x N_data
logstd_obs_noise = log(sqrt(0.5)) # log standard dev of obs noise
# Zero mean function
kmean = MeanZero()
m = GPE(inputs, outputs[i, :], kmean, GPkernel_i, logstd_obs_noise)
m = GPE(inputs', dropdims(data[:, i]', dims=1), kmean, GPkernel_i,
logstd_obs_noise)
optimize!(m, noise=false)
push!(models, m)
end
return GPObj{FT, typeof(package)}(inputs, outputs, models, prediction_type)
return GPObj{FT, typeof(package)}(inputs, data, input_mean,
sqrt_inv_input_cov, models, normalized,
prediction_type)
end


"""
GPObj(inputs, data, package::SKLJL, GPkernel::Union{K, KPy, Nothing}=nothing) where {K<:Kernel, KPy<:PyObject}
GPObj(inputs, data, package::SKLJL; GPkernel::Union{K, KPy, Nothing}=nothing, normalized::Bool=true) where {K<:Kernel, KPy<:PyObject}
Inputs of size N_samples x N_parameters and data of size N_samples x N_data
- `GPkernel` - GaussianProcesses kernel object. If not supplied, a default
kernel is used. The default kernel is the sum of a Squared
Exponential kernel and white noise.
- `GPkernel` - GaussianProcesses or ScikitLearn kernel object. If not supplied,
a default kernel is used. The default kernel is the sum of a
Squared Exponential kernel and white noise.
"""
function GPObj(inputs, data, package::SKLJL, GPkernel::Union{K, KPy, Nothing}=nothing) where {K<:Kernel, KPy<:PyObject}
function GPObj(inputs, data, package::SKLJL; GPkernel::Union{K, KPy, Nothing}=nothing, normalized::Bool=true) where {K<:Kernel, KPy<:PyObject}
FT = eltype(data)
models = Any[]
outputs = convert(Array{FT}, data')

# Make sure inputs and data are arrays of type FT
inputs = convert(Array{FT}, inputs)
data = convert(Array{FT}, data)

input_mean = reshape(mean(inputs, dims=1), 1, :)
sqrt_inv_input_cov = convert(Array{FT}, sqrt(inv(cov(inputs, dims=1))))
if normalized
inputs = (inputs .- input_mean) * sqrt_inv_input_cov
end

if GPkernel==nothing
const_value = 1.0
var_kern = ConstantKernel(constant_value=const_value,
@@ -132,48 +160,69 @@ function GPObj(inputs, data, package::SKLJL, GPkernel::Union{K, KPy, Nothing}=no
noise_level_bounds=(1e-05, 10.0))
GPkernel = var_kern * rbf + white
end
for i in 1:size(outputs,1)
out = reshape(outputs[i,:], (size(outputs, 2), 1))
for i in 1:size(data, 2)
m = GaussianProcessRegressor(kernel=GPkernel,
n_restarts_optimizer=10,
alpha=0.0, normalize_y=true)
ScikitLearn.fit!(m, inputs, out)
ScikitLearn.fit!(m, inputs, data[:, i])
if i==1
println(m.kernel.hyperparameters)
print("Completed training of: ")
end
print(i,", ")
push!(models, m)
end
return GPObj{FT, typeof(package)}(inputs, outputs, models, nothing)
return GPObj{FT, typeof(package)}(inputs, data, input_mean, sqrt_inv_input_cov,
models, normalized, YType())
end


predict(gp::GPObj{FT,GPJL}, new_inputs::Array{FT}) where {FT} = predict(gp, new_inputs, gp.prediction_type)
"""
predict(gp::GPObj{FT,GPJL}, new_inputs::Array{FT}) where {FT} = predict(gp, new_inputs, gp.prediction_type)
Evaluate the GP model(s) at new inputs.
- `gp` - a GPObj
- `new_inputs` - inputs for which GP model(s) is/are evaluated; N_new_inputs x N_parameters
Note: If gp.normalized == true, the new inputs are normalized prior to the prediction
"""
predict(gp::GPObj{FT, GPJL}, new_inputs::Array{FT}) where {FT} = predict(gp, new_inputs, gp.prediction_type)

function predict(gp::GPObj{FT,GPJL},
function predict(gp::GPObj{FT, GPJL},
new_inputs::Array{FT},
::FType) where {FT}
if gp.normalized
new_inputs = (new_inputs .- gp.input_mean) * gp.sqrt_inv_input_cov
end
M = length(gp.models)
μσ2 = [predict_f(gp.models[i], new_inputs) for i in 1:M]
return first.(μσ2), last.(μσ2)
# Return mean(s) and standard deviation(s)
return vcat(first.(μσ2)...), vcat(last.(μσ2)...)
end
function predict(gp::GPObj{FT,GPJL},

function predict(gp::GPObj{FT, GPJL},
new_inputs::Array{FT},
::YType) where {FT}
if gp.normalized
new_inputs = (new_inputs .- gp.input_mean) * gp.sqrt_inv_input_cov
end
M = length(gp.models)
# predict_y expects one input per column, so new_inputs must be transposed
new_inputs = convert(Array{FT}, new_inputs')
μσ2 = [predict_y(gp.models[i], new_inputs) for i in 1:M]
return first.(μσ2), last.(μσ2)
# Return mean(s) and standard deviation(s)
return vcat(first.(μσ2)...), vcat(last.(μσ2)...)
end

function predict(gp::GPObj{FT,SKLJL},
function predict(gp::GPObj{FT, SKLJL},
new_inputs::Array{FT}) where {FT}
if gp.normalized
new_inputs = (new_inputs .- gp.input_mean) * gp.sqrt_inv_input_cov
end
M = length(gp.models)
μσ = [gp.models[i].predict(new_inputs, return_std=true) for i in 1:M]
mean = first.(μσ)
return first.(μσ), last.(μσ).^2
# Return mean(s) and standard deviation(s)
return vcat(first.(μσ)...), vcat(last.(μσ)...).^2
end

end # module
32 changes: 5 additions & 27 deletions src/MCMC.jl
@@ -54,15 +54,6 @@ struct MCMCObj{FT<:AbstractFloat, IT<:Int}
accept::Array{IT}
"MCMC algorithm to use - currently implemented: 'rmw' (random walk Metropolis)"
algtype::String
"whether or not the GP Emulator has been trained on
standardized (i.e., z scores of) parameters - if true,
the evaluation of the prior at the current parameter value
is done after transforming the parameters back to their
original value (i.e., multiply by standard deviation and
add mean).
TODO: Hopefully FreeParams will be able to deal with
standardized parameters in a more elegant way."
standardized::Bool
end

"""
@@ -74,7 +65,6 @@ end
max_iter::IT,
algtype::String,
burnin::IT,
standardized::Bool) where {FT<:AbstractFloat, IT<:Int}
where max_iter is the number of MCMC steps to perform (e.g., 100_000)
@@ -86,8 +76,7 @@ function MCMCObj(obs_sample::Vector{FT},
param_init::Vector{FT},
max_iter::IT,
algtype::String,
burnin::IT,
standardized::Bool) where {FT<:AbstractFloat, IT<:Int}
burnin::IT) where {FT<:AbstractFloat, IT<:Int}

# first row is param_init
posterior = zeros(max_iter + 1, length(param_init))
@@ -102,11 +91,10 @@ function MCMCObj(obs_sample::Vector{FT},
sys.exit()
end
MCMCObj{FT,IT}(obs_sample, obs_cov, obs_covinv, priors, [step], burnin,
param, posterior, log_posterior, iter, accept, algtype,
standardized)

param, posterior, log_posterior, iter, accept, algtype)
end


function reset_with_step!(mcmc::MCMCObj{FT}, step::FT) where {FT}
# reset to beginning with new stepsize
mcmc.step[1] = step
@@ -174,15 +162,8 @@ function log_prior(mcmc::MCMCObj{FT}) where {FT}
# Assume independent priors for each parameter
priors = mcmc.prior
for (param, prior_dist) in zip(mcmc.param, priors)
if mcmc.standardized
param_std = std(prior_dist)
param_mean = mean(prior_dist)
# get density at current parameter value
log_rho[1] += logpdf(prior_dist, param*param_std + param_mean)
else
# get density at current parameter value
log_rho[1] += logpdf(prior_dist, param)
end
# get density at current parameter value
log_rho[1] += logpdf(prior_dist, param)
end

return log_rho[1]
@@ -229,10 +210,7 @@ function find_mcmc_step!(mcmc_test::MCMCObj{FT}, gpobj::GPObj{FT}) where {FT}
while mcmc_accept == false

param = convert(Array{FT, 2}, mcmc_test.param')
# test predictions param' is 1xN_params
gp_pred, gp_predvar = predict(gpobj, param)
gp_pred = cat(gp_pred..., dims=2)
gp_predvar = cat(gp_predvar..., dims=2)

mcmc_sample!(mcmc_test, vec(gp_pred), vec(gp_predvar))
it += 1
52 changes: 17 additions & 35 deletions test/Cloudy/runtests.jl
@@ -35,9 +35,9 @@ N0_true = 360.
θ_true = 4.1
k_true = 2.0
u_true = [N0_true, θ_true, k_true]
priors = [Distributions.Normal(300., 50.), # prior on N0
Distributions.Normal(6.0, 2.0), # prior on θ
Distributions.Normal(3.0, 1.5)] # prior on k
priors = [Distributions.Uniform(100.0, 600.0), # prior on N0
Distributions.Uniform(0.0, 12.0), # prior on θ
Distributions.Uniform(0.0, 6.0)] # prior on k

# We assume that the true particle mass distribution is a Gamma distribution
# with parameters N0_true, θ_true, k_true. Note that dist_true has to be a
@@ -106,8 +106,7 @@ for i in 1:N_iter
g_settings,
PDistributions.update_params,
PDistributions.moment,
Cloudy.Sources.get_int_coalescence
)
Cloudy.Sources.get_int_coalescence)
EKI.update_ensemble!(ekiobj, g_ens)
end

@@ -123,6 +122,7 @@ println(mean(exp_transform(ekiobj.u[end]), dims=1))
###

gppackage = GPJL() # use the GaussianProcesses.jl package
pred_type = YType() # predict the data, not the latent function

# Construct kernel:
# Sum kernel consisting of Matern 5/2 ARD kernel, a Squared Exponential Iso
@@ -140,26 +140,15 @@ GPkernel = kern1 + kern2 + white
log_u_tp, g_tp = Utilities.extract_GP_tp(ekiobj, 5)
u_tp = exp_transform(log_u_tp)

# Standardize the parameters
param_mean = [mean(prior) for prior in priors]
param_std = [std(prior) for prior in priors]
u_tp_zscore = Utilities.orig2zscore(u_tp, param_mean, param_std)
# Standardize the data
data_mean = vec(mean(g_tp, dims=1))
data_std = vec(std(g_tp, dims=1))
g_tp_zscore = Utilities.orig2zscore(g_tp, data_mean, data_std)

# Fit a Gaussian Process regression to the training points
gpobj = GPEmulator.GPObj(u_tp_zscore, g_tp_zscore, gppackage, GPkernel)
gpobj = GPEmulator.GPObj(u_tp, g_tp, gppackage; GPkernel=GPkernel,
normalized=true, prediction_type=pred_type)

# Check how well the Gaussian Process regression predicts on the
# (standardized) true parameters
u_true_zscore = Utilities.orig2zscore(u_true, param_mean, param_std)
y_mean, y_var = GPEmulator.predict(gpobj, reshape(u_true_zscore, 1, :))
y_mean = cat(y_mean..., dims=2)
y_var = cat(y_var..., dims=2)
y_mean, y_var = GPEmulator.predict(gpobj, reshape(u_true, 1, :))
println("GP prediction on true parameters: ")
println(vec(y_mean) .* data_std + data_mean)
println(y_mean)
println("true data: ")
println(truth.mean)

@@ -176,26 +165,19 @@ mcmc_alg = "rwm" # random walk Metropolis
burnin = 0
step = 0.1 # first guess
max_iter = 5000
standardized = true # we are using z scores
yt_sample = Utilities.get_obs_sample(truth)
yt_sample_zscore = Utilities.orig2zscore(yt_sample, data_mean, data_std)
# Normalize covariance of obs noise to determinant 1
truth_cov_norm = (1.0/(det(truth.cov)))^(1.0/n_param) * truth.cov

mcmc_test = MCMC.MCMCObj(yt_sample_zscore, truth_cov_norm,
priors, step, u0,
max_iter, mcmc_alg, burnin, standardized)
mcmc_test = MCMC.MCMCObj(yt_sample, truth.cov, priors, step, u0, max_iter,
mcmc_alg, burnin)
new_step = MCMC.find_mcmc_step!(mcmc_test, gpobj)

# Now begin the actual MCMC
println("Begin MCMC - with step size ", new_step)
u0 = vec(mean(u_tp, dims=1))
# reset parameters
burnin = 1000
max_iter = 100000
mcmc = MCMC.MCMCObj(yt_sample_zscore, truth_cov_norm, priors,
new_step, u0, max_iter, mcmc_alg, burnin,
standardized)
max_iter = 500_000
mcmc = MCMC.MCMCObj(yt_sample, truth.cov, priors, new_step, u0, max_iter,
mcmc_alg, burnin)
MCMC.sample_posterior!(mcmc, gpobj, max_iter)

posterior = MCMC.get_posterior(mcmc)
@@ -211,7 +193,7 @@ println("post_cov")
for idx in 1:n_param
if idx == 1
param = "N0"
xs = collect(100:1:500)
xs = collect(100:1:600)
elseif idx == 2
param = "Theta"
xs = collect(0:0.01:12.0)
@@ -223,8 +205,8 @@ for idx in 1:n_param
end

label = "true " * param
histogram(posterior[:, idx] .* param_std[idx] .+ param_mean[idx],
bins=100, normed=true, fill=:slategray, lab="posterior")
histogram(posterior[:, idx], bins=100, normed=true, fill=:slategray,
lab="posterior")
plot!(xs, mcmc.prior[idx], w=2.6, color=:blue, lab="prior")
plot!([u_true[idx]], seriestype="vline", w=2.6, lab=label)

