Adding large verification example

libqueso · Aug 2, 2017 · f50e7fa · f50e7fa
1 parent 3d5c1af
commit f50e7fa
Show file tree

Hide file tree

Showing 8 changed files with 10,085 additions and 88 deletions.
diff --git a/examples/Makefile.am b/examples/Makefile.am
@@ -446,18 +446,7 @@ dist_gpmsa_vector_DATA += gp/scalar/ctf_dat.txt
 
 gpmsa_linverifdir = $(prefix)/examples/gp/linear_verif
 
-gpmsa_linverif_PROGRAMS =
-gpmsa_linverif_PROGRAMS += gpmsa_linverif
-
-gpmsa_linverif_SOURCES = gp/linear_verif/gpmsa_linear_verif.C
-gpmsa_linverif_LDADD = $(top_builddir)/src/libqueso.la
-gpmsa_linverif_CPPFLAGS = -I$(top_srcdir)/examples/gp/linear_verif $(AM_CPPFLAGS)
-
 dist_gpmsa_linverif_DATA =
-dist_gpmsa_linverif_DATA += ${gpmsa_linverif_SOURCES}
-dist_gpmsa_linverif_DATA += gp/linear_verif/gpmsa_scalar.txt
-dist_gpmsa_linverif_DATA += gp/linear_verif/sim_scalar.dat
-dist_gpmsa_linverif_DATA += gp/linear_verif/y_exp_scalar.txt
 dist_gpmsa_linverif_DATA += gp/linear_verif/gpmsa_mv.txt
 dist_gpmsa_linverif_DATA += gp/linear_verif/sim_mv.dat
 dist_gpmsa_linverif_DATA += gp/linear_verif/y_exp_mv.txt

diff --git a/test/Makefile.am b/test/Makefile.am
@@ -78,6 +78,7 @@ check_PROGRAMS += test_custom_tk_am
 check_PROGRAMS += test_no_initial_point
 check_PROGRAMS += test_parallel_h5
 check_PROGRAMS += test_gpmsa_scalar_pdf_small
+check_PROGRAMS += test_gpmsa_scalar_pdf_large
 
 LDADD       = $(top_builddir)/src/libqueso.la
 
@@ -260,6 +261,7 @@ test_no_initial_point_SOURCES = test_StatisticalInverseProblem/test_no_initial_p
 test_parallel_h5_SOURCES = test_StatisticalInverseProblem/test_parallel_h5.C
 
 test_gpmsa_scalar_pdf_small_SOURCES = test_gpmsa/scalar_pdf_small.C
+test_gpmsa_scalar_pdf_large_SOURCES = test_gpmsa/scalar_pdf_large.C
 
 TESTS =
 TESTS += unit_driver
@@ -341,6 +343,7 @@ TESTS += test_custom_tk_am
 TESTS += test_no_initial_point
 TESTS += test_StatisticalInverseProblem/test_parallel_h5.sh
 TESTS += test_gpmsa/scalar_pdf_small.sh
+TESTS += test_gpmsa/scalar_pdf_large.sh
 
 if ! MPI_ENABLED
 XFAIL_TESTS = test_SequenceOfVectors/test_unifiedPositionsOfMaximum.sh
@@ -416,6 +419,11 @@ EXTRA_DIST += test_gpmsa/sim_scalar_small.dat
 EXTRA_DIST += test_gpmsa/y_exp_scalar_small.txt
 EXTRA_DIST += test_gpmsa/regression_solution_pdf_small.txt
 EXTRA_DIST += test_gpmsa/scalar_pdf_small.sh
+EXTRA_DIST += test_gpmsa/gpmsa_scalar_pdf_large_input.txt
+EXTRA_DIST += test_gpmsa/sim_scalar_large.dat
+EXTRA_DIST += test_gpmsa/y_exp_scalar_large.txt
+EXTRA_DIST += test_gpmsa/regression_solution_pdf_large.txt
+EXTRA_DIST += test_gpmsa/scalar_pdf_large.sh
 
 CLEANFILES =
 CLEANFILES += test_Environment/debug_output_sub0.txt

diff --git a/examples/gp/linear_verif/gpmsa_scalar.txt → ...st_gpmsa/gpmsa_scalar_pdf_large_input.txt b/examples/gp/linear_verif/gpmsa_scalar.txt → ...st_gpmsa/gpmsa_scalar_pdf_large_input.txt
diff --git a/test/test_gpmsa/regression_solution_pdf_large.txt b/test/test_gpmsa/regression_solution_pdf_large.txt
diff --git a/...ples/gp/linear_verif/gpmsa_linear_verif.C → test/test_gpmsa/scalar_pdf_large.C b/...ples/gp/linear_verif/gpmsa_linear_verif.C → test/test_gpmsa/scalar_pdf_large.C
@@ -42,6 +42,8 @@
 
 #include <cstdio>
 
+#define TOL 1e-5
+
 void open_data_file(const std::string& data_filename, std::ifstream& data_stream) {
   std::string example_dir = "gp/linear_verify";
   data_stream.open(example_dir + "/" + data_filename);
@@ -132,6 +134,7 @@ void readData
     unsigned int block_offset = i * num_responses;
     for (unsigned int j = 0; j < num_responses; ++j) {
       (*(experimentOutputs[i]))[j] -= meansim[j];
+
       (*(experimentOutputs[i]))[j] /= stdsim[j];
       for (unsigned int k = 0; k < num_responses; ++k)
 	experimentMat(block_offset + j, block_offset + k) /=
@@ -182,7 +185,6 @@ void run_scalar(const QUESO::FullEnvironment& env)
   // x1      0.9   1.1
   // x2     -1.5   1.0
   // x3     -1.5   0.5
-
   QUESO::VectorSpace<QUESO::GslVector, QUESO::GslMatrix> configSpace(env,
       "scenario_", numConfigVars, NULL);
 
@@ -197,22 +199,11 @@ void run_scalar(const QUESO::FullEnvironment& env)
       "experimentspace_", experimentSize * numExperiments, NULL);
 
   // Step 5: Instantiate the Gaussian process emulator object
-  //
-  // Regarding simulation scenario input values, the user should standardise
-  // them so that they exist inside a hypercube.
-  //
-  // Regarding simulation output data, the user should transform it so that the
-  // mean is zero and the variance is one.
-  //
-  // Regarding experimental scenario input values, the user should standardize
-  // them so that they exist inside a hypercube.
-  //
-  // Regarding experimental data, the user should transformed it so that it has
-  // zero mean and variance one.
 
-  // GPMSA stores all the information about our simulation
-  // data and experimental data.  It also stores default information about the
-  // hyperparameter distributions.
+  // GPMSA stores all the information about our simulation data and experimental
+  // data, and if the users opts to, will normalise this to have zero mean and
+  // unit variance.  It also stores default information about the hyperparameter
+  // distributions.
   QUESO::GPMSAFactory<QUESO::GslVector, QUESO::GslMatrix>
     gpmsaFactory(env,
                  NULL,
@@ -229,10 +220,6 @@ void run_scalar(const QUESO::FullEnvironment& env)
   // Must set scaling before adding experiments due to construct order
   // As of this implementation autoscale only affects params/scenarios
 
-  // TODO: Ask Brian Williams about preferred scaling and probably set
-  // to scale to domain bounds instead of data bounds...
-  gp_opts.set_autoscale_minmax();
-
   // std::vector containing all the points in scenario space where we have
   // simulations
   std::vector<QUESO::SharedPtr<QUESO::GslVector>::Type>
@@ -265,6 +252,12 @@ void run_scalar(const QUESO::FullEnvironment& env)
     simulationScenarios[i].reset(new QUESO::GslVector(configSpace.zeroVector()));  // 'x_{i+1}^*' in paper
     paramVecs          [i].reset(new QUESO::GslVector(paramSpace.zeroVector()));  // 't_{i+1}^*' in paper
     outputVecs         [i].reset(new QUESO::GslVector(nEtaSpace.zeroVector()));  // 'eta_{i+1}' in paper
+
+    // Must set scaling before adding experiments due to construct order
+    // As of this implementation autoscale only affects params/scenarios
+    gp_opts.set_autoscale_minmax_uncertain_parameter(i);
+    gp_opts.set_autoscale_minmax_scenario_parameter(i);
+    gp_opts.set_autoscale_meanvar_output(i);
   }
 
   for (unsigned int i = 0; i < numExperiments; i++) {
@@ -280,78 +273,80 @@ void run_scalar(const QUESO::FullEnvironment& env)
 
   // Read in data and store the standard deviation of the simulation
   // data (ignored for now).
-  //double stdsim =
-  readData("sim_scalar.dat", "y_exp_scalar.txt", simulationScenarios, paramVecs,
-	   outputVecs, experimentScenarios, experimentVecs, *experimentMat);
+  const char * test_srcdir = std::getenv("srcdir");
+
+  std::string simInputFileName = "test_gpmsa/sim_scalar_large.dat";
+  std::string expInputFileName = "test_gpmsa/y_exp_scalar_large.txt";
+  std::string solInputFileName = "test_gpmsa/regression_solution_pdf_large.txt";
+
+  if (test_srcdir) {
+    simInputFileName = test_srcdir + ('/' + simInputFileName);
+    expInputFileName = test_srcdir + ('/' + expInputFileName);
+    solInputFileName = test_srcdir + ('/' + solInputFileName);
+  }
+
+  readData(simInputFileName,
+           expInputFileName,
+           simulationScenarios,
+           paramVecs,
+           outputVecs,
+           experimentScenarios,
+           experimentVecs,
+           *experimentMat);
 
   // Add simulation and experimental data
   gpmsaFactory.addSimulations(simulationScenarios, paramVecs, outputVecs);
   gpmsaFactory.addExperiments(experimentScenarios, experimentVecs, experimentMat);
 
-  QUESO::GenericVectorRV<QUESO::GslVector, QUESO::GslMatrix> postRv(
-      "post_",
-      gpmsaFactory.prior().imageSet().vectorSpace());
-  QUESO::StatisticalInverseProblem<QUESO::GslVector, QUESO::GslMatrix> ip("",
-      NULL, gpmsaFactory, postRv);
-
-  QUESO::GslVector paramInitials(
+  QUESO::GslVector point(
       gpmsaFactory.prior().imageSet().vectorSpace().zeroVector());
 
-  // Start with the mean of the prior
-  gpmsaFactory.prior().pdf().distributionMean(paramInitials);
-
-  // Initial condition of the chain: may need to tweak to Brian's expectations
-  std::cout << "\nPrior-based initial position:\n" << paramInitials << std::endl;
-
-  // Brian Williams' recommended initial point
-  paramInitials[0]  = 0.175;    // beta0
-  paramInitials[1]  = -0.3;     // beta1
-  paramInitials[2]  = 0.1;      // beta2
-  // [ truncation error precision ] (truncated SVD case only)
-  // BJW: max(100, shape * scale)
-  paramInitials[3]  = 1.0;               // emul precision
-  paramInitials[4]  = std::exp(-0.025);  // emul corr strength (scenario/beta?)
-  paramInitials[5]  = std::exp(-0.025);  // emul corr strength (scenario/beta?)
-  paramInitials[6]  = std::exp(-0.025);  // emul corr strength (scenario/beta?)
-  paramInitials[7]  = std::exp(-0.025);  // emul corr strength (scenario/beta?)
-  paramInitials[8]  = std::exp(-0.025);  // emul corr strength (scenario/beta?)
-  paramInitials[9]  = std::exp(-0.025);  // emul corr strength (scenario/beta?)
-  paramInitials[10] = 20.0;              // discrepancy precision
-  paramInitials[11] = std::exp(-0.025);  // discrepancy corr strength x1
-  paramInitials[12] = std::exp(-0.025);  // discrepancy corr strength x2
-  paramInitials[13] = std::exp(-0.025);  // discrepancy corr strength x3
-  paramInitials[14] = 1000.0;            // emul data precision
-  if (gpmsaFactory.options().m_calibrateObservationalPrecision)
-    // BJW: max(20, shape * scale)
-    paramInitials[15] = 20.0;            // obs precision
+  unsigned int num_lines = 10000;
+  std::vector<double> expected_log_likelihoods(num_lines);
+  std::vector<double> expected_log_priors(num_lines);
+  std::vector<double> expected_log_posteriors(num_lines);
+  std::vector<double> computed_log_likelihoods(num_lines);
+  std::vector<double> computed_log_priors(num_lines);
+  std::vector<double> computed_log_posteriors(num_lines);
 
-  std::cout << "\nAdjusted initial position:\n" << paramInitials << std::endl;
-
-  QUESO::GslMatrix proposalCovMatrix(
-      gpmsaFactory.prior().imageSet().vectorSpace().zeroVector());
+  // File containing x, theta, y (vertical concatenation of lhs.txt, y_mod.txt)
+  std::ifstream solution_data;
+  open_data_file(solInputFileName, solution_data);
+  for (unsigned int i = 0; i < num_lines; i++) {
+    for (unsigned int j = 0; j < point.sizeLocal(); ++j)
+      solution_data >> point[j];
 
-  // Start with the covariance matrix for the whole prior, including
-  // GPMSA hyper-parameters.
-  gpmsaFactory.prior().pdf().distributionVariance(proposalCovMatrix);
+    double log_pdf;
+    solution_data >> log_pdf;
+    expected_log_likelihoods[i] = log_pdf;
 
-  std::cout << "\nPrior proposal covariance diagonal:\n";
-  for (unsigned int i=0; i<proposalCovMatrix.numCols(); ++i)
-    std::cout << proposalCovMatrix(i,i) << " ";
-  std::cout << std::endl;
+    solution_data >> log_pdf;
+    expected_log_priors[i] = log_pdf;
 
-  // FIXME: The default doesn't seem to work for this case; fudge it:
-  proposalCovMatrix(10, 10) = 1.0e2;
-  proposalCovMatrix(14, 14) = 1.0e1;
+    solution_data >> log_pdf;
+    expected_log_posteriors[i] = log_pdf;
 
-  std::cout << "\nFinal proposal covariance diagonal:\n";
-  for (unsigned int i=0; i<proposalCovMatrix.numCols(); ++i)
-    std::cout << proposalCovMatrix(i,i) << " ";
-  std::cout << std::endl;
+    log_pdf = (gpmsaFactory.prior().pdf().lnValue(point));
+    computed_log_priors[i] = log_pdf;
 
-  std::cout << "\nFinal GPMSA Options:" << gpmsaFactory.options() << std::endl;
+    log_pdf = (gpmsaFactory.getGPMSAEmulator().lnValue(point, NULL, NULL, NULL, NULL));
+    computed_log_likelihoods[i] = log_pdf;
+  }
+  solution_data.close();
 
-  ip.solveWithBayesMetropolisHastings(NULL, paramInitials, &proposalCovMatrix);
+  double initial_diff_prior = expected_log_priors[0] - computed_log_priors[0];
+  for (unsigned int i = 0; i < expected_log_priors.size(); i++) {
+    double diff_prior = expected_log_priors[i] - computed_log_priors[i] - initial_diff_prior;
+    queso_require_less_equal_msg(std::abs(diff_prior), TOL, "computed log prior differs too much from expected");
+  }
 
+  // We don't subtract off the initial because the normalisation constants
+  // between our implementation and Matlab's implementation appear to be the
+  // same
+  for (unsigned int i = 0; i < expected_log_likelihoods.size(); i++) {
+    double diff_likelihood = expected_log_likelihoods[i] - computed_log_likelihoods[i];
+    queso_require_less_equal_msg(std::abs(diff_likelihood), TOL, "computed log likelihood differs too much from expected");
+  }
 }
 
 

diff --git a/test/test_gpmsa/scalar_pdf_large.sh b/test/test_gpmsa/scalar_pdf_large.sh
@@ -0,0 +1,5 @@
+#!/bin/bash
+set -eu
+set -o pipefail
+
+./test_gpmsa_scalar_pdf_large ${srcdir}/test_gpmsa/gpmsa_scalar_pdf_large_input.txt
diff --git a/examples/gp/linear_verif/sim_scalar.dat → test/test_gpmsa/sim_scalar_large.dat b/examples/gp/linear_verif/sim_scalar.dat → test/test_gpmsa/sim_scalar_large.dat
diff --git a/examples/gp/linear_verif/y_exp_scalar.txt → test/test_gpmsa/y_exp_scalar_large.txt b/examples/gp/linear_verif/y_exp_scalar.txt → test/test_gpmsa/y_exp_scalar_large.txt