From 6a24b9000da6e4348ce470f0bb40b45e67c033f2 Mon Sep 17 00:00:00 2001 From: Rob Trangucci Date: Thu, 3 Aug 2017 14:48:25 -0400 Subject: [PATCH 1/3] Fixes to GP chapter --- src/docs/stan-reference/examples.tex | 284 +++++++++++++++------------ 1 file changed, 154 insertions(+), 130 deletions(-) diff --git a/src/docs/stan-reference/examples.tex b/src/docs/stan-reference/examples.tex index ecc58c2c25f..cfb3ad5f020 100644 --- a/src/docs/stan-reference/examples.tex +++ b/src/docs/stan-reference/examples.tex @@ -6980,17 +6980,16 @@ \section{Simulating from a Gaussian Process} real x[N]; } transformed data { - vector[N] mu; matrix[N, N] K; - mu = rep_vector(0, N); + vector[N] mu = rep_vector(0, N); for (i in 1:(N - 1)) { - K[i,i] = 1 + 0.1; + K[i, i] = 1 + 0.1; for (j in (i + 1):N) { - K[i, j] = exp(-1/2 * pow(x[i] - x[j], 2)); + K[i, j] = exp(-0.5 * pow(x[i] - x[j],2)); K[j, i] = K[i, j]; } } - K[N,N] = 1 + 0.1; + K[N, N] = 1 + 0.1; } parameters { vector[N] y; @@ -7009,12 +7008,10 @@ \section{Simulating from a Gaussian Process} real x[N]; } transformed data { - vector[N] mu; - cov_matrix[N] K; - mu = rep_vector(0, N); - K = cov_exp_quad(x, 1, 1); - for (i in 1:N) - K[i, i] = K[i, i] + 0.1; + matrix[N, N] K = cov_exp_quad(x, 1.0, 1.0); + vector[N] mu = rep_vector(0, N); + for (n in 1:N) + K[n, n] = K[n, n] + 0.1; } parameters { vector[N] y; @@ -7041,8 +7038,8 @@ \subsection{Multivariate Inputs} % \begin{stancode} data { - int D; int N; + int D; vector[D] x[N]; } transformed data { @@ -7062,17 +7059,17 @@ \subsection{Cholesky Factored and Transformed Implementation} A more efficient implementation of the simulation model can be coded in Stan by relocating, rescaling and rotating an isotropic unit -normal variate. Suppose $z$ is an an isotropic unit normal variate +normal variate. Suppose $\eta$ is an an isotropic unit normal variate \[ -z \sim \distro{Normal}({\bf 0}, {\bf 1}), +\eta \sim \distro{Normal}({\bf 0}, {\bf 1}), \] where ${\bf 0}$ is an $N$-vector of 0 values and ${\bf 1}$ is the $N \times N$ identity matrix. Let $L$ be the Cholesky decomposition of $K(x | \theta)$, i.e., the lower-triangular matrix $L$ such that $LL^{\top} = -K(x | \theta)$. Then the transformed variable $\mu + Lz$ has the intended +K(x | \theta)$. Then the transformed variable $\mu + L\eta$ has the intended target distribution, \[ - \mu + Lz \sim \distro{MultiNormal}(\mu(x), K(x | \theta)). + \mu + L\eta \sim \distro{MultiNormal}(\mu(x), K(x | \theta)). \] This transform can be applied directly to Gaussian process @@ -7096,22 +7093,22 @@ \subsection{Cholesky Factored and Transformed Implementation} L = cholesky_decompose(K); } parameters { - vector[N] z; + vector[N] eta; } model { - z ~ normal(0, 1); + eta ~ normal(0, 1); } generated quantities { vector[N] y; - y = mu + L * z; + y = mu + L * eta; } \end{stancode} % The Cholesky decomposition is only computed once, after the data is loaded and the covariance matrix \code{K} computed. The isotropic -normal distribution for \code{z} is specified as a vectorized +normal distribution for \code{eta} is specified as a vectorized univariate distribution for efficiency; this specifies that each -\code{z[n]} has an independent unit normal distribution. The sampled +\code{eta[n]} has an independent unit normal distribution. The sampled vector \code{y} is then defined as a generated quantity using a direct encoding of the transform described above. 
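That the transform yields the intended distribution can be checked directly from the first
two moments, together with the fact that an affine transform of a multivariate normal
variate is again multivariate normal:
\[
\E\left[\mu + L\eta\right] = \mu + L \, \E[\eta] = \mu,
\qquad
\E\left[(L\eta)(L\eta)^{\top}\right]
  = L \, \E\left[\eta \eta^{\top}\right] L^{\top}
  = L \, {\bf 1} \, L^{\top}
  = K(x | \theta).
\]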
@@ -7123,7 +7120,7 @@ \subsection{GP with a normal outcome} $y \in \R^N$, with inputs $x \in \R^N$, for a finite $N$: \begin{align*} - \rho & \sim \distro{Gamma}(4,4) \\ + \rho & \sim \distro{InvGamma}(5, 5) \\ \alpha & \sim \distro{Normal}(0, 1) \\ \sigma & \sim \distro{Normal}(0, 1) \\ f & \sim \distro{MultiNormal}\left(0, K(x | \alpha, \rho)\right) \\ @@ -7134,7 +7131,7 @@ \subsection{GP with a normal outcome} process $f$, yielding the more parsimonious model: \begin{align*} - \rho & \sim \distro{Gamma}(4,4) \\ + \rho & \sim \distro{InvGamma}(5, 5) \\ \alpha & \sim \distro{Normal}(0, 1) \\ \sigma & \sim \distro{Normal}(0, 1) \\ y & \sim \distro{MultiNormal} @@ -7174,8 +7171,7 @@ \subsection{GP with a normal outcome} vector[N] y; } transformed data { - vector[N] mu; - mu = rep_vector(0, N); + vector[N] mu = rep_vector(0, N); } parameters { real rho; @@ -7183,17 +7179,17 @@ \subsection{GP with a normal outcome} real sigma; } model { - matrix[N, N] K = cov_exp_quad(x, alpha, rho); matrix[N, N] L_K; + matrix[N, N] K = cov_exp_quad(x, alpha, rho); real sq_sigma = square(sigma); // diagonal elements for (n in 1:N) K[n, n] = K[n, n] + sq_sigma; - + L_K = cholesky_decompose(K); - - rho ~ gamma(4, 4); + + rho ~ inv_gamma(5, 5); alpha ~ normal(0, 1); sigma ~ normal(0, 1); @@ -7251,18 +7247,18 @@ \subsubsection{Latent variable GP} model { vector[N] f; { - matrix[N, N] K = cov_exp_quad(x, alpha, rho); matrix[N, N] L_K; - + matrix[N, N] K = cov_exp_quad(x, alpha, rho); + // diagonal elements - for (k in 1:N) - K[k, k] = K[k, k] + delta; - + for (n in 1:N) + K[n, n] = K[n, n] + delta; + L_K = cholesky_decompose(K); f = L_K * eta; } - - rho ~ gamma(4, 4); + + rho ~ inv_gamma(5, 5); alpha ~ normal(0, 1); sigma ~ normal(0, 1); eta ~ normal(0, 1); @@ -7306,17 +7302,17 @@ \subsubsection{Poisson GP} } ... parameters { - real alpha; real rho; + real alpha; real a; vector[N] eta; } model { ... + rho ~ inv_gamma(5, 5); alpha ~ normal(0, 1); - rho ~ gamma(4, 4); - eta ~ normal(0, 1); a ~ normal(0, 1); + eta ~ normal(0, 1); y ~ poisson_log(a + f); } @@ -7392,24 +7388,23 @@ \subsection{Automatic Relevance Determination} % \begin{stancode} functions { - matrix L_cov_exp_quad_ARD(real[] x, + matrix L_cov_exp_quad_ARD(vector[] x, real alpha, - vector[] rho, + vector rho, real delta) { int N = size(x); matrix[N, N] K; - real inv_half = -1/2; + real neg_half = -0.5; real sq_alpha = square(alpha); - for (i in 1:(N - 1)) { - K[i,i] = sq_alpha + delta; + for (i in 1:(N-1)) { + K[i, i] = sq_alpha + delta; for (j in (i + 1):N) { - K[i, j] = sq_alpha - * exp(inv_half - * dot_self((x[i] - x[j]) ./ rho)); - K[j, i] = K[i, j]; + K[i, j] = sq_alpha * exp(neg_half * + dot_self((x[i] - x[j]) ./ rho)); + K[j, i] = K[i, j]; } } - K[N,N] = sq_alpha + delta; + K[N, N] = sq_alpha + delta; return cholesky_decompose(K); } } @@ -7420,8 +7415,7 @@ \subsection{Automatic Relevance Determination} vector[N] y; } transformed data { - real delta; - delta = 1e-9; + real delta = 1e-9; } parameters { vector[D] rho; @@ -7432,11 +7426,11 @@ \subsection{Automatic Relevance Determination} model { vector[N] f; { - matrix[N, N] L_K = L_cov_exp_quad_ARD(x, alpha, rho, 1e-9); + matrix[N, N] L_K = L_cov_exp_quad_ARD(x, alpha, rho, delta); f = L_K * eta; } - - rho ~ gamma(4, 4); + + rho ~ inv_gamma(5, 5); alpha ~ normal(0, 1); sigma ~ normal(0, 1); eta ~ normal(0, 1); @@ -7485,7 +7479,11 @@ \subsubsection{Priors for length-scale} functions, so our prior should put non-negligible mass on both sets of functions. 
In this case, an inverse gamma, \code{inv\_gamma\_lpdf} in Stan's language, will work well as it has a sharp left tail that puts negligible mass -on length-scales, but a generous right tail, allowing for large length-scales. +on infinitesimal length-scales, but a generous right tail, allowing for large +length-scales. Inverse gamma priors are zero-avoiding because the density is +zero at zero, so the posterior for length-scale will be pushed away from zero, +leading to very little posterior mass on functions that perfectly interpolate +the observed data. If we're using the GP as a component in a larger model that includes an overall mean and fixed effects for the same variables we're using as the domain for the @@ -7514,9 +7512,9 @@ \subsubsection{Priors for length-scale} & x, a, b \in \reals^{+}, \, p \in \mathbb{Z} \end{align*} % -which has an inverse gamma left tail if $p \leq 0$ and a Gaussian right tail. -This has not yet been implemented in Stan's math library, but it is possible to -implement as a user defined function: +which has an inverse gamma left tail if $p \leq 0$ and an inverse Gaussian +right tail. This has not yet been implemented in Stan's math library, but it +is possible to implement as a user defined function: \begin{stancode} functions { real generalized_inverse_gaussian_lpdf(real x, int p, @@ -7584,36 +7582,44 @@ \subsection{Predictive Inference with a Gaussian Process} real x2[N2]; } transformed data { + real delta = 1e-9; int N = N1 + N2; - vector[N] x; - for (n in 1:N1) x[n] = x1[n]; - for (n in 1:N2) x[N1 + n] = x2[n]; + real x[N]; + for (n1 in 1:N1) x[n1] = x1[n1]; + for (n2 in 1:N2) x[N1 + n2] = x2[n2]; } parameters { - real alpha; real rho; + real alpha; real sigma; vector[N] eta; } -model { +transformed parameters { vector[N] f; { + matrix[N, N] L_K; matrix[N, N] K = cov_exp_quad(x, alpha, rho); - matrix[N, N] L_K = cholesky_decompose(K); + + // diagonal elements + for (n in 1:N) + K[n, n] = K[n, n] + delta; + + L_K = cholesky_decompose(K); f = L_K * eta; } - +} +model { + rho ~ inv_gamma(5, 5); alpha ~ normal(0, 1); - rho ~ gamma(4, 4); sigma ~ normal(0, 1); eta ~ normal(0, 1); - y ~ normal(f[1:N1], sigma); + y1 ~ normal(f[1:N1], sigma); } generated quantities { vector[N2] y2; - for (n in 1:N2) - y2[n] = normal_rng(f[N1 + n], sigma); + for (n2 in 1:N2) + y2[n2] = normal_rng(f[N1 + n2], sigma); } \end{stancode} % @@ -7656,43 +7662,49 @@ \subsubsection{Predictive Inference in non-Gaussian GPs} data { int N1; real x1[N1]; - vector[N1] z1; + int z1[N1]; int N2; real x2[N2]; } transformed data { - int N = N1 + N2; real delta = 1e-9; - vector[N] x; - for (n in 1:N1) x[n] = x1[n]; - for (n in 1:N2) x[N1 + n] = x2[n]; + int N = N1 + N2; + real x[N]; + for (n1 in 1:N1) x[n1] = x1[n1]; + for (n2 in 1:N2) x[N1 + n2] = x2[n2]; } parameters { - real alpha; real rho; + real alpha; + real a; vector[N] eta; } -model { +transformed parameters { vector[N] f; { - matrix[N, N] K = cov_exp_quad(x, alpha, rho); matrix[N, N] L_K; + matrix[N, N] K = cov_exp_quad(x, alpha, rho); + + // diagonal elements for (n in 1:N) K[n, n] = K[n, n] + delta; + L_K = cholesky_decompose(K); f = L_K * eta; } - +} +model { + rho ~ inv_gamma(5, 5); alpha ~ normal(0, 1); - rho ~ gamma(4, 4); + a ~ normal(0, 1); eta ~ normal(0, 1); - y ~ bernoulli_logit(f[1:N1]); + z1 ~ bernoulli_logit(a + f[1:N1]); } generated quantities { int z2[N2]; - for (n in 1:N2) - z2[n] = bernoulli_logit_rng(f[N1 + n]); + for (n2 in 1:N2) + z2[n2] = bernoulli_logit_rng(a + f[N1 + n2]); } \end{stancode} % @@ -7736,41 +7748,41 @@ 
\subsubsection{Analytical Form of Joint Predictive Inference} the conditional mean and the conditional covariance of $p(\tilde{y})$. \begin{stancode} -functions { - vector gp_pred_rng(real[] x2, - vector y1, - real[] x1, - real alpha, - real rho, - real sigma, - real delta) { - int N1 = rows(y1); - int N2 = size(x2); - vector[N2] f2; - - { - matrix[N1, N1] L_K; - vector[N1] K_div_y1; - matrix[N1, N2] k_x1_x2; - matrix[N1, N2] v_pred; - vector[N2] f2_mu; - matrix[N2, N2] cov_f2; - matrix[N2, N2] diag_delta; - matrix[N1, N1] K; - K = cov_exp_quad(x1, alpha, rho); - for (n in 1:N1) - K[n, n] = K[n,n] + square(sigma); - L_K = cholesky_decompose(K); - K_div_y1 = mdivide_left_tri_low(L_K, y1); - K_div_y1 = mdivide_right_tri_low(K_div_y1',L_K)'; - k_x1_x2 = cov_exp_quad(x1, x2, alpha, rho); - f2_mu = (k_x1_x2' * K_div_y1); - v_pred = mdivide_left_tri_low(L_K, k_x1_x2); - cov_f2 = cov_exp_quad(x2, alpha, rho) - v_pred' * v_pred; - diag_delta = diag_matrix(rep_vector(delta,N2)); - - f2 = multi_normal_rng(f2_mu, cov_f2 + diag_delta); - } +functions{ + vector gp_pred_rng(real[] x2, + vector y1, + real[] x1, + real alpha, + real rho, + real sigma, + real delta) { + int N1 = rows(y1); + int N2 = size(x2); + vector[N2] f2; + + { + matrix[N1, N1] L_K; + vector[N1] K_div_y1; + matrix[N1, N2] k_x1_x2; + matrix[N1, N2] v_pred; + vector[N2] f2_mu; + matrix[N2, N2] cov_f2; + matrix[N2, N2] diag_delta; + matrix[N1, N1] K; + K = cov_exp_quad(x1, alpha, rho); + for (n in 1:N1) + K[n, n] = K[n,n] + square(sigma); + L_K = cholesky_decompose(K); + K_div_y1 = mdivide_left_tri_low(L_K, y1); + K_div_y1 = mdivide_right_tri_low(K_div_y1',L_K)'; + k_x1_x2 = cov_exp_quad(x1, x2, alpha, rho); + f2_mu = (k_x1_x2' * K_div_y1); + v_pred = mdivide_left_tri_low(L_K, k_x1_x2); + cov_f2 = cov_exp_quad(x2, alpha, rho) - v_pred' * v_pred; + diag_delta = diag_matrix(rep_vector(delta,N2)); + + f2 = multi_normal_rng(f2_mu, cov_f2 + diag_delta); + } return f2; } } @@ -7783,29 +7795,37 @@ \subsubsection{Analytical Form of Joint Predictive Inference} } transformed data { vector[N1] mu = rep_vector(0, N1); + real delta = 1e-9; } parameters { - real alpha; real rho; + real alpha; real sigma; } model { matrix[N1, N1] L_K; { - matrix[N1, N1] K = cov_exp_quad(x, alpha, rho); + matrix[N1, N1] K = cov_exp_quad(x1, alpha, rho); real sq_sigma = square(sigma); + + // diagonal elements for (n1 in 1:N1) K[n1, n1] = K[n1, n1] + sq_sigma; + L_K = cholesky_decompose(K); } + + rho ~ inv_gamma(5, 5); + alpha ~ normal(0, 1); + sigma ~ normal(0, 1); - y ~ multi_normal_cholesky(mu, L_K); + y1 ~ multi_normal_cholesky(mu, L_K); } generated quantities { vector[N2] f2; vector[N2] y2; - f2 = gp_pred_rng(x2, y1, x1, alpha, rho, sigma); + f2 = gp_pred_rng(x2, y1, x1, alpha, rho, sigma, delta); for (n2 in 1:N2) y2[n2] = normal_rng(f2[n2], sigma); } @@ -7872,36 +7892,36 @@ \subsection{Multiple-output Gaussian processes} \begin{stancode} data { int N; - int M; + int D; real x[N]; - matrix[N, M] y; + matrix[N, D] y; } transformed data { real delta = 1e-9; } parameters { real rho; - vector[M] alpha; + vector[D] alpha; real sigma; - cholesky_factor_corr[M] L_Omega; - matrix[N, M] eta; + cholesky_factor_corr[D] L_Omega; + matrix[N, D] eta; } model { - matrix[N, M] f; + matrix[N, D] f; { matrix[N, N] K = cov_exp_quad(x, 1.0, rho); matrix[N, N] L_K; // diagonal elements - for (k in 1:N) - K[k, k] = K[k, k] + delta; + for (n in 1:N) + K[n, n] = K[n, n] + delta; L_K = cholesky_decompose(K); f = L_K * eta * diag_pre_multiply(alpha, L_Omega)'; } - rho ~ gamma(4, 4); + rho ~ 
inv_gamma(5, 5); alpha ~ normal(0, 1); sigma ~ normal(0, 1); L_Omega ~ lkj_corr_cholesky(3); @@ -7909,6 +7929,10 @@ \subsection{Multiple-output Gaussian processes} to_vector(y) ~ normal(to_vector(f), sigma); } +generated quantities { + matrix[D, D] Omega; + Omega = L_Omega * L_Omega'; +} \end{stancode} \chapter{Directions, Rotations, and Hyperspheres} From 8557b4676484e56f8736d70f123bcb9ec88abf9e Mon Sep 17 00:00:00 2001 From: Rob Trangucci Date: Thu, 3 Aug 2017 15:05:47 -0400 Subject: [PATCH 2/3] Added cross-ref to zero-avoiding prior section --- src/docs/stan-reference/examples.tex | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/src/docs/stan-reference/examples.tex b/src/docs/stan-reference/examples.tex index cfb3ad5f020..405bd2b5c07 100644 --- a/src/docs/stan-reference/examples.tex +++ b/src/docs/stan-reference/examples.tex @@ -1061,7 +1061,8 @@ \section{Hierarchical Priors}\label{hierarchical-priors.section} overly sensitive either statistically or computationally to wide tails in the priors. -\subsection{Boundary-Avoiding Priors for MLE in Hierarchical Models} +\subsection{Boundary-Avoiding Priors for MLE in Hierarchical +Models}\label{bound-avoid-priors.subsection} The fundamental problem with maximum likelihood estimation (MLE) in the hierarchical model setting is that as the hierarchical variance @@ -7480,10 +7481,11 @@ \subsubsection{Priors for length-scale} functions. In this case, an inverse gamma, \code{inv\_gamma\_lpdf} in Stan's language, will work well as it has a sharp left tail that puts negligible mass on infinitesimal length-scales, but a generous right tail, allowing for large -length-scales. Inverse gamma priors are zero-avoiding because the density is -zero at zero, so the posterior for length-scale will be pushed away from zero, -leading to very little posterior mass on functions that perfectly interpolate -the observed data. +length-scales. Inverse gamma priors will avoid infinitesimal length-scales +because the density is zero at zero, so the posterior for length-scale will be +pushed away from zero. An inverse gamma distribution is one of many +zero-avoiding or boundary-avoiding distributions. See +\ref{bound-avoid-priors.subsection} for more on boundary-avoiding priors. 
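To see why the density vanishes at zero, write the inverse gamma density with a generic
shape $a$ and scale $b$ (symbols used only in this display, unrelated to the GP
hyperparameters or to the generalized inverse Gaussian parameters elsewhere in this
chapter):
\[
\distro{InvGamma}(x \, | \, a, b)
  = \dfrac{b^{a}}{\Gamma(a)} \, x^{-(a + 1)} \exp(-b / x),
\qquad x > 0.
\]
The factor $\exp(-b / x)$ goes to zero faster than any power of $x$ as
$x \rightarrow 0^{+}$, so the density, and with it the posterior mass on infinitesimal
length-scales, is suppressed near zero.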
If we're using the GP as a component in a larger model that includes an overall mean and fixed effects for the same variables we're using as the domain for the From d5c52b8924d2b41f6386705e27a1135d24f3cb67 Mon Sep 17 00:00:00 2001 From: Rob Trangucci Date: Tue, 15 Aug 2017 15:54:21 -0400 Subject: [PATCH 3/3] Spacing problems fixed --- src/docs/stan-reference/examples.tex | 95 ++++++++++++++-------------- 1 file changed, 47 insertions(+), 48 deletions(-) diff --git a/src/docs/stan-reference/examples.tex b/src/docs/stan-reference/examples.tex index dd2e22fe7bf..22eb989d476 100644 --- a/src/docs/stan-reference/examples.tex +++ b/src/docs/stan-reference/examples.tex @@ -7482,10 +7482,10 @@ \subsubsection{Priors for length-scale} GP, i.e.: % \begin{align*} - f & \sim \distro{MultiNormal}\left(0, K(x | \alpha, \rho)\right) \\ y_i & - \sim \distro{Normal}(\beta_0 + x_i \beta_{[1:D]} + f_i, \sigma) \, \forall i - \in \{1, \dots, N\} \\ & x_i^T, \beta_{[1:D]} \in \reals^D,\, x \in \reals^{N - \times D},\, f \in \reals^N + f & \sim \distro{MultiNormal}\left(0, K(x | \alpha, \rho)\right) \\ y_i & + \sim \distro{Normal}(\beta_0 + x_i \beta_{[1:D]} + f_i, \sigma) \, \forall i + \in \{1, \dots, N\} \\ & x_i^T, \beta_{[1:D]} \in \reals^D,\, x \in \reals^{N + \times D},\, f \in \reals^N \end{align*} % we'll likely want to constrain large length-scales as well. A length scale @@ -7499,9 +7499,9 @@ \subsubsection{Priors for length-scale} Gaussian distribution: % \begin{align*} - f(x | a, b, p) & = \dfrac{(a/b)^{p/2}}{2K_p(\sqrt{ab})} x^{p - 1}\exp(-(ax + b - / x)/2) \\ - & x, a, b \in \reals^{+}, \, p \in \mathbb{Z} + f(x | a, b, p) & = \dfrac{(a/b)^{p/2}}{2K_p(\sqrt{ab})} x^{p - 1}\exp(-(ax + b + / x)/2) \\ + & x, a, b \in \reals^{+}, \, p \in \mathbb{Z} \end{align*} % which has an inverse gamma left tail if $p \leq 0$ and an inverse Gaussian @@ -7740,41 +7740,40 @@ \subsubsection{Analytical Form of Joint Predictive Inference} the conditional mean and the conditional covariance of $p(\tilde{y})$. 
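These are the standard Gaussian conditioning formulas. Writing
$K_{11} = K(x_1 | \alpha, \rho) + \sigma^2 \, {\bf 1}$, $K_{12}$ for the cross
covariance between $x_1$ and $x_2$, and $K_{22} = K(x_2 | \alpha, \rho)$
(shorthand used only here), the predictive draw is
\[
\tilde{f} \sim \distro{MultiNormal}\left( K_{12}^{\top} K_{11}^{-1} y_1, \;
K_{22} - K_{12}^{\top} K_{11}^{-1} K_{12} \right),
\]
with a small jitter added to the diagonal of the covariance for numerical stability.
The inverse $K_{11}^{-1}$ is never formed explicitly; it is applied through triangular
solves against the Cholesky factor of $K_{11}$, which is exactly what
\code{gp\_pred\_rng} below computes.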
\begin{stancode} -functions{ - vector gp_pred_rng(real[] x2, - vector y1, - real[] x1, - real alpha, - real rho, - real sigma, - real delta) { - int N1 = rows(y1); - int N2 = size(x2); - vector[N2] f2; - - { - matrix[N1, N1] L_K; - vector[N1] K_div_y1; - matrix[N1, N2] k_x1_x2; - matrix[N1, N2] v_pred; - vector[N2] f2_mu; - matrix[N2, N2] cov_f2; - matrix[N2, N2] diag_delta; - matrix[N1, N1] K; - K = cov_exp_quad(x1, alpha, rho); - for (n in 1:N1) - K[n, n] = K[n,n] + square(sigma); - L_K = cholesky_decompose(K); - K_div_y1 = mdivide_left_tri_low(L_K, y1); - K_div_y1 = mdivide_right_tri_low(K_div_y1',L_K)'; - k_x1_x2 = cov_exp_quad(x1, x2, alpha, rho); - f2_mu = (k_x1_x2' * K_div_y1); - v_pred = mdivide_left_tri_low(L_K, k_x1_x2); - cov_f2 = cov_exp_quad(x2, alpha, rho) - v_pred' * v_pred; - diag_delta = diag_matrix(rep_vector(delta,N2)); - - f2 = multi_normal_rng(f2_mu, cov_f2 + diag_delta); - } +functions { + vector gp_pred_rng(real[] x2, + vector y1, + real[] x1, + real alpha, + real rho, + real sigma, + real delta) { + int N1 = rows(y1); + int N2 = size(x2); + vector[N2] f2; + { + matrix[N1, N1] L_K; + vector[N1] K_div_y1; + matrix[N1, N2] k_x1_x2; + matrix[N1, N2] v_pred; + vector[N2] f2_mu; + matrix[N2, N2] cov_f2; + matrix[N2, N2] diag_delta; + matrix[N1, N1] K; + K = cov_exp_quad(x1, alpha, rho); + for (n in 1:N1) + K[n, n] = K[n,n] + square(sigma); + L_K = cholesky_decompose(K); + K_div_y1 = mdivide_left_tri_low(L_K, y1); + K_div_y1 = mdivide_right_tri_low(K_div_y1',L_K)'; + k_x1_x2 = cov_exp_quad(x1, x2, alpha, rho); + f2_mu = (k_x1_x2' * K_div_y1); + v_pred = mdivide_left_tri_low(L_K, k_x1_x2); + cov_f2 = cov_exp_quad(x2, alpha, rho) - v_pred' * v_pred; + diag_delta = diag_matrix(rep_vector(delta,N2)); + + f2 = multi_normal_rng(f2_mu, cov_f2 + diag_delta); + } return f2; } } @@ -7855,7 +7854,7 @@ \subsection{Multiple-output Gaussian processes} \begin{align*} f_{[n,]} \sim \distro{MultiNormal}(m(x)_{[n,]}, K(x | \alpha, \rho)_{[n,n]} C(\phi)) \end{align*} and that the columns of the matrix $f$ are distributed: \begin{align*} f_{[,m]} \sim \distro{MultiNormal}(m(x)_{[,m]}, K(x - | \alpha, \rho) C(\phi)_{[m,m]}) \end{align*} + | \alpha, \rho) C(\phi)_{[m,m]}) \end{align*} This also means means that $\E\left[f^T f\right]$ is equal to $\text{trace}(K(x | \alpha, \rho)) \times C$, whereas $\E\left[ff^T\right]$ is $\text{trace}(C) \times K(x | \alpha, \rho)$. We can derive this using @@ -7869,11 +7868,11 @@ \subsection{Multiple-output Gaussian processes} $\reals^{N \times M}$ using the following algorithm: % \begin{align*} - \eta_{i,j} & \sim \distro{Normal}(0, 1) \, \forall i,j \\ - f & = L_{K(x | 1.0, \rho)} \, \eta \, L_C(\phi)^T \\ - f & \sim \distro{MatrixNormal}(0, K(x | 1.0, \rho), C(\phi)) \\ - \eta & \in \reals^{N \times M} \\ - L_C(\phi) & = \text{cholesky\_decompose}(C(\phi)) \\ + \eta_{i,j} & \sim \distro{Normal}(0, 1) \, \forall i,j \\ + f & = L_{K(x | 1.0, \rho)} \, \eta \, L_C(\phi)^T \\ + f & \sim \distro{MatrixNormal}(0, K(x | 1.0, \rho), C(\phi)) \\ + \eta & \in \reals^{N \times M} \\ + L_C(\phi) & = \text{cholesky\_decompose}(C(\phi)) \\ L_{K(x | 1.0, \rho)} & = \text{cholesky\_decompose}(K(x | 1.0, \rho)) \end{align*}
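The same construction can be packaged as a user-defined RNG for prior or posterior
simulation in the generated quantities block. The following is a sketch only; the
function \code{matrix\_normal\_rng} is not part of the model above, and it assumes its
arguments are the lower-triangular Cholesky factors of the row and column covariance
matrices.
%
\begin{stancode}
functions {
  // One draw of f ~ MatrixNormal(0, K, C) given lower-triangular
  // Cholesky factors L_K of the row covariance K and L_C of the
  // column covariance C.  Because it calls normal_rng, it is only
  // usable where other _rng functions are allowed, such as the
  // generated quantities block.
  matrix matrix_normal_rng(matrix L_K, matrix L_C) {
    int N = rows(L_K);
    int M = rows(L_C);
    matrix[N, M] eta;
    for (n in 1:N)
      for (m in 1:M)
        eta[n, m] = normal_rng(0, 1);
    return L_K * eta * L_C';
  }
}
\end{stancode}
%
With \code{L\_K} the Cholesky factor of $K(x | 1.0, \rho)$ plus the diagonal jitter, and
\code{L\_C = diag\_pre\_multiply(alpha, L\_Omega)}, a single call returns one draw of $f$
with the row and column covariances described above.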