From 6273a32ba9fee3321ca3f6090d74ed1b85905fa6 Mon Sep 17 00:00:00 2001
From: tadej <tadej.c@gmail.com>
Date: Thu, 15 Apr 2021 14:16:31 +0200
Subject: [PATCH 1/2] added opencl prim gp cov implementations

# Conflicts:
#	stan/math/opencl/prim.hpp
---
 .../opencl/kernels/gp_exponential_cov.hpp     | 106 +++++++++++
 stan/math/opencl/kernels/gp_matern32_cov.hpp  | 108 +++++++++++
 stan/math/opencl/kernels/gp_matern52_cov.hpp  | 114 ++++++++++++
 stan/math/opencl/prim.hpp                     |   4 +
 stan/math/opencl/prim/gp_dot_prod_cov.hpp     |  65 +++++++
 stan/math/opencl/prim/gp_exponential_cov.hpp  | 169 +++++++++++++++++
 stan/math/opencl/prim/gp_matern32_cov.hpp     | 169 +++++++++++++++++
 stan/math/opencl/prim/gp_matern52_cov.hpp     | 171 ++++++++++++++++++
 .../opencl/prim/gp_exponential_cov_test.cpp   |  86 +++++++++
 .../math/opencl/prim/gp_matern32_cov_test.cpp |  86 +++++++++
 .../math/opencl/prim/gp_matern52_cov_test.cpp |  86 +++++++++
 .../opencl/rev/gp_dot_product_cov_test.cpp    |  77 ++++++++
 12 files changed, 1241 insertions(+)
 create mode 100644 stan/math/opencl/kernels/gp_exponential_cov.hpp
 create mode 100644 stan/math/opencl/kernels/gp_matern32_cov.hpp
 create mode 100644 stan/math/opencl/kernels/gp_matern52_cov.hpp
 create mode 100644 stan/math/opencl/prim/gp_dot_prod_cov.hpp
 create mode 100644 stan/math/opencl/prim/gp_exponential_cov.hpp
 create mode 100644 stan/math/opencl/prim/gp_matern32_cov.hpp
 create mode 100644 stan/math/opencl/prim/gp_matern52_cov.hpp
 create mode 100644 test/unit/math/opencl/prim/gp_exponential_cov_test.cpp
 create mode 100644 test/unit/math/opencl/prim/gp_matern32_cov_test.cpp
 create mode 100644 test/unit/math/opencl/prim/gp_matern52_cov_test.cpp
 create mode 100644 test/unit/math/opencl/rev/gp_dot_product_cov_test.cpp
diff --git a/stan/math/opencl/kernels/gp_exponential_cov.hpp b/stan/math/opencl/kernels/gp_exponential_cov.hpp
new file mode 100644
index 00000000000..c1a836dec05
--- /dev/null
+++ b/stan/math/opencl/kernels/gp_exponential_cov.hpp
@@ -0,0 +1,106 @@
+#ifndef STAN_MATH_OPENCL_KERNELS_GP_EXPONENTIAL_COV_HPP
+#define STAN_MATH_OPENCL_KERNELS_GP_EXPONENTIAL_COV_HPP
+#ifdef STAN_OPENCL
+
+#include <stan/math/opencl/kernel_cl.hpp>
+#include <string>
+
+namespace stan {
+namespace math {
+namespace opencl_kernels {
+// \cond
+static const std::string gp_exponential_cov_kernel_code = STRINGIFY(
+    // \endcond
+    /** \ingroup opencl_kernels
+     * GPU part of calculation of Matern exponential kernel.
+     *
+     * @param[in] x input elements, each stored as element_size doubles
+     * @param[out] res size x size covariance matrix of the elements of x
+     * @param sigma_sq squared standard deviation
+     * @param neg_inv_l -1.0 / length_scale
+     * @param size number of elements in x
+     * @param element_size the number of doubles that make one element of x
+     */
+    __kernel void gp_exponential_cov(const __global double* x,
+                                  __global double* res, const double sigma_sq,
+                                  const double neg_inv_l,
+                                  const int size, const int element_size) {
+      const int i = get_global_id(0);
+      const int j = get_global_id(1);
+      if (i < size && j < size) {
+        if (i > j) {  // compute each pair once and mirror the value
+          double sum = 0;  // squared Euclidean distance
+          for (int k = 0; k < element_size; k++) {
+            double d = x[i * element_size + k] - x[j * element_size + k];
+            sum += d * d;
+          }
+          double a = sigma_sq * exp(neg_inv_l * sqrt(sum));
+          res[j * size + i] = a;
+          res[i * size + j] = a;
+        } else if (i == j) {  // diagonal entries are set to sigma_sq
+          res[j * size + i] = sigma_sq;
+        }
+      }
+    }
+    // \cond
+);
+// \endcond
+
+/** \ingroup opencl_kernels
+ * See the docs for \link kernels/gp_exponential_cov.hpp gp_exponential_cov() \endlink
+ */
+const kernel_cl<in_buffer, out_buffer, double, double, int, int>
+    gp_exponential_cov("gp_exponential_cov", {gp_exponential_cov_kernel_code});
+
+// \cond
+static const std::string gp_exponential_cov_cross_kernel_code = STRINGIFY(
+    // \endcond
+    /** \ingroup opencl_kernels
+     * GPU part of calculation of Matern exponential kernel.
+     *
+     * This function is for the cross covariance
+     * matrix needed to compute the posterior predictive density.
+     *
+     * @param[in] x1 first input, elements stored as element_size doubles
+     * @param[in] x2 second input, elements stored as element_size doubles
+     * @param[out] res size1 x size2 covariances between x1 and x2 elements
+     * @param sigma_sq squared standard deviation
+     * @param neg_inv_l -1.0 / length_scale
+     * @param size1 number of elements in x1
+     * @param size2 number of elements in x2
+     * @param element_size the number of doubles that make one element of x1
+     * and x2
+     */
+    __kernel void gp_exponential_cov_cross(
+        const __global double* x1, const __global double* x2,
+        __global double* res, const double sigma_sq,
+        const double neg_inv_l, const int size1, const int size2,
+        const int element_size) {
+      const int i = get_global_id(0);
+      const int j = get_global_id(1);
+      if (i < size1 && j < size2) {
+        double sum = 0;  // squared Euclidean distance
+        for (int k = 0; k < element_size; k++) {
+          double d = x1[i * element_size + k] - x2[j * element_size + k];
+          sum += d * d;
+        }
+        res[j * size1 + i] = sigma_sq * exp(neg_inv_l * sqrt(sum));
+      }
+    }
+    // \cond
+);
+// \endcond
+
+/** \ingroup opencl_kernels
+ * See the docs for \link kernels/gp_exponential_cov.hpp gp_exponential_cov_cross()
+ * \endlink
+ */
+const kernel_cl<in_buffer, in_buffer, out_buffer, double, double, int, int, int>
+    gp_exponential_cov_cross("gp_exponential_cov_cross",
+                          {gp_exponential_cov_cross_kernel_code});
+
+}  // namespace opencl_kernels
+}  // namespace math
+}  // namespace stan
+#endif
+#endif
diff --git a/stan/math/opencl/kernels/gp_matern32_cov.hpp b/stan/math/opencl/kernels/gp_matern32_cov.hpp
new file mode 100644
index 00000000000..e807e18aeee
--- /dev/null
+++ b/stan/math/opencl/kernels/gp_matern32_cov.hpp
@@ -0,0 +1,108 @@
+#ifndef STAN_MATH_OPENCL_KERNELS_GP_MATERN32_COV_HPP
+#define STAN_MATH_OPENCL_KERNELS_GP_MATERN32_COV_HPP
+#ifdef STAN_OPENCL
+
+#include <stan/math/opencl/kernel_cl.hpp>
+#include <string>
+
+namespace stan {
+namespace math {
+namespace opencl_kernels {
+// \cond
+static const std::string gp_matern32_cov_kernel_code = STRINGIFY(
+    // \endcond
+    /** \ingroup opencl_kernels
+     * GPU part of calculation of Matern 3/2 kernel.
+     *
+     * @param[in] x input elements, each stored as element_size doubles
+     * @param[out] res size x size covariance matrix of the elements of x
+     * @param sigma_sq squared standard deviation
+     * @param root_3_inv_l sqrt(3.0) / length_scale
+     * @param size number of elements in x
+     * @param element_size the number of doubles that make one element of x
+     */
+    __kernel void gp_matern32_cov(
+        const __global double* x, __global double* res, const double sigma_sq,
+        const double root_3_inv_l, const int size, const int element_size) {
+      const int i = get_global_id(0);
+      const int j = get_global_id(1);
+      if (i < size && j < size) {
+        if (i > j) {  // compute each pair once and mirror the value
+          double sum = 0;  // squared Euclidean distance
+          for (int k = 0; k < element_size; k++) {
+            double d = x[i * element_size + k] - x[j * element_size + k];
+            sum += d * d;
+          }
+          double dist = sqrt(sum);
+          double a = sigma_sq * (1.0 + root_3_inv_l * dist)
+                     * exp(-root_3_inv_l * dist);
+          res[j * size + i] = a;
+          res[i * size + j] = a;
+        } else if (i == j) {  // diagonal entries are set to sigma_sq
+          res[j * size + i] = sigma_sq;
+        }
+      }
+    }
+    // \cond
+);
+// \endcond
+
+/** \ingroup opencl_kernels
+ * See the docs for \link kernels/gp_matern32_cov.hpp gp_matern32_cov() \endlink
+ */
+const kernel_cl<in_buffer, out_buffer, double, double, int, int>
+    gp_matern32_cov("gp_matern32_cov", {gp_matern32_cov_kernel_code});
+
+// \cond
+static const std::string gp_matern32_cov_cross_kernel_code = STRINGIFY(
+    // \endcond
+    /** \ingroup opencl_kernels
+     * GPU part of calculation of Matern 3/2 kernel.
+     *
+     * This function is for the cross covariance
+     * matrix needed to compute the posterior predictive density.
+     *
+     * @param[in] x1 first input, elements stored as element_size doubles
+     * @param[in] x2 second input, elements stored as element_size doubles
+     * @param[out] res size1 x size2 covariances between x1 and x2 elements
+     * @param sigma_sq squared standard deviation
+     * @param root_3_inv_l sqrt(3.0) / length_scale
+     * @param size1 number of elements in x1
+     * @param size2 number of elements in x2
+     * @param element_size the number of doubles that make one element of x1
+     * and x2
+     */
+    __kernel void gp_matern32_cov_cross(
+        const __global double* x1, const __global double* x2,
+        __global double* res, const double sigma_sq, const double root_3_inv_l,
+        const int size1, const int size2, const int element_size) {
+      const int i = get_global_id(0);
+      const int j = get_global_id(1);
+      if (i < size1 && j < size2) {
+        double sum = 0;  // squared Euclidean distance
+        for (int k = 0; k < element_size; k++) {
+          double d = x1[i * element_size + k] - x2[j * element_size + k];
+          sum += d * d;
+        }
+        double dist = sqrt(sum);
+        res[j * size1 + i] = sigma_sq * (1.0 + root_3_inv_l * dist)
+                             * exp(-root_3_inv_l * dist);
+      }
+    }
+    // \cond
+);
+// \endcond
+
+/** \ingroup opencl_kernels
+ * See the docs for \link kernels/gp_matern32_cov.hpp gp_matern32_cov_cross()
+ * \endlink
+ */
+const kernel_cl<in_buffer, in_buffer, out_buffer, double, double, int, int, int>
+    gp_matern32_cov_cross("gp_matern32_cov_cross",
+                          {gp_matern32_cov_cross_kernel_code});
+
+}  // namespace opencl_kernels
+}  // namespace math
+}  // namespace stan
+#endif
+#endif
diff --git a/stan/math/opencl/kernels/gp_matern52_cov.hpp b/stan/math/opencl/kernels/gp_matern52_cov.hpp
new file mode 100644
index 00000000000..d2cc63ac666
--- /dev/null
+++ b/stan/math/opencl/kernels/gp_matern52_cov.hpp
@@ -0,0 +1,114 @@
+#ifndef STAN_MATH_OPENCL_KERNELS_gp_MATERN52_COV_HPP
+#define STAN_MATH_OPENCL_KERNELS_gp_MATERN52_COV_HPP
+#ifdef STAN_OPENCL
+
+#include <stan/math/opencl/kernel_cl.hpp>
+#include <string>
+
+namespace stan {
+namespace math {
+namespace opencl_kernels {
+// \cond
+static const std::string gp_matern52_cov_kernel_code = STRINGIFY(
+    // \endcond
+    /** \ingroup opencl_kernels
+     * GPU part of calculation of Matern 5/2 kernel.
+     *
+     * @param[in] x input elements, each stored as element_size doubles
+     * @param[out] res size x size covariance matrix of the elements of x
+     * @param sigma_sq squared standard deviation
+     * @param root_5_inv_l sqrt(5.0) / length_scale
+     * @param inv_l_sq_5_3 5.0 / 3.0 / square(length_scale)
+     * @param size number of elements in x
+     * @param element_size the number of doubles that make one element of x
+     */
+    __kernel void gp_matern52_cov(
+        const __global double* x, __global double* res, const double sigma_sq,
+        const double root_5_inv_l, const double inv_l_sq_5_3, const int size,
+        const int element_size) {
+      const int i = get_global_id(0);
+      const int j = get_global_id(1);
+      if (i < size && j < size) {
+        if (i > j) {  // compute each pair once and mirror the value
+          double sum = 0;  // squared Euclidean distance
+          for (int k = 0; k < element_size; k++) {
+            double d = x[i * element_size + k] - x[j * element_size + k];
+            sum += d * d;
+          }
+          double dist = sqrt(sum);
+          double a = sigma_sq * (1.0 + root_5_inv_l * dist + inv_l_sq_5_3 * sum)
+                     * exp(-root_5_inv_l * dist);
+          res[j * size + i] = a;
+          res[i * size + j] = a;
+        } else if (i == j) {  // diagonal entries are set to sigma_sq
+          res[j * size + i] = sigma_sq;
+        }
+      }
+    }
+    // \cond
+);
+// \endcond
+
+/** \ingroup opencl_kernels
+ * See the docs for \link kernels/gp_matern52_cov.hpp gp_matern52_cov() \endlink
+ */
+const kernel_cl<in_buffer, out_buffer, double, double, double, int, int>
+    gp_matern52_cov("gp_matern52_cov", {gp_matern52_cov_kernel_code});
+
+// \cond
+static const std::string gp_matern52_cov_cross_kernel_code = STRINGIFY(
+    // \endcond
+    /** \ingroup opencl_kernels
+     * GPU part of calculation of Matern 5/2 kernel.
+     *
+     * This function is for the cross covariance
+     * matrix needed to compute the posterior predictive density.
+     *
+     * @param[in] x1 first input, elements stored as element_size doubles
+     * @param[in] x2 second input, elements stored as element_size doubles
+     * @param[out] res size1 x size2 covariances between x1 and x2 elements
+     * @param sigma_sq squared standard deviation
+     * @param root_5_inv_l sqrt(5.0) / length_scale
+     * @param inv_l_sq_5_3 5.0 / 3.0 / square(length_scale)
+     * @param size1 number of elements in x1
+     * @param size2 number of elements in x2
+     * @param element_size the number of doubles that make one element of x1
+     * and x2
+     */
+    __kernel void gp_matern52_cov_cross(
+        const __global double* x1, const __global double* x2,
+        __global double* res, const double sigma_sq, const double root_5_inv_l,
+        const double inv_l_sq_5_3, const int size1, const int size2,
+        const int element_size) {
+      const int i = get_global_id(0);
+      const int j = get_global_id(1);
+      if (i < size1 && j < size2) {
+        double sum = 0;  // squared Euclidean distance
+        for (int k = 0; k < element_size; k++) {
+          double d = x1[i * element_size + k] - x2[j * element_size + k];
+          sum += d * d;
+        }
+        double dist = sqrt(sum);
+        res[j * size1 + i] = sigma_sq
+                             * (1.0 + root_5_inv_l * dist + inv_l_sq_5_3 * sum)
+                             * exp(-root_5_inv_l * dist);
+      }
+    }
+    // \cond
+);
+// \endcond
+
+/** \ingroup opencl_kernels
+ * See the docs for \link kernels/gp_matern52_cov.hpp gp_matern52_cov_cross()
+ * \endlink
+ */
+const kernel_cl<in_buffer, in_buffer, out_buffer, double, double, double, int,
+                int, int>
+    gp_matern52_cov_cross("gp_matern52_cov_cross",
+                          {gp_matern52_cov_cross_kernel_code});
+
+}  // namespace opencl_kernels
+}  // namespace math
+}  // namespace stan
+#endif
+#endif
diff --git a/stan/math/opencl/prim.hpp b/stan/math/opencl/prim.hpp
index 6dee8701bfe..32223f2e86c 100644
--- a/stan/math/opencl/prim.hpp
+++ b/stan/math/opencl/prim.hpp
@@ -151,7 +151,11 @@
 #include <stan/math/opencl/prim/frechet_lcdf.hpp>
 #include <stan/math/opencl/prim/frechet_lpdf.hpp>
 #include <stan/math/opencl/prim/gamma_lpdf.hpp>
+#include <stan/math/opencl/prim/gp_dot_prod_cov.hpp>
+#include <stan/math/opencl/prim/gp_exponential_cov.hpp>
 #include <stan/math/opencl/prim/gp_exp_quad_cov.hpp>
+#include <stan/math/opencl/prim/gp_matern32_cov.hpp>
+#include <stan/math/opencl/prim/gp_matern52_cov.hpp>
 #include <stan/math/opencl/prim/gumbel_cdf.hpp>
 #include <stan/math/opencl/prim/gumbel_lccdf.hpp>
 #include <stan/math/opencl/prim/gumbel_lcdf.hpp>
diff --git a/stan/math/opencl/prim/gp_dot_prod_cov.hpp b/stan/math/opencl/prim/gp_dot_prod_cov.hpp
new file mode 100644
index 00000000000..471665c06fc
--- /dev/null
+++ b/stan/math/opencl/prim/gp_dot_prod_cov.hpp
@@ -0,0 +1,65 @@
+#ifndef STAN_MATH_OPENCL_PRIM_GP_DOT_PROD_COV_HPP
+#define STAN_MATH_OPENCL_PRIM_GP_DOT_PROD_COV_HPP
+#ifdef STAN_OPENCL
+
+#include <stan/math/opencl/matrix_cl.hpp>
+#include <stan/math/opencl/kernel_generator.hpp>
+#include <stan/math/prim/fun/square.hpp>
+#include <stan/math/prim/fun/value_of.hpp>
+
+namespace stan {
+namespace math {
+
+/** \ingroup opencl
+ * Dot product kernel on the GPU: square(sigma) + transpose(x) * x.
+ *
+ * @tparam T_x Type of the matrix
+ * @tparam T_sigma Type of sigma
+ * @param x input matrix; checked to contain no NaN
+ * @param sigma standard deviation; checked to be
+ * nonnegative and finite
+ *
+ * @return dot product covariance matrix that is positive semi-definite
+ */
+template <typename T_x, typename T_sigma,
+          require_all_prim_or_rev_kernel_expression_t<T_x>* = nullptr,
+          require_stan_scalar_t<T_sigma>* = nullptr>
+inline auto gp_dot_prod_cov(const T_x& x,
+                               const T_sigma sigma) {
+  const char* fun = "gp_dot_prod_cov(OpenCL)";
+  check_nonnegative(fun, "sigma", sigma);
+  check_finite(fun, "sigma", sigma);
+  const auto& x_val = value_of(x);
+  check_cl(fun, "x", x_val, "not NaN") = !isnan(x_val);
+  return add(square(sigma), transpose(x) * x);
+}
+
+/** \ingroup opencl
+ * Dot product kernel on the GPU: square(sigma) + transpose(x) * y.
+ * @tparam T_x Type of the first matrix
+ * @tparam T_y Type of the second matrix
+ * @tparam T_sigma Type of sigma
+ * @param x first input matrix; checked to contain no NaN
+ * @param y second input matrix; checked to contain no NaN
+ * @param sigma standard deviation; checked to be nonnegative and finite
+ *
+ * @return dot product covariance matrix
+ */
+template <typename T_x, typename T_y, typename T_sigma,
+          require_all_prim_or_rev_kernel_expression_t<T_x, T_y>* = nullptr,
+          require_stan_scalar_t<T_sigma>* = nullptr>
+inline auto gp_dot_prod_cov(const T_x& x, const T_y& y,
+                               const T_sigma sigma) {
+  const char* fun = "gp_dot_prod_cov(OpenCL)";
+  check_nonnegative(fun, "sigma", sigma);
+  check_finite(fun, "sigma", sigma);
+  const auto& x_val = value_of(x);
+  const auto& y_val = value_of(y);
+  check_cl(fun, "x", x_val, "not NaN") = !isnan(x_val);
+  check_cl(fun, "y", y_val, "not NaN") = !isnan(y_val);
+  return add(square(sigma), transpose(x) * y);
+}
+}  // namespace math
+}  // namespace stan
+#endif
+#endif
diff --git a/stan/math/opencl/prim/gp_exponential_cov.hpp b/stan/math/opencl/prim/gp_exponential_cov.hpp
new file mode 100644
index 00000000000..aa54f924a24
--- /dev/null
+++ b/stan/math/opencl/prim/gp_exponential_cov.hpp
@@ -0,0 +1,169 @@
+#ifndef STAN_MATH_OPENCL_PRIM_GP_EXPONENTIAL_COV_HPP
+#define STAN_MATH_OPENCL_PRIM_GP_EXPONENTIAL_COV_HPP
+#ifdef STAN_OPENCL
+
+#include <stan/math/prim/meta.hpp>
+#include <stan/math/prim/fun/square.hpp>
+#include <stan/math/opencl/matrix_cl.hpp>
+#include <stan/math/opencl/kernels/gp_exponential_cov.hpp>
+#include <stan/math/opencl/err.hpp>
+#include <CL/cl2.hpp>
+
+namespace stan {
+namespace math {
+/** \ingroup opencl
+ * Matern exponential kernel on the GPU.
+ *
+ * @tparam T1 Type of the matrix
+ * @tparam T2 Type of sigma
+ * @tparam T3 Type of length_scale
+ * @param x input matrix; each column is one input element
+ * @param sigma standard deviation
+ * @param length_scale scalar length scale
+ *
+ * @return exponential covariance matrix of size x.cols() by x.cols()
+ */
+template <typename T1, typename T2, typename T3,
+          require_all_kernel_expressions_and_none_scalar_t<T1>* = nullptr,
+          require_all_arithmetic_t<T2, T3>* = nullptr>
+inline matrix_cl<return_type_t<T1, T2, T3>> gp_exponential_cov(
+    const T1& x, const T2 sigma, const T3 length_scale) {
+  const auto& x_eval = x.eval();
+  matrix_cl<return_type_t<T1, T2, T3>> res(x.cols(), x.cols());
+  int block_size = 16;
+  int n_blocks = (x.cols() + block_size - 1) / block_size;  // ceiling division
+  int blocked_size = block_size * n_blocks;  // padded to whole blocks
+  try {
+    opencl_kernels::gp_exponential_cov(
+        cl::NDRange(blocked_size, blocked_size),
+        cl::NDRange(block_size, block_size), x_eval, res, sigma * sigma,
+        -1.0 / length_scale, x.cols(), x.rows());
+  } catch (const cl::Error& e) {
+    check_opencl_error("gp_exponential_cov", e);
+  }
+  return res;
+}
+
+/** \ingroup opencl
+ * Matern exponential kernel on the GPU.
+ *
+ * This function is for the cross covariance
+ * matrix needed to compute the posterior predictive density.
+ *
+ * @tparam T1 Type of the first matrix
+ * @tparam T2 Type of the second matrix
+ * @tparam T3 Type of sigma
+ * @tparam T4 Type of length scale
+ * @param x first input matrix; each column is one input element
+ * @param y second input matrix; must have as many rows as x
+ * @param sigma standard deviation
+ * @param length_scale scalar length scale
+ *
+ * @return exponential cross covariance matrix, x.cols() by y.cols()
+ */
+template <typename T1, typename T2, typename T3, typename T4,
+          require_all_kernel_expressions_and_none_scalar_t<T1, T2>* = nullptr,
+          require_all_arithmetic_t<T3, T4>* = nullptr>
+inline matrix_cl<return_type_t<T1, T2, T3, T4>> gp_exponential_cov(
+    const T1& x, const T2& y, const T3 sigma, const T4 length_scale) {
+  check_size_match("gp_exponential_cov_cross", "x", x.rows(), "y", y.rows());
+  matrix_cl<return_type_t<T1, T2, T3, T4>> res(x.cols(), y.cols());
+  const auto& x_eval = x.eval();
+  const auto& y_eval = y.eval();
+  int block_size = 16;
+  int x_blocks = (x.cols() + block_size - 1) / block_size;  // ceiling division
+  int x_blocked_size = block_size * x_blocks;
+  int y_blocks = (y.cols() + block_size - 1) / block_size;  // ceiling division
+  int y_blocked_size = block_size * y_blocks;
+  try {
+    opencl_kernels::gp_exponential_cov_cross(
+        cl::NDRange(x_blocked_size, y_blocked_size),
+        cl::NDRange(block_size, block_size), x_eval, y_eval, res, sigma * sigma,
+        -1.0 / length_scale, x.cols(), y.cols(), x.rows());
+  } catch (const cl::Error& e) {
+    check_opencl_error("gp_exponential_cov_cross", e);
+  }
+  return res;
+}
+
+/** \ingroup opencl
+ * Matern exponential kernel on the GPU, with a non-scalar length scale.
+ *
+ * @tparam T1 Type of the matrix
+ * @tparam T2 Type of sigma
+ * @tparam T3 Type of length_scale
+ * @param x input matrix; each column is one input element
+ * @param sigma standard deviation
+ * @param length_scale length scales; x is divided by their rowwise broadcast
+ *
+ * @return exponential covariance matrix of size x.cols() by x.cols()
+ */
+template <typename T1, typename T2, typename T3,
+          require_all_kernel_expressions_and_none_scalar_t<T1, T3>* = nullptr,
+          require_all_arithmetic_t<T2>* = nullptr>
+inline matrix_cl<return_type_t<T1, T2, T3>> gp_exponential_cov(
+    const T1& x, const T2 sigma, const T3 length_scale) {
+  const auto& x_eval = elt_divide(x, rowwise_broadcast(length_scale)).eval();
+  matrix_cl<return_type_t<T1, T2, T3>> res(x.cols(), x.cols());
+  int block_size = 16;
+  int n_blocks = (x.cols() + block_size - 1) / block_size;  // ceiling division
+  int blocked_size = block_size * n_blocks;  // padded to whole blocks
+  try {
+    opencl_kernels::gp_exponential_cov(cl::NDRange(blocked_size, blocked_size),
+                                    cl::NDRange(block_size, block_size), x_eval,
+                                    res, sigma * sigma, -1.0, x.cols(),
+                                    x.rows());
+  } catch (const cl::Error& e) {
+    check_opencl_error("gp_exponential_cov", e);
+  }
+  return res;
+}
+
+/** \ingroup opencl
+ * Matern exponential kernel on the GPU, with a non-scalar length scale.
+ *
+ * This function is for the cross covariance
+ * matrix needed to compute the posterior predictive density.
+ *
+ * @tparam T1 Type of the first matrix
+ * @tparam T2 Type of the second matrix
+ * @tparam T3 Type of sigma
+ * @tparam T4 Type of length scale
+ * @param x first input matrix; each column is one input element
+ * @param y second input matrix; must have as many rows as x
+ * @param sigma standard deviation
+ * @param length_scale length scales; x and y are divided by their broadcast
+ *
+ * @return exponential cross covariance matrix, x.cols() by y.cols()
+ */
+template <
+    typename T1, typename T2, typename T3, typename T4,
+    require_all_kernel_expressions_and_none_scalar_t<T1, T2, T4>* = nullptr,
+    require_all_arithmetic_t<T3>* = nullptr>
+inline matrix_cl<return_type_t<T1, T2, T3, T4>> gp_exponential_cov(
+    const T1& x, const T2& y, const T3 sigma, const T4 length_scale) {
+  check_size_match("gp_exponential_cov_cross", "x", x.rows(), "y", y.rows());
+  matrix_cl<return_type_t<T1, T2, T3, T4>> res(x.cols(), y.cols());
+  const auto& x_eval = elt_divide(x, rowwise_broadcast(length_scale)).eval();
+  const auto& y_eval = elt_divide(y, rowwise_broadcast(length_scale)).eval();
+  int block_size = 16;
+  int x_blocks = (x.cols() + block_size - 1) / block_size;  // ceiling division
+  int x_blocked_size = block_size * x_blocks;
+  int y_blocks = (y.cols() + block_size - 1) / block_size;  // ceiling division
+  int y_blocked_size = block_size * y_blocks;
+  try {
+    opencl_kernels::gp_exponential_cov_cross(
+        cl::NDRange(x_blocked_size, y_blocked_size),
+        cl::NDRange(block_size, block_size), x_eval, y_eval, res, sigma * sigma,
+        -1.0, x.cols(), y.cols(), x.rows());
+  } catch (const cl::Error& e) {
+    check_opencl_error("gp_exponential_cov_cross", e);
+  }
+  return res;
+}
+
+}  // namespace math
+}  // namespace stan
+
+#endif
+#endif
diff --git a/stan/math/opencl/prim/gp_matern32_cov.hpp b/stan/math/opencl/prim/gp_matern32_cov.hpp
new file mode 100644
index 00000000000..19afc632d26
--- /dev/null
+++ b/stan/math/opencl/prim/gp_matern32_cov.hpp
@@ -0,0 +1,169 @@
+#ifndef STAN_MATH_OPENCL_PRIM_GP_MATERN32_COV_HPP
+#define STAN_MATH_OPENCL_PRIM_GP_MATERN32_COV_HPP
+#ifdef STAN_OPENCL
+
+#include <stan/math/prim/meta.hpp>
+#include <stan/math/prim/fun/square.hpp>
+#include <stan/math/opencl/matrix_cl.hpp>
+#include <stan/math/opencl/kernels/gp_matern32_cov.hpp>
+#include <stan/math/opencl/err.hpp>
+#include <CL/cl2.hpp>
+
+namespace stan {
+namespace math {
+/** \ingroup opencl
+ * Matern 3/2 kernel on the GPU.
+ *
+ * @tparam T1 Type of the matrix
+ * @tparam T2 Type of sigma
+ * @tparam T3 Type of length_scale
+ * @param x input matrix; each column is one input element
+ * @param sigma standard deviation
+ * @param length_scale scalar length scale
+ *
+ * @return Matern 3/2 covariance matrix of size x.cols() by x.cols()
+ */
+template <typename T1, typename T2, typename T3,
+          require_all_kernel_expressions_and_none_scalar_t<T1>* = nullptr,
+          require_all_arithmetic_t<T2, T3>* = nullptr>
+inline matrix_cl<return_type_t<T1, T2, T3>> gp_matern32_cov(
+    const T1& x, const T2 sigma, const T3 length_scale) {
+  const auto& x_eval = x.eval();
+  matrix_cl<return_type_t<T1, T2, T3>> res(x.cols(), x.cols());
+  int block_size = 16;
+  int n_blocks = (x.cols() + block_size - 1) / block_size;  // ceiling division
+  int blocked_size = block_size * n_blocks;  // padded to whole blocks
+  try {
+    opencl_kernels::gp_matern32_cov(
+        cl::NDRange(blocked_size, blocked_size),
+        cl::NDRange(block_size, block_size), x_eval, res, sigma * sigma,
+        std::sqrt(3.0) / length_scale, x.cols(), x.rows());
+  } catch (const cl::Error& e) {
+    check_opencl_error("gp_matern32_cov", e);
+  }
+  return res;
+}
+
+/** \ingroup opencl
+ * Matern 3/2 kernel on the GPU.
+ *
+ * This function is for the cross covariance
+ * matrix needed to compute the posterior predictive density.
+ *
+ * @tparam T1 Type of the first matrix
+ * @tparam T2 Type of the second matrix
+ * @tparam T3 Type of sigma
+ * @tparam T4 Type of length scale
+ * @param x first input matrix; each column is one input element
+ * @param y second input matrix; must have as many rows as x
+ * @param sigma standard deviation
+ * @param length_scale scalar length scale
+ *
+ * @return Matern 3/2 cross covariance matrix, x.cols() by y.cols()
+ */
+template <typename T1, typename T2, typename T3, typename T4,
+          require_all_kernel_expressions_and_none_scalar_t<T1, T2>* = nullptr,
+          require_all_arithmetic_t<T3, T4>* = nullptr>
+inline matrix_cl<return_type_t<T1, T2, T3, T4>> gp_matern32_cov(
+    const T1& x, const T2& y, const T3 sigma, const T4 length_scale) {
+  check_size_match("gp_matern32_cov_cross", "x", x.rows(), "y", y.rows());
+  matrix_cl<return_type_t<T1, T2, T3, T4>> res(x.cols(), y.cols());
+  const auto& x_eval = x.eval();
+  const auto& y_eval = y.eval();
+  int block_size = 16;
+  int x_blocks = (x.cols() + block_size - 1) / block_size;  // ceiling division
+  int x_blocked_size = block_size * x_blocks;
+  int y_blocks = (y.cols() + block_size - 1) / block_size;  // ceiling division
+  int y_blocked_size = block_size * y_blocks;
+  try {
+    opencl_kernels::gp_matern32_cov_cross(
+        cl::NDRange(x_blocked_size, y_blocked_size),
+        cl::NDRange(block_size, block_size), x_eval, y_eval, res, sigma * sigma,
+        std::sqrt(3.0) / length_scale, x.cols(), y.cols(), x.rows());
+  } catch (const cl::Error& e) {
+    check_opencl_error("gp_matern32_cov_cross", e);
+  }
+  return res;
+}
+
+/** \ingroup opencl
+ * Matern 3/2 kernel on the GPU, with a non-scalar length scale.
+ *
+ * @tparam T1 Type of the matrix
+ * @tparam T2 Type of sigma
+ * @tparam T3 Type of length_scale
+ * @param x input matrix; each column is one input element
+ * @param sigma standard deviation
+ * @param length_scale length scales; x is divided by their rowwise broadcast
+ *
+ * @return Matern 3/2 covariance matrix of size x.cols() by x.cols()
+ */
+template <typename T1, typename T2, typename T3,
+          require_all_kernel_expressions_and_none_scalar_t<T1, T3>* = nullptr,
+          require_all_arithmetic_t<T2>* = nullptr>
+inline matrix_cl<return_type_t<T1, T2, T3>> gp_matern32_cov(
+    const T1& x, const T2 sigma, const T3 length_scale) {
+  const auto& x_eval = elt_divide(x, rowwise_broadcast(length_scale)).eval();
+  matrix_cl<return_type_t<T1, T2, T3>> res(x.cols(), x.cols());
+  int block_size = 16;
+  int n_blocks = (x.cols() + block_size - 1) / block_size;  // ceiling division
+  int blocked_size = block_size * n_blocks;  // padded to whole blocks
+  try {
+    opencl_kernels::gp_matern32_cov(cl::NDRange(blocked_size, blocked_size),
+                                    cl::NDRange(block_size, block_size), x_eval,
+                                    res, sigma * sigma, std::sqrt(3.0), x.cols(),
+                                    x.rows());
+  } catch (const cl::Error& e) {
+    check_opencl_error("gp_matern32_cov", e);
+  }
+  return res;
+}
+
+/** \ingroup opencl
+ * Matern 3/2 kernel on the GPU, with a non-scalar length scale.
+ *
+ * This function is for the cross covariance
+ * matrix needed to compute the posterior predictive density.
+ *
+ * @tparam T1 Type of the first matrix
+ * @tparam T2 Type of the second matrix
+ * @tparam T3 Type of sigma
+ * @tparam T4 Type of length scale
+ * @param x first input matrix; each column is one input element
+ * @param y second input matrix; must have as many rows as x
+ * @param sigma standard deviation
+ * @param length_scale length scales; x and y are divided by their broadcast
+ *
+ * @return Matern 3/2 cross covariance matrix, x.cols() by y.cols()
+ */
+template <
+    typename T1, typename T2, typename T3, typename T4,
+    require_all_kernel_expressions_and_none_scalar_t<T1, T2, T4>* = nullptr,
+    require_all_arithmetic_t<T3>* = nullptr>
+inline matrix_cl<return_type_t<T1, T2, T3, T4>> gp_matern32_cov(
+    const T1& x, const T2& y, const T3 sigma, const T4 length_scale) {
+  check_size_match("gp_matern32_cov_cross", "x", x.rows(), "y", y.rows());
+  matrix_cl<return_type_t<T1, T2, T3, T4>> res(x.cols(), y.cols());
+  const auto& x_eval = elt_divide(x, rowwise_broadcast(length_scale)).eval();
+  const auto& y_eval = elt_divide(y, rowwise_broadcast(length_scale)).eval();
+  int block_size = 16;
+  int x_blocks = (x.cols() + block_size - 1) / block_size;  // ceiling division
+  int x_blocked_size = block_size * x_blocks;
+  int y_blocks = (y.cols() + block_size - 1) / block_size;  // ceiling division
+  int y_blocked_size = block_size * y_blocks;
+  try {
+    opencl_kernels::gp_matern32_cov_cross(
+        cl::NDRange(x_blocked_size, y_blocked_size),
+        cl::NDRange(block_size, block_size), x_eval, y_eval, res, sigma * sigma,
+        std::sqrt(3.0), x.cols(), y.cols(), x.rows());
+  } catch (const cl::Error& e) {
+    check_opencl_error("gp_matern32_cov_cross", e);
+  }
+  return res;
+}
+
+}  // namespace math
+}  // namespace stan
+
+#endif
+#endif
diff --git a/stan/math/opencl/prim/gp_matern52_cov.hpp b/stan/math/opencl/prim/gp_matern52_cov.hpp
new file mode 100644
index 00000000000..e0ea45f8f35
--- /dev/null
+++ b/stan/math/opencl/prim/gp_matern52_cov.hpp
@@ -0,0 +1,171 @@
+#ifndef STAN_MATH_OPENCL_PRIM_GP_MATERN52_COV_HPP
+#define STAN_MATH_OPENCL_PRIM_GP_MATERN52_COV_HPP
+#ifdef STAN_OPENCL
+
+#include <stan/math/prim/meta.hpp>
+#include <stan/math/prim/fun/square.hpp>
+#include <stan/math/opencl/matrix_cl.hpp>
+#include <stan/math/opencl/kernels/gp_matern52_cov.hpp>
+#include <stan/math/opencl/err.hpp>
+#include <CL/cl2.hpp>
+
+namespace stan {
+namespace math {
+/** \ingroup opencl
+ * Computes the Matern 5/2 covariance matrix of a single input set on the
+ * OpenCL device, using one scalar length scale.
+ *
+ * @tparam T1 Type of the input (non-scalar kernel generator expression)
+ * @tparam T2 Arithmetic type of sigma
+ * @tparam T3 Arithmetic type of length_scale
+ * @param x input vector or matrix; the result has one row per column of x
+ * @param sigma standard deviation (passed to the kernel squared)
+ * @param length_scale scalar length scale
+ * @return `x.cols()` by `x.cols()` Matern 5/2 covariance matrix
+ */
+template <typename T1, typename T2, typename T3,
+          require_all_kernel_expressions_and_none_scalar_t<T1>* = nullptr,
+          require_all_arithmetic_t<T2, T3>* = nullptr>
+inline matrix_cl<return_type_t<T1, T2, T3>> gp_matern52_cov(
+    const T1& x, const T2 sigma, const T3 length_scale) {
+  using result_t = matrix_cl<return_type_t<T1, T2, T3>>;
+  const int n = x.cols();
+  const int wg = 16;  // work-group edge length
+  const int padded = (n + wg - 1) / wg * wg;  // n rounded up to a multiple of wg
+  const auto& x_dev = x.eval();
+  result_t cov(n, n);
+  try {
+    opencl_kernels::gp_matern52_cov(
+        cl::NDRange(padded, padded), cl::NDRange(wg, wg), x_dev, cov,
+        sigma * sigma, std::sqrt(5.0) / length_scale,
+        5.0 / (3.0 * square(length_scale)), n, x.rows());
+  } catch (const cl::Error& err) {
+    check_opencl_error("gp_matern52_cov", err);
+  }
+  return cov;
+}
+
+/** \ingroup opencl
+ * Computes the Matern 5/2 cross covariance matrix between two input sets on
+ * the OpenCL device, using one scalar length scale.
+ *
+ * The cross covariance is what the posterior predictive density requires.
+ *
+ * @tparam T1 Type of the first input (non-scalar kernel generator expression)
+ * @tparam T2 Type of the second input (non-scalar kernel generator expression)
+ * @tparam T3 Arithmetic type of sigma
+ * @tparam T4 Arithmetic type of length_scale
+ * @param x first input vector or matrix
+ * @param y second input vector or matrix; must have as many rows as x
+ * @param sigma standard deviation (passed to the kernel squared)
+ * @param length_scale scalar length scale
+ *
+ * @return `x.cols()` by `y.cols()` Matern 5/2 cross covariance matrix
+ */
+template <typename T1, typename T2, typename T3, typename T4,
+          require_all_kernel_expressions_and_none_scalar_t<T1, T2>* = nullptr,
+          require_all_arithmetic_t<T3, T4>* = nullptr>
+inline matrix_cl<return_type_t<T1, T2, T3, T4>> gp_matern52_cov(
+    const T1& x, const T2& y, const T3 sigma, const T4 length_scale) {
+  check_size_match("gp_matern52_cov_cross", "x", x.rows(), "y", y.rows());
+  using result_t = matrix_cl<return_type_t<T1, T2, T3, T4>>;
+  result_t cov(x.cols(), y.cols());
+  // Evaluate the input expressions before handing them to the kernel.
+  const auto& x_dev = x.eval();
+  const auto& y_dev = y.eval();
+  // Round each global size up to a whole number of work groups.
+  const int wg = 16;
+  const int x_padded = (x.cols() + wg - 1) / wg * wg;
+  const int y_padded = (y.cols() + wg - 1) / wg * wg;
+  try {
+    opencl_kernels::gp_matern52_cov_cross(
+        cl::NDRange(x_padded, y_padded), cl::NDRange(wg, wg), x_dev, y_dev,
+        cov, sigma * sigma, std::sqrt(5.0) / length_scale,
+        5.0 / (3.0 * square(length_scale)), x.cols(), y.cols(), x.rows());
+  } catch (const cl::Error& err) {
+    check_opencl_error("gp_matern52_cov_cross", err);
+  }
+  return cov;
+}
+
+/** \ingroup opencl
+ * Matern 5/2 kernel on the GPU with non-scalar (vector) length scales.
+ *
+ * @tparam T1 Type of the matrix
+ * @tparam T2 Type of sigma
+ * @tparam T3 Type of length_scale (non-scalar kernel generator expression)
+ * @param x input vector or matrix
+ * @param sigma standard deviation
+ * @param length_scale length scales; x is divided element-wise by them
+ *
+ * @return Matern 5/2 covariance matrix
+ */
+template <typename T1, typename T2, typename T3,
+          require_all_kernel_expressions_and_none_scalar_t<T1, T3>* = nullptr,
+          require_all_arithmetic_t<T2>* = nullptr>
+inline matrix_cl<return_type_t<T1, T2, T3>> gp_matern52_cov(
+    const T1& x, const T2 sigma, const T3& length_scale) {
+  // Fold length_scale into x so the kernel only needs sqrt(5) and 5/3.
+  const auto& x_eval = elt_divide(x, rowwise_broadcast(length_scale)).eval();
+  matrix_cl<return_type_t<T1, T2, T3>> res(x.cols(), x.cols());
+  const int block_size = 16;  // work-group edge; global size rounds up to it
+  const int padded = (x.cols() + block_size - 1) / block_size * block_size;
+  try {
+    opencl_kernels::gp_matern52_cov(cl::NDRange(padded, padded),
+                                    cl::NDRange(block_size, block_size), x_eval,
+                                    res, sigma * sigma, std::sqrt(5.0),
+                                    5.0 / 3.0, x.cols(), x.rows());
+  } catch (const cl::Error& e) {
+    check_opencl_error("gp_matern52_cov", e);
+  }
+  return res;
+}
+
+/** \ingroup opencl
+ * Matern 5/2 kernel on the GPU with non-scalar (vector) length scales.
+ *
+ * This function is for the cross covariance
+ * matrix needed to compute the posterior predictive density.
+ *
+ * @tparam T1 Type of the first matrix
+ * @tparam T2 Type of the second matrix
+ * @tparam T3 Type of sigma
+ * @tparam T4 Type of length scale (non-scalar kernel generator expression)
+ * @param x first input vector or matrix
+ * @param y second input vector or matrix; must have as many rows as x
+ * @param sigma standard deviation
+ * @param length_scale length scales; x and y are divided element-wise by them
+ *
+ * @return Matern 5/2 covariance matrix
+ */
+template <
+    typename T1, typename T2, typename T3, typename T4,
+    require_all_kernel_expressions_and_none_scalar_t<T1, T2, T4>* = nullptr,
+    require_all_arithmetic_t<T3>* = nullptr>
+inline matrix_cl<return_type_t<T1, T2, T3, T4>> gp_matern52_cov(
+    const T1& x, const T2& y, const T3 sigma, const T4& length_scale) {
+  // length_scale is non-scalar: bind it by reference instead of copying it.
+  check_size_match("gp_matern52_cov_cross", "x", x.rows(), "y", y.rows());
+  matrix_cl<return_type_t<T1, T2, T3, T4>> res(x.cols(), y.cols());
+  // Pre-scale both inputs so the kernel only needs sqrt(5) and 5/3.
+  const auto& x_eval = elt_divide(x, rowwise_broadcast(length_scale)).eval();
+  const auto& y_eval = elt_divide(y, rowwise_broadcast(length_scale)).eval();
+  const int block_size = 16;  // work-group edge; global sizes round up to it
+  const int x_padded = (x.cols() + block_size - 1) / block_size * block_size;
+  const int y_padded = (y.cols() + block_size - 1) / block_size * block_size;
+  try {
+    opencl_kernels::gp_matern52_cov_cross(
+        cl::NDRange(x_padded, y_padded), cl::NDRange(block_size, block_size),
+        x_eval, y_eval, res, sigma * sigma, std::sqrt(5.0), 5.0 / 3.0,
+        x.cols(), y.cols(), x.rows());
+  } catch (const cl::Error& e) {
+    check_opencl_error("gp_matern52_cov_cross", e);
+  }
+  return res;
+}
+
+}  // namespace math
+}  // namespace stan
+
+#endif
+#endif
diff --git a/test/unit/math/opencl/prim/gp_exponential_cov_test.cpp b/test/unit/math/opencl/prim/gp_exponential_cov_test.cpp
new file mode 100644
index 00000000000..d93518a39a4
--- /dev/null
+++ b/test/unit/math/opencl/prim/gp_exponential_cov_test.cpp
@@ -0,0 +1,86 @@
+#ifdef STAN_OPENCL
+#include <stan/math.hpp>
+#include <test/unit/math/opencl/util.hpp>
+#include <test/unit/util.hpp>
+#include <gtest/gtest.h>
+#include <algorithm>
+
+TEST(OpenCLPrimGpExponentialCov, exceptions) {
+  // Inputs with points of different length (3 vs 2): both calls must throw.
+  Eigen::VectorXd long_pt(3), short_pt(2);
+  long_pt << 1, 2, 3;
+  short_pt << -3, 4;
+  std::vector<Eigen::VectorXd> pts1{long_pt, long_pt};
+  std::vector<Eigen::VectorXd> pts2{short_pt, short_pt};
+  stan::math::matrix_cl<double> x1_cl(pts1);
+  stan::math::matrix_cl<double> x2_cl(pts2);
+
+  double sigma = 1.3;
+  double ls_scalar = 1.4;
+  std::vector<double> ls_vec = {1.2, 0.7, 2.3};
+  stan::math::matrix_cl<double> ls_vec_cl(ls_vec);
+
+  EXPECT_THROW(stan::math::gp_exponential_cov(x1_cl, x2_cl, sigma, ls_scalar),
+               std::invalid_argument);
+  EXPECT_THROW(stan::math::gp_exponential_cov(x1_cl, x2_cl, sigma, ls_vec_cl),
+               std::invalid_argument);
+}
+
+auto gp_exponential_cov1 = [](const auto& x, const auto sigma, const auto& l) {
+  return stan::math::gp_exponential_cov(x, sigma, l);  // single-input overload
+};
+auto gp_exponential_cov2  // cross overload; inputs by reference to avoid copies
+    = [](const auto& x1, const auto& x2, const auto sigma, const auto& l) {
+        return stan::math::gp_exponential_cov(x1, x2, sigma, l);
+      };
+
+TEST(OpenCLPrimGpExponentialCov, small) {
+  using stan::math::test::compare_cpu_opencl_prim;
+
+  Eigen::VectorXd p(3), q(3), r(3), s(3);
+  p << 1, 2, 3;
+  q << -3, 4, -1;
+  r << 4, -5, 3;
+  s << -4, 5, 5;
+  // Three points in the first set, four (with repeats) in the second.
+  std::vector<Eigen::VectorXd> pts1{p, q, r};
+  std::vector<Eigen::VectorXd> pts2{r, s, s, s};
+
+  double sigma = 1.3;
+  double ls_scalar = 1.4;
+  std::vector<double> ls_vec = {1.2, 0.7, 2.3};
+
+  // Scalar length scale: single-input and cross overloads.
+  compare_cpu_opencl_prim(gp_exponential_cov1, pts1, sigma, ls_scalar);
+  compare_cpu_opencl_prim(gp_exponential_cov2, pts1, pts2, sigma, ls_scalar);
+  // Vector length scale: single-input and cross overloads.
+  compare_cpu_opencl_prim(gp_exponential_cov1, pts1, sigma, ls_vec);
+  compare_cpu_opencl_prim(gp_exponential_cov2, pts1, pts2, sigma, ls_vec);
+}
+
+TEST(OpenCLPrimGpExponentialCov, large) {
+  int N1 = 67;
+  int N2 = 73;
+  std::vector<Eigen::VectorXd> x1;
+  std::vector<double> l2;
+  for (int i = 0; i < N1; i++) {
+    x1.push_back(Eigen::VectorXd::Random(N1));
+    // std::abs keeps the double overload; unqualified abs may truncate to int.
+    l2.push_back(std::abs(Eigen::VectorXd::Random(1)[0]));
+  }
+  std::vector<Eigen::VectorXd> x2;
+  for (int i = 0; i < N2; i++) {
+    x2.push_back(Eigen::VectorXd::Random(N1));
+  }
+  double sigma = 1.3;
+  double l1 = 1.4;
+
+  stan::math::test::compare_cpu_opencl_prim(gp_exponential_cov1, x1, sigma, l1);
+  stan::math::test::compare_cpu_opencl_prim(gp_exponential_cov2, x1, x2, sigma,
+                                            l1);
+  stan::math::test::compare_cpu_opencl_prim(gp_exponential_cov1, x1, sigma, l2);
+  stan::math::test::compare_cpu_opencl_prim(gp_exponential_cov2, x1, x2, sigma,
+                                            l2);
+}
+
+#endif
diff --git a/test/unit/math/opencl/prim/gp_matern32_cov_test.cpp b/test/unit/math/opencl/prim/gp_matern32_cov_test.cpp
new file mode 100644
index 00000000000..79cfe1aa065
--- /dev/null
+++ b/test/unit/math/opencl/prim/gp_matern32_cov_test.cpp
@@ -0,0 +1,86 @@
+#ifdef STAN_OPENCL
+#include <stan/math.hpp>
+#include <test/unit/math/opencl/util.hpp>
+#include <test/unit/util.hpp>
+#include <gtest/gtest.h>
+#include <algorithm>
+
+TEST(OpenCLPrimGpMatern32Cov, exceptions) {
+  // Inputs with points of different length (3 vs 2): both calls must throw.
+  Eigen::VectorXd long_pt(3), short_pt(2);
+  long_pt << 1, 2, 3;
+  short_pt << -3, 4;
+  std::vector<Eigen::VectorXd> pts1{long_pt, long_pt};
+  std::vector<Eigen::VectorXd> pts2{short_pt, short_pt};
+  stan::math::matrix_cl<double> x1_cl(pts1);
+  stan::math::matrix_cl<double> x2_cl(pts2);
+
+  double sigma = 1.3;
+  double ls_scalar = 1.4;
+  std::vector<double> ls_vec = {1.2, 0.7, 2.3};
+  stan::math::matrix_cl<double> ls_vec_cl(ls_vec);
+
+  EXPECT_THROW(stan::math::gp_matern32_cov(x1_cl, x2_cl, sigma, ls_scalar),
+               std::invalid_argument);
+  EXPECT_THROW(stan::math::gp_matern32_cov(x1_cl, x2_cl, sigma, ls_vec_cl),
+               std::invalid_argument);
+}
+
+auto gp_matern32_cov1 = [](const auto& x, const auto sigma, const auto& l) {
+  return stan::math::gp_matern32_cov(x, sigma, l);  // single-input overload
+};
+auto gp_matern32_cov2  // cross overload; inputs by reference to avoid copies
+    = [](const auto& x1, const auto& x2, const auto sigma, const auto& l) {
+        return stan::math::gp_matern32_cov(x1, x2, sigma, l);
+      };
+
+TEST(OpenCLPrimGpMatern32Cov, small) {
+  using stan::math::test::compare_cpu_opencl_prim;
+
+  Eigen::VectorXd p(3), q(3), r(3), s(3);
+  p << 1, 2, 3;
+  q << -3, 4, -1;
+  r << 4, -5, 3;
+  s << -4, 5, 5;
+  // Three points in the first set, four (with repeats) in the second.
+  std::vector<Eigen::VectorXd> pts1{p, q, r};
+  std::vector<Eigen::VectorXd> pts2{r, s, s, s};
+
+  double sigma = 1.3;
+  double ls_scalar = 1.4;
+  std::vector<double> ls_vec = {1.2, 0.7, 2.3};
+
+  // Scalar length scale: single-input and cross overloads.
+  compare_cpu_opencl_prim(gp_matern32_cov1, pts1, sigma, ls_scalar);
+  compare_cpu_opencl_prim(gp_matern32_cov2, pts1, pts2, sigma, ls_scalar);
+  // Vector length scale: single-input and cross overloads.
+  compare_cpu_opencl_prim(gp_matern32_cov1, pts1, sigma, ls_vec);
+  compare_cpu_opencl_prim(gp_matern32_cov2, pts1, pts2, sigma, ls_vec);
+}
+
+TEST(OpenCLPrimGpMatern32Cov, large) {
+  int N1 = 67;
+  int N2 = 73;
+  std::vector<Eigen::VectorXd> x1;
+  std::vector<double> l2;
+  for (int i = 0; i < N1; i++) {
+    x1.push_back(Eigen::VectorXd::Random(N1));
+    // std::abs keeps the double overload; unqualified abs may truncate to int.
+    l2.push_back(std::abs(Eigen::VectorXd::Random(1)[0]));
+  }
+  std::vector<Eigen::VectorXd> x2;
+  for (int i = 0; i < N2; i++) {
+    x2.push_back(Eigen::VectorXd::Random(N1));
+  }
+  double sigma = 1.3;
+  double l1 = 1.4;
+
+  stan::math::test::compare_cpu_opencl_prim(gp_matern32_cov1, x1, sigma, l1);
+  stan::math::test::compare_cpu_opencl_prim(gp_matern32_cov2, x1, x2, sigma,
+                                            l1);
+  stan::math::test::compare_cpu_opencl_prim(gp_matern32_cov1, x1, sigma, l2);
+  stan::math::test::compare_cpu_opencl_prim(gp_matern32_cov2, x1, x2, sigma,
+                                            l2);
+}
+
+#endif
diff --git a/test/unit/math/opencl/prim/gp_matern52_cov_test.cpp b/test/unit/math/opencl/prim/gp_matern52_cov_test.cpp
new file mode 100644
index 00000000000..3441c0824b1
--- /dev/null
+++ b/test/unit/math/opencl/prim/gp_matern52_cov_test.cpp
@@ -0,0 +1,86 @@
+#ifdef STAN_OPENCL
+#include <stan/math.hpp>
+#include <test/unit/math/opencl/util.hpp>
+#include <test/unit/util.hpp>
+#include <gtest/gtest.h>
+#include <algorithm>
+
+TEST(OpenCLPrimGpMatern52Cov, exceptions) {
+  // Inputs with points of different length (3 vs 2): both calls must throw.
+  Eigen::VectorXd long_pt(3), short_pt(2);
+  long_pt << 1, 2, 3;
+  short_pt << -3, 4;
+  std::vector<Eigen::VectorXd> pts1{long_pt, long_pt};
+  std::vector<Eigen::VectorXd> pts2{short_pt, short_pt};
+  stan::math::matrix_cl<double> x1_cl(pts1);
+  stan::math::matrix_cl<double> x2_cl(pts2);
+
+  double sigma = 1.3;
+  double ls_scalar = 1.4;
+  std::vector<double> ls_vec = {1.2, 0.7, 2.3};
+  stan::math::matrix_cl<double> ls_vec_cl(ls_vec);
+
+  EXPECT_THROW(stan::math::gp_matern52_cov(x1_cl, x2_cl, sigma, ls_scalar),
+               std::invalid_argument);
+  EXPECT_THROW(stan::math::gp_matern52_cov(x1_cl, x2_cl, sigma, ls_vec_cl),
+               std::invalid_argument);
+}
+
+auto gp_matern52_cov1 = [](const auto& x, const auto sigma, const auto& l) {
+  return stan::math::gp_matern52_cov(x, sigma, l);  // single-input overload
+};
+auto gp_matern52_cov2  // cross overload; inputs by reference to avoid copies
+    = [](const auto& x1, const auto& x2, const auto sigma, const auto& l) {
+        return stan::math::gp_matern52_cov(x1, x2, sigma, l);
+      };
+
+TEST(OpenCLPrimGpMatern52Cov, small) {
+  using stan::math::test::compare_cpu_opencl_prim;
+
+  Eigen::VectorXd p(3), q(3), r(3), s(3);
+  p << 1, 2, 3;
+  q << -3, 4, -1;
+  r << 4, -5, 3;
+  s << -4, 5, 5;
+  // Three points in the first set, four (with repeats) in the second.
+  std::vector<Eigen::VectorXd> pts1{p, q, r};
+  std::vector<Eigen::VectorXd> pts2{r, s, s, s};
+
+  double sigma = 1.3;
+  double ls_scalar = 1.4;
+  std::vector<double> ls_vec = {1.2, 0.7, 2.3};
+
+  // Scalar length scale: single-input and cross overloads.
+  compare_cpu_opencl_prim(gp_matern52_cov1, pts1, sigma, ls_scalar);
+  compare_cpu_opencl_prim(gp_matern52_cov2, pts1, pts2, sigma, ls_scalar);
+  // Vector length scale: single-input and cross overloads.
+  compare_cpu_opencl_prim(gp_matern52_cov1, pts1, sigma, ls_vec);
+  compare_cpu_opencl_prim(gp_matern52_cov2, pts1, pts2, sigma, ls_vec);
+}
+
+TEST(OpenCLPrimGpMatern52Cov, large) {
+  int N1 = 67;
+  int N2 = 73;
+  std::vector<Eigen::VectorXd> x1;
+  std::vector<double> l2;
+  for (int i = 0; i < N1; i++) {
+    x1.push_back(Eigen::VectorXd::Random(N1));
+    // std::abs keeps the double overload; unqualified abs may truncate to int.
+    l2.push_back(std::abs(Eigen::VectorXd::Random(1)[0]));
+  }
+  std::vector<Eigen::VectorXd> x2;
+  for (int i = 0; i < N2; i++) {
+    x2.push_back(Eigen::VectorXd::Random(N1));
+  }
+  double sigma = 1.3;
+  double l1 = 1.4;
+
+  stan::math::test::compare_cpu_opencl_prim(gp_matern52_cov1, x1, sigma, l1);
+  stan::math::test::compare_cpu_opencl_prim(gp_matern52_cov2, x1, x2, sigma,
+                                            l1);
+  stan::math::test::compare_cpu_opencl_prim(gp_matern52_cov1, x1, sigma, l2);
+  stan::math::test::compare_cpu_opencl_prim(gp_matern52_cov2, x1, x2, sigma,
+                                            l2);
+}
+
+#endif
diff --git a/test/unit/math/opencl/rev/gp_dot_product_cov_test.cpp b/test/unit/math/opencl/rev/gp_dot_product_cov_test.cpp
new file mode 100644
index 00000000000..af9a42b231d
--- /dev/null
+++ b/test/unit/math/opencl/rev/gp_dot_product_cov_test.cpp
@@ -0,0 +1,77 @@
+#ifdef STAN_OPENCL
+#include <stan/math.hpp>
+#include <test/unit/math/opencl/util.hpp>
+#include <test/unit/util.hpp>
+#include <gtest/gtest.h>
+#include <algorithm>
+
+TEST(OpenCLRevGpDotProdCov, exceptions) {  // NaN input / negative sigma throw
+  Eigen::VectorXd a(3);
+  a << 1, 2, 3;
+  Eigen::VectorXd b(3);
+  b << -3, 4, 3;
+  Eigen::VectorXd c(3);
+  c << -3, 4, NAN;  // NaN entry: used to build the invalid input below
+  std::vector<Eigen::VectorXd> x{a, a, b};      // all entries finite
+  std::vector<Eigen::VectorXd> x_val{a, c, b};  // contains the NaN vector c
+  stan::math::matrix_cl<double> x_cl(x);
+  stan::math::matrix_cl<double> x_val_cl(x_val);
+
+  stan::math::var sigma = 1.3;
+  stan::math::var sigma_val = -1.3;
+
+  EXPECT_NO_THROW(stan::math::gp_dot_prod_cov(x_cl, sigma));
+  EXPECT_THROW(stan::math::gp_dot_prod_cov(x_val_cl, sigma),
+               std::domain_error);
+  EXPECT_THROW(stan::math::gp_dot_prod_cov(x_cl, sigma_val),
+               std::domain_error);
+}
+
+auto gp_dot_prod_cov_functor = [](const auto& x, const auto sigma) {
+  return stan::math::gp_dot_prod_cov(x, sigma);
+};
+auto gp_dot_prod_cov_functor2 = [](const auto& x, const auto& y, const auto sigma) {
+  return stan::math::gp_dot_prod_cov(x, y, sigma);
+};
+
+TEST(OpenCLRevGpDotProdCov, small) {
+  Eigen::VectorXd a(3);
+  a << 1, 2, 3;
+  Eigen::VectorXd b(3);
+  b << -3, 4, -1;
+  Eigen::VectorXd c(3);
+  c << 4, -5, 3;
+  Eigen::VectorXd d(3);
+  d << -4, 5, 5;
+  std::vector<Eigen::VectorXd> x{a, b};        // two points
+  std::vector<Eigen::VectorXd> y{b, c, d, d};  // four points, d repeated
+
+  double sigma = 1.3;
+
+  stan::math::test::compare_cpu_opencl_prim_rev(gp_dot_prod_cov_functor, x,
+                                                sigma);  // one-input overload
+  stan::math::test::compare_cpu_opencl_prim_rev(gp_dot_prod_cov_functor2, x, y,
+                                                sigma);  // two-input overload
+}
+
+TEST(OpenCLRevGpDotProdCov, large) {
+  using stan::math::test::compare_cpu_opencl_prim_rev;
+  const int n_x = 67;  // x: n_x points, each of length n_x
+  const int n_y = 73;  // y: n_y points, each of length n_x
+  // Fill both input sets with random points (x first, then y).
+  std::vector<Eigen::VectorXd> x(n_x), y(n_y);
+  for (auto& point : x) {
+    point = Eigen::VectorXd::Random(n_x);
+  }
+  for (auto& point : y) {
+    point = Eigen::VectorXd::Random(n_x);
+  }
+
+  double sigma = 1.3;
+
+  // Exercise the one-input and the two-input overloads.
+  compare_cpu_opencl_prim_rev(gp_dot_prod_cov_functor, x, sigma);
+  compare_cpu_opencl_prim_rev(gp_dot_prod_cov_functor2, x, y, sigma);
+}
+
+#endif

From 2a92d6ab4956f0200c04410759b9f4d009cafb08 Mon Sep 17 00:00:00 2001
From: Stan Jenkins <mc.stanislaw@gmail.com>
Date: Thu, 15 Apr 2021 12:29:58 +0000
Subject: [PATCH 2/2] [Jenkins] auto-formatting by clang-format version
 6.0.0-1ubuntu2~16.04.1 (tags/RELEASE_600/final)

---
 .../opencl/kernels/gp_exponential_cov.hpp     | 21 +++++++++----------
 stan/math/opencl/prim/gp_dot_prod_cov.hpp     |  6 ++----
 stan/math/opencl/prim/gp_exponential_cov.hpp  | 14 ++++++-------
 stan/math/opencl/prim/gp_matern32_cov.hpp     |  4 ++--
 .../opencl/rev/gp_dot_product_cov_test.cpp    | 13 ++++++------
 5 files changed, 27 insertions(+), 31 deletions(-)

diff --git a/stan/math/opencl/kernels/gp_exponential_cov.hpp b/stan/math/opencl/kernels/gp_exponential_cov.hpp
index c1a836dec05..73afa4e8922 100644
--- a/stan/math/opencl/kernels/gp_exponential_cov.hpp
+++ b/stan/math/opencl/kernels/gp_exponential_cov.hpp
@@ -21,10 +21,9 @@ static const std::string gp_exponential_cov_kernel_code = STRINGIFY(
      * @param size number of elements in x
      * @param element_size the number of doubles that make one element of x
      */
-    __kernel void gp_exponential_cov(const __global double* x,
-                                  __global double* res, const double sigma_sq,
-                                  const double neg_inv_l,
-                                  const int size, const int element_size) {
+    __kernel void gp_exponential_cov(
+        const __global double* x, __global double* res, const double sigma_sq,
+        const double neg_inv_l, const int size, const int element_size) {
       const int i = get_global_id(0);
       const int j = get_global_id(1);
       if (i < size && j < size) {
@@ -47,7 +46,8 @@ static const std::string gp_exponential_cov_kernel_code = STRINGIFY(
 // \endcond
 
 /** \ingroup opencl_kernels
- * See the docs for \link kernels/gp_exponential_cov.hpp gp_exponential_cov() \endlink
+ * See the docs for \link kernels/gp_exponential_cov.hpp gp_exponential_cov()
+ * \endlink
  */
 const kernel_cl<in_buffer, out_buffer, double, double, int, int>
     gp_exponential_cov("gp_exponential_cov", {gp_exponential_cov_kernel_code});
@@ -73,9 +73,8 @@ static const std::string gp_exponential_cov_cross_kernel_code = STRINGIFY(
      */
     __kernel void gp_exponential_cov_cross(
         const __global double* x1, const __global double* x2,
-        __global double* res, const double sigma_sq,
-        const double neg_inv_l, const int size1, const int size2,
-        const int element_size) {
+        __global double* res, const double sigma_sq, const double neg_inv_l,
+        const int size1, const int size2, const int element_size) {
       const int i = get_global_id(0);
       const int j = get_global_id(1);
       if (i < size1 && j < size2) {
@@ -92,12 +91,12 @@ static const std::string gp_exponential_cov_cross_kernel_code = STRINGIFY(
 // \endcond
 
 /** \ingroup opencl_kernels
- * See the docs for \link kernels/gp_exponential_cov.hpp gp_exponential_cov_cross()
- * \endlink
+ * See the docs for \link kernels/gp_exponential_cov.hpp
+ * gp_exponential_cov_cross() \endlink
  */
 const kernel_cl<in_buffer, in_buffer, out_buffer, double, double, int, int, int>
     gp_exponential_cov_cross("gp_exponential_cov_cross",
-                          {gp_exponential_cov_cross_kernel_code});
+                             {gp_exponential_cov_cross_kernel_code});
 
 }  // namespace opencl_kernels
 }  // namespace math
diff --git a/stan/math/opencl/prim/gp_dot_prod_cov.hpp b/stan/math/opencl/prim/gp_dot_prod_cov.hpp
index 471665c06fc..5d7f4bc3f02 100644
--- a/stan/math/opencl/prim/gp_dot_prod_cov.hpp
+++ b/stan/math/opencl/prim/gp_dot_prod_cov.hpp
@@ -24,8 +24,7 @@ namespace math {
 template <typename T_x, typename T_sigma,
           require_all_prim_or_rev_kernel_expression_t<T_x>* = nullptr,
           require_stan_scalar_t<T_sigma>* = nullptr>
-inline auto gp_dot_prod_cov(const T_x& x,
-                               const T_sigma sigma) {
+inline auto gp_dot_prod_cov(const T_x& x, const T_sigma sigma) {
   const char* fun = "gp_dot_prod_cov(OpenCL)";
   check_nonnegative(fun, "sigma", sigma);
   check_finite(fun, "sigma", sigma);
@@ -48,8 +47,7 @@ inline auto gp_dot_prod_cov(const T_x& x,
 template <typename T_x, typename T_y, typename T_sigma,
           require_all_prim_or_rev_kernel_expression_t<T_x, T_y>* = nullptr,
           require_stan_scalar_t<T_sigma>* = nullptr>
-inline auto gp_dot_prod_cov(const T_x& x, const T_y& y,
-                               const T_sigma sigma) {
+inline auto gp_dot_prod_cov(const T_x& x, const T_y& y, const T_sigma sigma) {
   const char* fun = "gp_dot_prod_cov(OpenCL)";
   check_nonnegative(fun, "sigma", sigma);
   check_finite(fun, "sigma", sigma);
diff --git a/stan/math/opencl/prim/gp_exponential_cov.hpp b/stan/math/opencl/prim/gp_exponential_cov.hpp
index aa54f924a24..8407df1231e 100644
--- a/stan/math/opencl/prim/gp_exponential_cov.hpp
+++ b/stan/math/opencl/prim/gp_exponential_cov.hpp
@@ -34,10 +34,10 @@ inline matrix_cl<return_type_t<T1, T2, T3>> gp_exponential_cov(
   int n_blocks = (x.cols() + block_size - 1) / block_size;
   int blocked_size = block_size * n_blocks;
   try {
-    opencl_kernels::gp_exponential_cov(
-        cl::NDRange(blocked_size, blocked_size),
-        cl::NDRange(block_size, block_size), x_eval, res, sigma * sigma,
-        -1.0 / length_scale, x.cols(), x.rows());
+    opencl_kernels::gp_exponential_cov(cl::NDRange(blocked_size, blocked_size),
+                                       cl::NDRange(block_size, block_size),
+                                       x_eval, res, sigma * sigma,
+                                       -1.0 / length_scale, x.cols(), x.rows());
   } catch (const cl::Error& e) {
     check_opencl_error("gp_exponential_cov", e);
   }
@@ -110,9 +110,9 @@ inline matrix_cl<return_type_t<T1, T2, T3>> gp_exponential_cov(
   int blocked_size = block_size * n_blocks;
   try {
     opencl_kernels::gp_exponential_cov(cl::NDRange(blocked_size, blocked_size),
-                                    cl::NDRange(block_size, block_size), x_eval,
-                                    res, sigma * sigma, -1.0, x.cols(),
-                                    x.rows());
+                                       cl::NDRange(block_size, block_size),
+                                       x_eval, res, sigma * sigma, -1.0,
+                                       x.cols(), x.rows());
   } catch (const cl::Error& e) {
     check_opencl_error("gp_exponential_cov", e);
   }
diff --git a/stan/math/opencl/prim/gp_matern32_cov.hpp b/stan/math/opencl/prim/gp_matern32_cov.hpp
index 19afc632d26..11ae25a0b21 100644
--- a/stan/math/opencl/prim/gp_matern32_cov.hpp
+++ b/stan/math/opencl/prim/gp_matern32_cov.hpp
@@ -111,8 +111,8 @@ inline matrix_cl<return_type_t<T1, T2, T3>> gp_matern32_cov(
   try {
     opencl_kernels::gp_matern32_cov(cl::NDRange(blocked_size, blocked_size),
                                     cl::NDRange(block_size, block_size), x_eval,
-                                    res, sigma * sigma, std::sqrt(3.0), x.cols(),
-                                    x.rows());
+                                    res, sigma * sigma, std::sqrt(3.0),
+                                    x.cols(), x.rows());
   } catch (const cl::Error& e) {
     check_opencl_error("gp_matern32_cov", e);
   }
diff --git a/test/unit/math/opencl/rev/gp_dot_product_cov_test.cpp b/test/unit/math/opencl/rev/gp_dot_product_cov_test.cpp
index af9a42b231d..673012f2097 100644
--- a/test/unit/math/opencl/rev/gp_dot_product_cov_test.cpp
+++ b/test/unit/math/opencl/rev/gp_dot_product_cov_test.cpp
@@ -21,18 +21,17 @@ TEST(OpenCLRevGpDotProdCov, exceptions) {
   stan::math::var sigma_val = -1.3;
 
   EXPECT_NO_THROW(stan::math::gp_dot_prod_cov(x_cl, sigma));
-  EXPECT_THROW(stan::math::gp_dot_prod_cov(x_val_cl, sigma),
-               std::domain_error);
-  EXPECT_THROW(stan::math::gp_dot_prod_cov(x_cl, sigma_val),
-               std::domain_error);
+  EXPECT_THROW(stan::math::gp_dot_prod_cov(x_val_cl, sigma), std::domain_error);
+  EXPECT_THROW(stan::math::gp_dot_prod_cov(x_cl, sigma_val), std::domain_error);
 }
 
 auto gp_dot_prod_cov_functor = [](const auto& x, const auto sigma) {
   return stan::math::gp_dot_prod_cov(x, sigma);
 };
-auto gp_dot_prod_cov_functor2 = [](const auto& x, const auto& y, const auto sigma) {
-  return stan::math::gp_dot_prod_cov(x, y, sigma);
-};
+auto gp_dot_prod_cov_functor2
+    = [](const auto& x, const auto& y, const auto sigma) {
+        return stan::math::gp_dot_prod_cov(x, y, sigma);
+      };
 
 TEST(OpenCLRevGpDotProdCov, small) {
   Eigen::VectorXd a(3);