diff --git a/Jenkinsfile b/Jenkinsfile
index cb3217f4025..163e8def66a 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -150,7 +150,7 @@ pipeline {
         stage('Verify changes') {
             agent { label 'linux' }
             steps {
-                script {         
+                script {
 
                     retry(3) { checkout scm }
                     sh 'git clean -xffd'
@@ -240,6 +240,7 @@ pipeline {
                     steps {
                         deleteDir()
                         unstash 'MathSetup'
+                        sh "export STAN_NUM_THREADS=2"
                         sh "echo CXX=${env.CXX} -Werror > make/local"
                         sh "echo CPPFLAGS+=-DSTAN_THREADS >> make/local"
                         runTests("test/unit -f thread")
@@ -263,6 +264,7 @@ pipeline {
                     steps {
                         deleteDirWin()
                         unstash 'MathSetup'
+                        bat "setx STAN_NUM_THREADS 2"
                         bat "echo CXX=${env.CXX} -Werror > make/local"
                         bat "echo CXXFLAGS+=-DSTAN_THREADS >> make/local"
                         runTestsWin("test/unit -f thread")
@@ -272,7 +274,7 @@ pipeline {
             }
         }
         stage('Additional merge tests') {
-            when { 
+            when {
                 allOf {
                     anyOf {
                         branch 'develop'
@@ -309,12 +311,12 @@ pipeline {
             }
         }
         stage('Upstream tests') {
-            when { 
+            when {
                 allOf {
-                    expression { 
-                        env.BRANCH_NAME ==~ /PR-\d+/ 
+                    expression {
+                        env.BRANCH_NAME ==~ /PR-\d+/
                     }
-                    expression { 
+                    expression {
                         !skipRemainingStages
                     }
                 }
diff --git a/make/compiler_flags b/make/compiler_flags
index 61adb29ddab..d2b13b975f4 100644
--- a/make/compiler_flags
+++ b/make/compiler_flags
@@ -271,7 +271,7 @@ print-compiler-flags:
 	@echo '  - SUNDIALS                    ' $(SUNDIALS)
 	@echo '  - TBB                         ' $(TBB)
 	@echo '  - GTEST                       ' $(GTEST)
-	@echo '  - STAN_THREADS                ' $(STAN_THREADS) 
+	@echo '  - STAN_THREADS                ' $(STAN_THREADS)
 	@echo '  - STAN_OPENCL                 ' $(STAN_OPENCL)
 	@echo '  - STAN_MPI                    ' $(STAN_MPI)
 	@echo '  Compiler flags (each can be overriden separately):'
diff --git a/stan/math/prim/functor.hpp b/stan/math/prim/functor.hpp
index bfa72de41af..a029e13f4bb 100644
--- a/stan/math/prim/functor.hpp
+++ b/stan/math/prim/functor.hpp
@@ -18,5 +18,7 @@
 #include <stan/math/prim/functor/mpi_cluster.hpp>
 #include <stan/math/prim/functor/mpi_command.hpp>
 #include <stan/math/prim/functor/mpi_distributed_apply.hpp>
+#include <stan/math/prim/functor/reduce_sum.hpp>
+#include <stan/math/prim/functor/reduce_sum_static.hpp>
 
 #endif
diff --git a/stan/math/prim/functor/reduce_sum.hpp b/stan/math/prim/functor/reduce_sum.hpp
new file mode 100644
index 00000000000..25dc700b8eb
--- /dev/null
+++ b/stan/math/prim/functor/reduce_sum.hpp
@@ -0,0 +1,272 @@
+#ifndef STAN_MATH_PRIM_FUNCTOR_REDUCE_SUM_HPP
+#define STAN_MATH_PRIM_FUNCTOR_REDUCE_SUM_HPP
+
+#include <stan/math/prim/meta.hpp>
+
+#include <tbb/task_arena.h>
+#include <tbb/parallel_reduce.h>
+#include <tbb/blocked_range.h>
+
+#include <tuple>
+#include <vector>
+
+namespace stan {
+namespace math {
+
+namespace internal {
+
+/**
+ * reduce_sum_impl implementation for any autodiff type.
+ *
+ * @tparam ReduceFunction Type of reducer function
+ * @tparam ReturnType An arithmetic type
+ * @tparam Vec Type of sliced argument
+ * @tparam Args Types of shared arguments
+ */
+template <typename ReduceFunction, typename Enable, typename ReturnType,
+          typename Vec, typename... Args>
+struct reduce_sum_impl {
+  /**
+   * Call an instance of the function `ReduceFunction` on every element
+   *   of an input sequence and sum these terms.
+   *
+   * This specialization is not parallelized and works for any autodiff types.
+   *
+   * An instance, f, of `ReduceFunction` should have the signature:
+   *   T f(int start, int end, Vec&& vmapped_subset, std::ostream* msgs,
+   * Args&&... args)
+   *
+   * `ReduceFunction` must be default constructible without any arguments
+   *
+   * Each call to `ReduceFunction` is responsible for computing the
+   *   start through end terms (inclusive) of the overall sum. All args are
+   * passed from this function through to the `ReduceFunction` instances.
+   *   However, only the start through end  (inclusive) elements of the vmapped
+   * argument are passed to the `ReduceFunction` instances (as the
+   * `vmapped_subset` argument).
+   *
+   * If auto partitioning is true, do the calculation with one
+   *  ReduceFunction call. If false, break work into pieces strictly smaller
+   *  than grainsize.
+   *
+   * grainsize must be greater than or equal to 1
+   *
+   * @param vmapped Sliced arguments used only in some sum terms
+   * @param auto_partitioning Work partitioning style (ignored)
+   * @param grainsize Suggested grainsize for tbb
+   * @param[in, out] msgs The print stream for warning messages
+   * @param args Shared arguments used in every sum term
+   * @return Summation of all terms
+   */
+  return_type_t<Vec, Args...> operator()(Vec&& vmapped, bool auto_partitioning,
+                                         int grainsize, std::ostream* msgs,
+                                         Args&&... args) const {
+    const std::size_t num_jobs = vmapped.size();
+
+    if (num_jobs == 0) {
+      return 0.0;
+    }
+
+    if (auto_partitioning) {
+      return ReduceFunction()(0, vmapped.size() - 1, std::forward<Vec>(vmapped),
+                              msgs, std::forward<Args>(args)...);
+    } else {
+      return_type_t<Vec, Args...> sum = 0.0;
+      for (size_t i = 0; i < (vmapped.size() + grainsize - 1) / grainsize;
+           ++i) {
+        size_t start = i * grainsize;
+        size_t end = std::min((i + 1) * grainsize, vmapped.size()) - 1;
+
+        std::decay_t<Vec> sub_slice;
+        sub_slice.reserve(end - start + 1);
+        for (int i = start; i <= end; ++i) {
+          sub_slice.emplace_back(vmapped[i]);
+        }
+
+        sum += ReduceFunction()(start, end, std::forward<Vec>(sub_slice), msgs,
+                                std::forward<Args>(args)...);
+      }
+      return sum;
+    }
+  }
+};
+
+/**
+ * Specialization of reduce_sum_impl for arithmetic types
+ *
+ * @tparam ReduceFunction Type of reducer function
+ * @tparam ReturnType An arithmetic type
+ * @tparam Vec Type of sliced argument
+ * @tparam Args Types of shared arguments
+ */
+template <typename ReduceFunction, typename ReturnType, typename Vec,
+          typename... Args>
+struct reduce_sum_impl<ReduceFunction, require_arithmetic_t<ReturnType>,
+                       ReturnType, Vec, Args...> {
+  /**
+   * Internal object meeting the Imperative form requirements of
+   * `tbb::parallel_reduce`
+   *
+   * @note see link [here](https://tinyurl.com/vp7xw2t) for requirements.
+   */
+  struct recursive_reducer {
+    Vec vmapped_;
+    std::ostream* msgs_;
+    std::tuple<Args...> args_tuple_;
+    return_type_t<Vec, Args...> sum_{0.0};
+
+    recursive_reducer(Vec&& vmapped, std::ostream* msgs, Args&&... args)
+        : vmapped_(std::forward<Vec>(vmapped)),
+          msgs_(msgs),
+          args_tuple_(std::forward<Args>(args)...) {}
+
+    /*
+     * This is the copy operator as required for tbb::parallel_reduce
+     *   Imperative form. This requires sum_ be reset to zero.
+     */
+    recursive_reducer(recursive_reducer& other, tbb::split)
+        : vmapped_(other.vmapped_),
+          msgs_(other.msgs_),
+          args_tuple_(other.args_tuple_) {}
+
+    /**
+     * Compute the value and of `ReduceFunction` over the range defined by r
+     *   and accumulate those in member variable sum_. This function may
+     *   be called multiple times per object instantiation (so the sum_
+     *   must be accumulated, not just assigned).
+     *
+     * @param r Range over which to compute `ReduceFunction`
+     */
+    void operator()(const tbb::blocked_range<size_t>& r) {
+      if (r.empty()) {
+        return;
+      }
+
+      std::decay_t<Vec> sub_slice;
+      sub_slice.reserve(r.size());
+      for (int i = r.begin(); i < r.end(); ++i) {
+        sub_slice.emplace_back(vmapped_[i]);
+      }
+
+      sum_ += apply(
+          [&](auto&&... args) {
+            return ReduceFunction()(r.begin(), r.end() - 1, sub_slice, msgs_,
+                                    args...);
+          },
+          this->args_tuple_);
+    }
+
+    /**
+     * Join reducers. Accumuluate the value (sum_) of the other reducer.
+     *
+     * @param rhs Another partial sum
+     */
+    void join(const recursive_reducer& child) { this->sum_ += child.sum_; }
+  };
+
+  /**
+   * Call an instance of the function `ReduceFunction` on every element
+   *   of an input sequence and sum these terms.
+   *
+   * This specialization is parallelized using tbb and works only for
+   *   arithmetic types.
+   *
+   * An instance, f, of `ReduceFunction` should have the signature:
+   *   double f(int start, int end, Vec&& vmapped_subset, std::ostream* msgs,
+   * Args&&... args)
+   *
+   * `ReduceFunction` must be default constructible without any arguments
+   *
+   * Each call to `ReduceFunction` is responsible for computing the
+   *   start through end (inclusive) terms of the overall sum. All args are
+   * passed from this function through to the `ReduceFunction` instances.
+   *   However, only the start through end (inclusive) elements of the vmapped
+   * argument are passed to the `ReduceFunction` instances (as the
+   * `vmapped_subset` argument).
+   *
+   * This function distributes computation of the desired sum
+   *   over multiple threads by coordinating calls to `ReduceFunction`
+   * instances.
+   *
+   * If auto partitioning is true, break work into pieces automatically,
+   *  taking grainsize as a recommended work size (this process
+   *  is not deterministic). If false, break work deterministically
+   *  into pieces smaller than or equal to grainsize. The execution
+   *  order is non-deterministic.
+   *
+   * grainsize must be greater than or equal to 1
+   *
+   * @param vmapped Sliced arguments used only in some sum terms
+   * @param auto_partitioning Work partitioning style
+   * @param grainsize Suggested grainsize for tbb
+   * @param[in, out] msgs The print stream for warning messages
+   * @param args Shared arguments used in every sum term
+   * @return Summation of all terms
+   */
+  ReturnType operator()(Vec&& vmapped, bool auto_partitioning, int grainsize,
+                        std::ostream* msgs, Args&&... args) const {
+    const std::size_t num_jobs = vmapped.size();
+    if (num_jobs == 0) {
+      return 0.0;
+    }
+    recursive_reducer worker(std::forward<Vec>(vmapped), msgs,
+                             std::forward<Args>(args)...);
+
+    if (auto_partitioning) {
+      tbb::parallel_reduce(
+          tbb::blocked_range<std::size_t>(0, num_jobs, grainsize), worker);
+    } else {
+      tbb::simple_partitioner partitioner;
+      tbb::parallel_deterministic_reduce(
+          tbb::blocked_range<std::size_t>(0, num_jobs, grainsize), worker,
+          partitioner);
+    }
+
+    return worker.sum_;
+  }
+};
+
+}  // namespace internal
+
+/**
+ * Call an instance of the function `ReduceFunction` on every element
+ *   of an input sequence and sum these terms.
+ *
+ * This defers to reduce_sum_impl for the appropriate implementation
+ *
+ * An instance, f, of `ReduceFunction` should have the signature:
+ *   T f(int start, int end, Vec&& vmapped_subset, std::ostream* msgs, Args&&...
+ * args)
+ *
+ * `ReduceFunction` must be default constructible without any arguments
+ *
+ * grainsize must be greater than or equal to 1
+ *
+ * @tparam ReduceFunction Type of reducer function
+ * @tparam ReturnType An arithmetic type
+ * @tparam Vec Type of sliced argument
+ * @tparam Args Types of shared arguments
+ * @param vmapped Sliced arguments used only in some sum terms
+ * @param grainsize Suggested grainsize for tbb
+ * @param[in, out] msgs The print stream for warning messages
+ * @param args Shared arguments used in every sum term
+ * @return Sum of terms
+ */
+template <typename ReduceFunction, typename Vec,
+          typename = require_vector_like_t<Vec>, typename... Args>
+auto reduce_sum(Vec&& vmapped, int grainsize, std::ostream* msgs,
+                Args&&... args) {
+  using return_type = return_type_t<Vec, Args...>;
+
+  check_positive("reduce_sum", "grainsize", grainsize);
+
+  return internal::reduce_sum_impl<ReduceFunction, void, return_type, Vec,
+                                   Args...>()(std::forward<Vec>(vmapped), true,
+                                              grainsize, msgs,
+                                              std::forward<Args>(args)...);
+}
+
+}  // namespace math
+}  // namespace stan
+
+#endif
diff --git a/stan/math/prim/functor/reduce_sum_static.hpp b/stan/math/prim/functor/reduce_sum_static.hpp
new file mode 100644
index 00000000000..5b13701df93
--- /dev/null
+++ b/stan/math/prim/functor/reduce_sum_static.hpp
@@ -0,0 +1,57 @@
+#ifndef STAN_MATH_PRIM_FUNCTOR_REDUCE_SUM_STATIC_HPP
+#define STAN_MATH_PRIM_FUNCTOR_REDUCE_SUM_STATIC_HPP
+
+#include <stan/math/prim/meta.hpp>
+
+#include <tbb/task_arena.h>
+#include <tbb/parallel_reduce.h>
+#include <tbb/blocked_range.h>
+
+#include <tuple>
+#include <vector>
+
+namespace stan {
+namespace math {
+
+/**
+ * Call an instance of the function `ReduceFunction` on every element
+ *   of an input sequence and sum these terms.
+ *
+ * This defers to reduce_sum_impl for the appropriate implementation
+ *
+ * An instance, f, of `ReduceFunction` should have the signature:
+ *   T f(int start, int end, Vec&& vmapped_subset, std::ostream* msgs, Args&&...
+ * args)
+ *
+ * `ReduceFunction` must be default constructible without any arguments
+ *
+ * grainsize must be greater than or equal to 1
+ *
+ * @tparam ReduceFunction Type of reducer function
+ * @tparam ReturnType An arithmetic type
+ * @tparam Vec Type of sliced argument
+ * @tparam Args Types of shared arguments
+ * @param vmapped Sliced arguments used only in some sum terms
+ * @param grainsize Suggested grainsize for tbb
+ * @param[in, out] msgs The print stream for warning messages
+ * @param args Shared arguments used in every sum term
+ * @return Sum of terms
+ */
+template <typename ReduceFunction, typename Vec,
+          typename = require_vector_like_t<Vec>, typename... Args>
+auto reduce_sum_static(Vec&& vmapped, int grainsize, std::ostream* msgs,
+                       Args&&... args) {
+  using return_type = return_type_t<Vec, Args...>;
+
+  check_positive("reduce_sum", "grainsize", grainsize);
+
+  return internal::reduce_sum_impl<ReduceFunction, void, return_type, Vec,
+                                   Args...>()(std::forward<Vec>(vmapped), false,
+                                              grainsize, msgs,
+                                              std::forward<Args>(args)...);
+}
+
+}  // namespace math
+}  // namespace stan
+
+#endif
diff --git a/stan/math/prim/meta/broadcast_array.hpp b/stan/math/prim/meta/broadcast_array.hpp
index 3c92f4e139b..41c44b7d954 100644
--- a/stan/math/prim/meta/broadcast_array.hpp
+++ b/stan/math/prim/meta/broadcast_array.hpp
@@ -13,12 +13,14 @@ namespace internal {
 template <typename T>
 class broadcast_array {
  private:
-  T& prim_;
+  using T_decay = std::decay_t<T>;
+  T_decay& prim_;
 
  public:
   explicit broadcast_array(T& prim) : prim_(prim) {}
 
-  T& operator[](int /*i*/) { return prim_; }
+  auto& operator[](int /*i*/) { return prim_; }
+  auto& operator[](int /*i*/) const { return prim_; }
 
   /** \ingroup type_trait
    * We can assign any right hand side which allows for indexing to a
@@ -30,22 +32,33 @@ class broadcast_array {
   void operator=(const Y& m) {
     prim_ = m[0];
   }
+  broadcast_array& operator=(const broadcast_array& other) = default;
+  broadcast_array& operator=(broadcast_array&& other) = default;
 };
 
 template <typename T, typename S, typename Enable = void>
 class empty_broadcast_array {
+  using T_decay = std::decay_t<T>;
+
  public:
   empty_broadcast_array() {}
   /** \ingroup type_trait
    * Not implemented so cannot be called.
    */
-  T& operator[](int /*i*/);
+  T_decay& operator[](int /*i*/);
+  T_decay& operator[](int /*i*/) const;
 
-  /** \ingroup type_trait
+  /**
    * Not implemented so cannot be called.
    */
   template <typename Y>
   void operator=(const Y& /*A*/);
+
+  /**
+   * Not implemented so cannot be called.
+   */
+  template <typename R>
+  void operator+=(R);
 };
 
 template <typename ViewElt, typename T>
@@ -55,31 +68,31 @@ class empty_broadcast_array<ViewElt, T, require_eigen_t<T>> {
 
  public:
   empty_broadcast_array() {}
-  /** \ingroup type_trait
+  /**
    * Not implemented so cannot be called.
    */
   ViewElt& operator[](int /*i*/);
-  /** \ingroup type_trait
+  /**
    * Not implemented so cannot be called.
    */
   ViewElt& operator()(int /*i*/);
-  /** \ingroup type_trait
+  /**
    * Not implemented so cannot be called.
    */
   void operator=(const T_arg& /*A*/);
-  /** \ingroup type_trait
+  /**
    * Not implemented so cannot be called.
    */
   void operator+=(T_arg /*A*/);
-  /** \ingroup type_trait
+  /**
    * Not implemented so cannot be called.
    */
   void operator-=(T_arg /*A*/);
-  /** \ingroup type_trait
+  /**
    * Not implemented so cannot be called.
    */
   T& row(int /*i*/);
-  /** \ingroup type_trait
+  /**
    * Not implemented so cannot be called.
    */
   T& col(int /*i*/);
diff --git a/stan/math/prim/meta/operands_and_partials.hpp b/stan/math/prim/meta/operands_and_partials.hpp
index 19d4d0febab..b553b9ee0a6 100644
--- a/stan/math/prim/meta/operands_and_partials.hpp
+++ b/stan/math/prim/meta/operands_and_partials.hpp
@@ -2,7 +2,6 @@
 #define STAN_MATH_PRIM_META_OPERANDS_AND_PARTIALS_HPP
 
 #include <stan/math/prim/fun/Eigen.hpp>
-#include <stan/math/prim/meta/require_generics.hpp>
 #include <stan/math/prim/meta/broadcast_array.hpp>
 #include <stan/math/prim/meta/return_type.hpp>
 #include <vector>
diff --git a/stan/math/rev/core.hpp b/stan/math/rev/core.hpp
index 33eb934c08e..b7f1aeed314 100644
--- a/stan/math/rev/core.hpp
+++ b/stan/math/rev/core.hpp
@@ -9,6 +9,7 @@
 #include <stan/math/rev/core/count_vars.hpp>
 #include <stan/math/rev/core/init_chainablestack.hpp>
 #include <stan/math/rev/core/std_iterator_traits.hpp>
+#include <stan/math/rev/core/deep_copy_vars.hpp>
 #include <stan/math/rev/core/ddv_vari.hpp>
 #include <stan/math/rev/core/deep_copy_vars.hpp>
 #include <stan/math/rev/core/dv_vari.hpp>
@@ -48,6 +49,7 @@
 #include <stan/math/rev/core/print_stack.hpp>
 #include <stan/math/rev/core/recover_memory.hpp>
 #include <stan/math/rev/core/recover_memory_nested.hpp>
+#include <stan/math/rev/core/save_varis.hpp>
 #include <stan/math/rev/core/set_zero_all_adjoints.hpp>
 #include <stan/math/rev/core/set_zero_all_adjoints_nested.hpp>
 #include <stan/math/rev/core/start_nested.hpp>
diff --git a/stan/math/rev/functor.hpp b/stan/math/rev/functor.hpp
index 0af65fe88e5..13566d8b8bc 100644
--- a/stan/math/rev/functor.hpp
+++ b/stan/math/rev/functor.hpp
@@ -20,5 +20,6 @@
 #include <stan/math/rev/functor/kinsol_solve.hpp>
 #include <stan/math/rev/functor/map_rect_concurrent.hpp>
 #include <stan/math/rev/functor/map_rect_reduce.hpp>
+#include <stan/math/rev/functor/reduce_sum.hpp>
 
 #endif
diff --git a/stan/math/rev/functor/reduce_sum.hpp b/stan/math/rev/functor/reduce_sum.hpp
new file mode 100644
index 00000000000..f6ba97a7e42
--- /dev/null
+++ b/stan/math/rev/functor/reduce_sum.hpp
@@ -0,0 +1,243 @@
+#ifndef STAN_MATH_REV_SCAL_FUNCTOR_REDUCE_SUM_HPP
+#define STAN_MATH_REV_SCAL_FUNCTOR_REDUCE_SUM_HPP
+
+#include <stan/math/prim/meta.hpp>
+#include <stan/math/rev/core/deep_copy_vars.hpp>
+#include <stan/math/rev/core/accumulate_adjoints.hpp>
+#include <stan/math/rev/core/count_vars.hpp>
+#include <stan/math/rev/core/save_varis.hpp>
+#include <stan/math/rev/core.hpp>
+#include <tbb/task_arena.h>
+#include <tbb/parallel_reduce.h>
+#include <tbb/blocked_range.h>
+
+#include <tuple>
+#include <vector>
+
+namespace stan {
+namespace math {
+namespace internal {
+
+/**
+ * Var specialization of reduce_sum_impl
+ *
+ * @tparam ReduceFunction Type of reducer function
+ * @tparam ReturnType Must be var
+ * @tparam Vec Type of sliced argument
+ * @tparam Args Types of shared arguments
+ */
+template <typename ReduceFunction, typename ReturnType, typename Vec,
+          typename... Args>
+struct reduce_sum_impl<ReduceFunction, require_var_t<ReturnType>, ReturnType,
+                       Vec, Args...> {
+  /**
+   * Internal object meeting the Imperative form requirements of
+   * `tbb::parallel_reduce`
+   *
+   * @note see link [here](https://tinyurl.com/vp7xw2t) for requirements.
+   */
+  struct recursive_reducer {
+    size_t per_job_sliced_terms_;
+    size_t num_shared_terms_;  // Number of terms shared across threads
+    double* sliced_partials_;  // Points to adjoints of the partial calculations
+    Vec vmapped_;
+    std::ostream* msgs_;
+    std::tuple<Args...> args_tuple_;
+    double sum_{0.0};
+    Eigen::VectorXd args_adjoints_{0};
+
+    template <typename VecT, typename... ArgsT>
+    recursive_reducer(size_t per_job_sliced_terms, size_t num_shared_terms,
+                      double* sliced_partials, VecT&& vmapped,
+                      std::ostream* msgs, ArgsT&&... args)
+        : per_job_sliced_terms_(per_job_sliced_terms),
+          num_shared_terms_(num_shared_terms),
+          sliced_partials_(sliced_partials),
+          vmapped_(std::forward<VecT>(vmapped)),
+          msgs_(msgs),
+          args_tuple_(std::forward<ArgsT>(args)...) {}
+
+    /*
+     * This is the copy operator as required for tbb::parallel_reduce
+     *   Imperative form. This requires the reduced values (sum_ and
+     * arg_adjoints_) be reset to zero.
+     */
+    recursive_reducer(recursive_reducer& other, tbb::split)
+        : per_job_sliced_terms_(other.per_job_sliced_terms_),
+          num_shared_terms_(other.num_shared_terms_),
+          sliced_partials_(other.sliced_partials_),
+          vmapped_(other.vmapped_),
+          msgs_(other.msgs_),
+          args_tuple_(other.args_tuple_) {}
+
+    /**
+     * Compute, using nested autodiff, the value and Jacobian of
+     * `ReduceFunction` called over the range defined by r and accumulate those
+     * in member variable sum_ (for the value) and args_adjoints_ (for the
+     * Jacobian). This function may be called multiple times per object
+     * instantiation (so the sum_ and args_adjoints_ must be accumulated, not
+     * just assigned).
+     *
+     * @param r Range over which to compute reduce_sum
+     */
+    inline void operator()(const tbb::blocked_range<size_t>& r) {
+      if (r.empty()) {
+        return;
+      }
+
+      if (args_adjoints_.size() == 0) {
+        args_adjoints_ = Eigen::VectorXd::Zero(this->num_shared_terms_);
+      }
+
+      // Initialize nested autodiff stack
+      const nested_rev_autodiff begin_nest;
+
+      // Create nested autodiff copies of sliced argument that do not point
+      //   back to main autodiff stack
+      std::decay_t<Vec> local_sub_slice;
+      local_sub_slice.reserve(r.size());
+      for (int i = r.begin(); i < r.end(); ++i) {
+        local_sub_slice.emplace_back(deep_copy_vars(vmapped_[i]));
+      }
+
+      // Create nested autodiff copies of all shared arguments that do not point
+      //   back to main autodiff stack
+      auto args_tuple_local_copy = apply(
+          [&](auto&&... args) {
+            return std::tuple<decltype(deep_copy_vars(args))...>(
+                deep_copy_vars(args)...);
+          },
+          this->args_tuple_);
+
+      // Perform calculation
+      var sub_sum_v = apply(
+          [&](auto&&... args) {
+            return ReduceFunction()(r.begin(), r.end() - 1, local_sub_slice,
+                                    this->msgs_, args...);
+          },
+          args_tuple_local_copy);
+
+      // Compute Jacobian
+      sub_sum_v.grad();
+
+      // Accumulate value of reduce_sum
+      sum_ += sub_sum_v.val();
+
+      // Accumulate adjoints of sliced_arguments
+      accumulate_adjoints(
+          this->sliced_partials_ + r.begin() * per_job_sliced_terms_,
+          local_sub_slice);
+
+      // Accumulate adjoints of shared_arguments
+      apply(
+          [&](auto&&... args) {
+            accumulate_adjoints(args_adjoints_.data(),
+                                std::forward<decltype(args)>(args)...);
+          },
+          std::move(args_tuple_local_copy));
+    }
+
+    /**
+     * Join reducers. Accumuluate the value (sum_) and Jacobian (arg_adoints_)
+     *   of the other reducer.
+     *
+     * @param rhs Another partial sum
+     */
+    void join(const recursive_reducer& rhs) {
+      this->sum_ += rhs.sum_;
+      if (this->args_adjoints_.size() != 0 && rhs.args_adjoints_.size() != 0) {
+        this->args_adjoints_ += rhs.args_adjoints_;
+      } else if (this->args_adjoints_.size() == 0
+                 && rhs.args_adjoints_.size() != 0) {
+        this->args_adjoints_ = rhs.args_adjoints_;
+      }
+    }
+  };
+
+  /**
+   * Call an instance of the function `ReduceFunction` on every element
+   *   of an input sequence and sum these terms.
+   *
+   * This specialization is parallelized using tbb and works for reverse
+   *   mode autodiff.
+   *
+   * An instance, f, of `ReduceFunction` should have the signature:
+   *   var f(int start, int end, Vec&& vmapped_subset, std::ostream* msgs,
+   * Args&&... args)
+   *
+   * `ReduceFunction` must be default constructible without any arguments
+   *
+   * Each call to `ReduceFunction` is responsible for computing the
+   *   start through end (inclusive) terms of the overall sum. All args are
+   * passed from this function through to the `ReduceFunction` instances.
+   *   However, only the start through end (inclusive) elements of the vmapped
+   * argument are passed to the `ReduceFunction` instances (as the
+   * `vmapped_subset` argument).
+   *
+   * This function distributes computation of the desired sum and the Jacobian
+   * of that sum over multiple threads by coordinating calls to `ReduceFunction`
+   * instances. Results are stored as precomputed varis in the autodiff tree.
+   *
+   * If auto partitioning is true, break work into pieces automatically,
+   *  taking grainsize as a recommended work size (this process
+   *  is not deterministic). If false, break work deterministically
+   *  into pieces smaller than or equal to grainsize. The execution
+   *  order is non-deterministic.
+   *
+   * @param vmapped Sliced arguments used only in some sum terms
+   * @param auto_partitioning Work partitioning style
+   * @param grainsize Suggested grainsize for tbb
+   * @param[in, out] msgs The print stream for warning messages
+   * @param args Shared arguments used in every sum term
+   * @return Summation of all terms
+   */
+  inline var operator()(Vec&& vmapped, bool auto_partitioning, int grainsize,
+                        std::ostream* msgs, Args&&... args) const {
+    const std::size_t num_jobs = vmapped.size();
+
+    if (num_jobs == 0) {
+      return var(0.0);
+    }
+
+    const std::size_t per_job_sliced_terms = count_vars(vmapped[0]);
+    const std::size_t num_sliced_terms = num_jobs * per_job_sliced_terms;
+    const std::size_t num_shared_terms = count_vars(args...);
+
+    vari** varis = ChainableStack::instance_->memalloc_.alloc_array<vari*>(
+        num_sliced_terms + num_shared_terms);
+    double* partials = ChainableStack::instance_->memalloc_.alloc_array<double>(
+        num_sliced_terms + num_shared_terms);
+
+    for (size_t i = 0; i < num_sliced_terms; ++i) {
+      partials[i] = 0.0;
+    }
+    recursive_reducer worker(per_job_sliced_terms, num_shared_terms, partials,
+                             vmapped, msgs, args...);
+
+    if (auto_partitioning) {
+      tbb::parallel_reduce(
+          tbb::blocked_range<std::size_t>(0, num_jobs, grainsize), worker);
+    } else {
+      tbb::simple_partitioner partitioner;
+      tbb::parallel_deterministic_reduce(
+          tbb::blocked_range<std::size_t>(0, num_jobs, grainsize), worker,
+          partitioner);
+    }
+
+    save_varis(varis, std::forward<Vec>(vmapped));
+    save_varis(varis + num_sliced_terms, std::forward<Args>(args)...);
+
+    for (size_t i = 0; i < num_shared_terms; ++i) {
+      partials[num_sliced_terms + i] = worker.args_adjoints_(i);
+    }
+
+    return var(new precomputed_gradients_vari(
+        worker.sum_, num_sliced_terms + num_shared_terms, varis, partials));
+  }
+};
+}  // namespace internal
+
+}  // namespace math
+}  // namespace stan
+
+#endif
diff --git a/test/unit/math/mix/functor/reduce_sum_static_test.cpp b/test/unit/math/mix/functor/reduce_sum_static_test.cpp
new file mode 100644
index 00000000000..844fdc16ebb
--- /dev/null
+++ b/test/unit/math/mix/functor/reduce_sum_static_test.cpp
@@ -0,0 +1,457 @@
+#include <stan/math/prim/meta.hpp>
+#include <test/unit/math/test_ad.hpp>
+
+#include <limits>
+#include <vector>
+
+std::ostream* msgs = nullptr;
+
+template <typename T, stan::require_stan_scalar_t<T>* = nullptr>
+T sum_(T arg) {
+  return arg;
+}
+
+template <typename EigMat, stan::require_eigen_t<EigMat>* = nullptr>
+auto sum_(EigMat&& arg) {
+  return stan::math::sum(arg);
+}
+
+template <typename Vec, stan::require_std_vector_t<Vec>* = nullptr>
+auto sum_(Vec&& arg) {
+  stan::scalar_type_t<Vec> sum = 0;
+  for (size_t i = 0; i < arg.size(); ++i) {
+    sum += sum_(arg[i]);
+  }
+  return sum;
+}
+
+struct sum_lpdf {
+  template <typename T, typename... Args>
+  inline auto operator()(std::size_t start, std::size_t end, T&& sub_slice,
+                         std::ostream* msgs, Args&&... args) const {
+    using return_type = stan::return_type_t<T, Args...>;
+
+    return sum_(sub_slice)
+           + sub_slice.size()
+                 * stan::math::sum(std::vector<return_type>{
+                       return_type(sum_(std::forward<Args>(args)))...});
+  }
+};
+
+TEST(MathMix_reduce_sum_static, grainsize) {
+  auto f1 = [](auto&& data) {
+    return stan::math::reduce_sum_static<sum_lpdf>(data, 0, msgs);
+  };
+  auto f2 = [](auto&& data) {
+    return stan::math::reduce_sum_static<sum_lpdf>(data, -1, msgs);
+  };
+  auto f3 = [](auto&& data) {
+    return stan::math::reduce_sum_static<sum_lpdf>(data, 1, msgs);
+  };
+  auto f4 = [](auto&& data) {
+    return stan::math::reduce_sum_static<sum_lpdf>(data, 20, msgs);
+  };
+
+  std::vector<double> data(5, 10.0);
+
+  stan::test::expect_ad(f1, data);
+  stan::test::expect_ad(f2, data);
+  stan::test::expect_ad(f3, data);
+  stan::test::expect_ad(f4, data);
+}
+
+TEST(MathMix_reduce_sum_static, std_vector_double_slice) {
+  auto f = [](auto&& data) {
+    return stan::math::reduce_sum_static<sum_lpdf>(data, 1, msgs);
+  };
+
+  std::vector<double> data(5, 10.0);
+
+  stan::test::expect_ad(f, data);
+}
+
+TEST(MathMix_reduce_sum_static, std_vector_std_vector_double_slice) {
+  auto f = [](auto&& data) {
+    return stan::math::reduce_sum_static<sum_lpdf>(data, 1, msgs);
+  };
+
+  std::vector<std::vector<double>> data(3, std::vector<double>(2, 10.0));
+
+  stan::test::expect_ad(f, data);
+}
+
+TEST(MathMix_reduce_sum_static, std_vector_eigen_vector_double_slice) {
+  auto f = [](auto&& data) {
+    return stan::math::reduce_sum_static<sum_lpdf>(data, 1, msgs);
+  };
+
+  std::vector<Eigen::VectorXd> data(3, Eigen::VectorXd::Ones(2));
+
+  stan::test::expect_ad(f, data);
+}
+
+TEST(MathMix_reduce_sum_static, std_vector_eigen_row_vector_double_slice) {
+  auto f = [](auto&& data) {
+    return stan::math::reduce_sum_static<sum_lpdf>(data, 1, msgs);
+  };
+
+  std::vector<Eigen::RowVectorXd> data(3, Eigen::RowVectorXd::Ones(2));
+
+  stan::test::expect_ad(f, data);
+}
+
+TEST(MathMix_reduce_sum_static, std_vector_eigen_matrix_double_slice) {
+  auto f = [](auto&& data) {
+    return stan::math::reduce_sum_static<sum_lpdf>(data, 1, msgs);
+  };
+
+  std::vector<Eigen::MatrixXd> data(3, Eigen::MatrixXd::Ones(2, 4));
+
+  stan::test::expect_ad(f, data);
+}
+
+TEST(MathMix_reduce_sum_static, std_vector_std_vector_std_vector_double_slice) {
+  auto f = [](auto&& data) {
+    return stan::math::reduce_sum_static<sum_lpdf>(data, 1, msgs);
+  };
+
+  std::vector<std::vector<std::vector<double>>> data(
+      3, std::vector<std::vector<double>>(2, std::vector<double>(2, 10.0)));
+
+  stan::test::expect_ad(f, data);
+}
+
+TEST(MathMix_reduce_sum_static,
+     std_vector_std_vector_eigen_vector_double_slice) {
+  auto f = [](auto&& data) {
+    return stan::math::reduce_sum_static<sum_lpdf>(data, 1, msgs);
+  };
+
+  std::vector<std::vector<Eigen::VectorXd>> data(
+      3, std::vector<Eigen::VectorXd>(2, Eigen::VectorXd::Ones(2)));
+
+  stan::test::expect_ad(f, data);
+}
+
+TEST(MathMix_reduce_sum_static,
+     std_vector_std_vector_eigen_row_vector_double_slice) {
+  auto f = [](auto&& data) {
+    return stan::math::reduce_sum_static<sum_lpdf>(data, 1, msgs);
+  };
+
+  std::vector<std::vector<Eigen::RowVectorXd>> data(
+      3, std::vector<Eigen::RowVectorXd>(2, Eigen::RowVectorXd::Ones(2)));
+
+  stan::test::expect_ad(f, data);
+}
+
+TEST(MathMix_reduce_sum_static,
+     std_vector_std_vector_eigen_matrix_double_slice) {
+  auto f = [](auto&& data) {
+    return stan::math::reduce_sum_static<sum_lpdf>(data, 1, msgs);
+  };
+
+  std::vector<std::vector<Eigen::MatrixXd>> data(
+      3, std::vector<Eigen::MatrixXd>(2, Eigen::MatrixXd::Ones(2, 4)));
+
+  stan::test::expect_ad(f, data);
+}
+
+struct start_end_lpdf {
+  template <typename T1, typename T2>
+  inline auto operator()(std::size_t start, std::size_t end, T1&&,
+                         std::ostream* msgs, T2&& data) const {
+    stan::return_type_t<T1, T2> sum = 0;
+    EXPECT_GE(start, 0);
+    EXPECT_LE(end, data.size() - 1);
+    for (size_t i = start; i <= end; i++) {
+      sum += data[i];
+    }
+    return sum;
+  }
+};
+
+TEST(StanMath_reduce_sum_static, start_end_slice) {
+  auto start_end = [](auto&& arg) {
+    return stan::math::reduce_sum_static<start_end_lpdf>(arg, 1, msgs, arg);
+  };
+
+  std::vector<double> data(5, 1.0);
+
+  stan::test::expect_ad(start_end, data);
+}
+
+auto fi = [](auto&&... args) {
+  return stan::math::reduce_sum_static<sum_lpdf>(std::vector<int>(2, 10.0), 1,
+                                                 msgs, args...);
+};
+
+auto fd = [](auto&& data, auto&&... args) {
+  return stan::math::reduce_sum_static<sum_lpdf>(data, 1, msgs, args...);
+};
+
+TEST(MathMix_reduce_sum_static, int_arg) {
+  std::vector<double> data(2, 1.0);
+  int arg = 5.0;
+
+  stan::test::expect_ad([&](auto&& data) { return fd(data, arg); }, data);
+}
+
+TEST(MathMix_reduce_sum_static, double_arg) {
+  std::vector<double> data(2, 10.0);
+  double arg = 5.0;
+
+  stan::test::expect_ad(fi, arg);
+  stan::test::expect_ad(fd, data, arg);
+}
+
+TEST(MathMix_reduce_sum_static, std_vector_int_arg) {
+  std::vector<double> data(2, 10.0);
+  std::vector<int> arg(2, 10);
+
+  stan::test::expect_ad([&](auto&& data) { return fd(data, arg); }, data);
+}
+
+TEST(MathMix_reduce_sum_static, std_vector_double_arg) {
+  std::vector<double> data(2, 10.0);
+  std::vector<double> arg(2, 10.0);
+
+  stan::test::expect_ad(fi, arg);
+  stan::test::expect_ad(fd, data, arg);
+}
+
+TEST(MathMix_reduce_sum_static, eigen_vector_arg) {
+  std::vector<double> data(2, 10.0);
+  Eigen::VectorXd arg = Eigen::VectorXd::Ones(2);
+
+  stan::test::expect_ad(fi, arg);
+  stan::test::expect_ad(fd, data, arg);
+}
+
+TEST(MathMix_reduce_sum_static, eigen_row_vector_arg) {
+  std::vector<double> data(2, 10.0);
+  Eigen::RowVectorXd arg = Eigen::RowVectorXd::Ones(2);
+
+  stan::test::expect_ad(fi, arg);
+  stan::test::expect_ad(fd, data, arg);
+}
+
+TEST(MathMix_reduce_sum_static, eigen_matrix_arg) {
+  std::vector<double> data(2, 10.0);
+  Eigen::MatrixXd arg = Eigen::MatrixXd::Ones(2, 2);
+
+  stan::test::expect_ad(fi, arg);
+  stan::test::expect_ad(fd, data, arg);
+}
+
+TEST(MathMix_reduce_sum_static, std_vector_std_vector_double_arg) {
+  std::vector<double> data(2, 10.0);
+  std::vector<std::vector<double>> arg(2, std::vector<double>(2, 10.0));
+
+  stan::test::expect_ad(fi, arg);
+  stan::test::expect_ad(fd, data, arg);
+}
+
+TEST(MathMix_reduce_sum_static, std_vector_eigen_vector_arg) {
+  std::vector<double> data(2, 10.0);
+  std::vector<Eigen::VectorXd> arg(2, Eigen::VectorXd::Ones(2));
+
+  stan::test::expect_ad(fi, arg);
+  stan::test::expect_ad(fd, data, arg);
+}
+
+TEST(MathMix_reduce_sum_static, std_vector_eigen_row_vector_arg) {
+  std::vector<double> data(2, 10.0);
+  std::vector<Eigen::RowVectorXd> arg(2, Eigen::RowVectorXd::Ones(2));
+
+  stan::test::expect_ad(fi, arg);
+  stan::test::expect_ad(fd, data, arg);
+}
+
+TEST(MathMix_reduce_sum_static, std_vector_eigen_matrix_arg) {
+  std::vector<double> data(2, 10.0);
+  std::vector<Eigen::MatrixXd> arg(2, Eigen::MatrixXd::Ones(2, 2));
+
+  stan::test::expect_ad(fi, arg);
+  stan::test::expect_ad(fd, data, arg);
+}
+
+TEST(MathMix_reduce_sum_static, std_vector_std_vector_std_vector_double_arg) {
+  std::vector<double> data(2, 10.0);
+  std::vector<std::vector<std::vector<double>>> arg(
+      2, std::vector<std::vector<double>>(2, std::vector<double>(2, 10.0)));
+
+  stan::test::expect_ad(fi, arg);
+  stan::test::expect_ad(fd, data, arg);
+}
+
+TEST(MathMix_reduce_sum_static, std_vector_std_vector_eigen_vector_arg) {
+  std::vector<double> data(2, 10.0);
+  std::vector<std::vector<Eigen::VectorXd>> arg(
+      2, std::vector<Eigen::VectorXd>(2, Eigen::VectorXd::Ones(2)));
+
+  stan::test::expect_ad(fi, arg);
+  stan::test::expect_ad(fd, data, arg);
+}
+
+TEST(MathMix_reduce_sum_static, std_vector_std_vector_eigen_row_vector_arg) {
+  std::vector<double> data(2, 10.0);
+  std::vector<std::vector<Eigen::RowVectorXd>> arg(
+      2, std::vector<Eigen::RowVectorXd>(2, Eigen::RowVectorXd::Ones(2)));
+
+  stan::test::expect_ad(fi, arg);
+  stan::test::expect_ad(fd, data, arg);
+}
+
+TEST(MathMix_reduce_sum_static, std_vector_std_vector_eigen_matrix_arg) {
+  std::vector<double> data(2, 10.0);
+  std::vector<std::vector<Eigen::MatrixXd>> arg(
+      2, std::vector<Eigen::MatrixXd>(2, Eigen::MatrixXd::Ones(2, 2)));
+
+  stan::test::expect_ad(fi, arg);
+  stan::test::expect_ad(fd, data, arg);
+}
+
+TEST(MathMix_reduce_sum_static, eigen_three_args1) {
+  Eigen::VectorXd arg1 = Eigen::VectorXd::Ones(2);
+  Eigen::RowVectorXd arg2 = Eigen::RowVectorXd::Ones(2);
+  Eigen::MatrixXd arg3 = Eigen::MatrixXd::Ones(2, 2);
+
+  stan::test::expect_ad(fi, arg1, arg2, arg3);
+}
+
+TEST(MathMix_reduce_sum_static, eigen_three_args2) {
+  double arg1 = 1.0;
+  std::vector<double> arg2(2, 1.0);
+  Eigen::MatrixXd arg3 = Eigen::MatrixXd::Ones(2, 2);
+
+  stan::test::expect_ad(fi, arg1, arg2, arg3);
+}
+
+TEST(MathMix_reduce_sum_static, eigen_three_args3) {
+  double arg1 = 1.0;
+  std::vector<std::vector<double>> arg2(2, std::vector<double>(2, 1.0));
+  std::vector<Eigen::MatrixXd> arg3(2, Eigen::MatrixXd::Ones(2, 2));
+
+  stan::test::expect_ad(fi, arg1, arg2, arg3);
+}
+
+TEST(MathMix_reduce_sum_static, eigen_three_args_with_ints1) {
+  Eigen::VectorXd arg1 = Eigen::VectorXd::Ones(2);
+  Eigen::RowVectorXd arg2 = Eigen::RowVectorXd::Ones(2);
+  Eigen::MatrixXd arg3 = Eigen::MatrixXd::Ones(2, 2);
+
+  stan::test::expect_ad(
+      [&](auto&& arg1, auto&& arg2, auto&& arg3) {
+        return fi(1, arg1, std::vector<int>{1, 2, 3}, arg2, 3, arg3);
+      },
+      arg1, arg2, arg3);
+}
+
+TEST(MathMix_reduce_sum_static, eigen_three_args_with_ints2) {
+  double arg1 = 1.0;
+  std::vector<double> arg2(2, 1.0);
+  Eigen::MatrixXd arg3 = Eigen::MatrixXd::Ones(2, 2);
+
+  stan::test::expect_ad(
+      [&](auto&& arg1, auto&& arg2, auto&& arg3) {
+        return fi(1, arg1, std::vector<int>{1, 2, 3}, arg2, 3, arg3);
+      },
+      arg1, arg2, arg3);
+}
+
+TEST(MathMix_reduce_sum_static, eigen_three_args_with_ints3) {
+  double arg1 = 1.0;
+  std::vector<std::vector<double>> arg2(2, std::vector<double>(2, 1.0));
+  std::vector<Eigen::MatrixXd> arg3(2, Eigen::MatrixXd::Ones(2, 2));
+
+  stan::test::expect_ad(
+      [&](auto&& arg1, auto&& arg2, auto&& arg3) {
+        return fi(1, arg1, std::vector<int>{1, 2, 3}, arg2, 3, arg3);
+      },
+      arg1, arg2, arg3);
+}
+
+TEST(MathMix_reduce_sum_static, eigen_three_args_with_doubles1) {
+  Eigen::VectorXd arg1 = Eigen::VectorXd::Ones(2);
+  Eigen::RowVectorXd arg2 = Eigen::RowVectorXd::Ones(2);
+  Eigen::MatrixXd arg3 = Eigen::MatrixXd::Ones(2, 2);
+
+  stan::test::expect_ad(
+      [&](auto&& arg1, auto&& arg2, auto&& arg3) {
+        return fi(std::vector<double>{1.0, 2.0, 3.0}, arg1, 3.0, arg2,
+                  std::vector<double>{1.0, 2.0, 3.0}, arg3);
+      },
+      arg1, arg2, arg3);
+}
+
+TEST(MathMix_reduce_sum_static, eigen_three_args_with_doubles2) {
+  double arg1 = 1.0;
+  std::vector<double> arg2(2, 1.0);
+  Eigen::MatrixXd arg3 = Eigen::MatrixXd::Ones(2, 2);
+
+  stan::test::expect_ad(
+      [&](auto&& arg1, auto&& arg2, auto&& arg3) {
+        return fi(std::vector<double>{1.0, 2.0, 3.0}, arg1, 3.0, arg2,
+                  std::vector<double>{1.0, 2.0, 3.0}, arg3);
+      },
+      arg1, arg2, arg3);
+}
+
+TEST(MathMix_reduce_sum_static, eigen_three_args_with_doubles3) {
+  double arg1 = 1.0;
+  std::vector<std::vector<double>> arg2(2, std::vector<double>(2, 1.0));
+  std::vector<Eigen::MatrixXd> arg3(2, Eigen::MatrixXd::Ones(2, 2));
+
+  stan::test::expect_ad(
+      [&](auto&& arg1, auto&& arg2, auto&& arg3) {
+        return fi(std::vector<double>{1.0, 2.0, 3.0}, arg1, 3.0, arg2,
+                  std::vector<double>{1.0, 2.0, 3.0}, arg3);
+      },
+      arg1, arg2, arg3);
+}
+
+template <int grainsize>
+struct static_check_lpdf {
+  template <typename T>
+  inline auto operator()(std::size_t start, std::size_t end,
+                         const std::vector<int>&, std::ostream* msgs,
+                         const std::vector<T>& data) const {
+    T sum = 0;
+    // std::cout << "start: " << start << ", end: " << end << ", grainsize: " <<
+    // grainsize << std::endl;
+    EXPECT_LE(end - start + 1, grainsize);
+    for (size_t i = start; i <= end; i++) {
+      sum += data[i];
+    }
+    return sum;
+  }
+};
+
+TEST(StanMathPrim_reduce_sum_static, static_check) {
+  stan::math::init_threadpool_tbb();
+
+  for (auto size : {1, 3, 6, 11}) {
+    std::vector<int> data(size, 10);
+    std::vector<double> arg(size, 10);
+
+    auto fi1 = [&](auto&&... args) {
+      return stan::math::reduce_sum_static<static_check_lpdf<1>>(data, 1, msgs,
+                                                                 args...);
+    };
+
+    auto fi2 = [&](auto&&... args) {
+      return stan::math::reduce_sum_static<static_check_lpdf<2>>(data, 2, msgs,
+                                                                 args...);
+    };
+
+    auto fi3 = [&](auto&&... args) {
+      return stan::math::reduce_sum_static<static_check_lpdf<3>>(data, 3, msgs,
+                                                                 args...);
+    };
+
+    stan::test::expect_ad(fi1, arg);
+    stan::test::expect_ad(fi2, arg);
+    stan::test::expect_ad(fi3, arg);
+  }
+}
diff --git a/test/unit/math/mix/functor/reduce_sum_test.cpp b/test/unit/math/mix/functor/reduce_sum_test.cpp
new file mode 100644
index 00000000000..ed520821d33
--- /dev/null
+++ b/test/unit/math/mix/functor/reduce_sum_test.cpp
@@ -0,0 +1,409 @@
+#include <stan/math/prim/meta.hpp>
+#include <test/unit/math/test_ad.hpp>
+
+#include <limits>
+#include <vector>
+
+std::ostream* msgs = nullptr;
+
+template <typename T, stan::require_stan_scalar_t<T>* = nullptr>
+T sum_(T arg) {
+  return arg;
+}
+
+template <typename EigMat, stan::require_eigen_t<EigMat>* = nullptr>
+auto sum_(EigMat&& arg) {
+  return stan::math::sum(arg);
+}
+
+template <typename Vec, stan::require_std_vector_t<Vec>* = nullptr>
+auto sum_(Vec&& arg) {
+  stan::scalar_type_t<Vec> sum = 0;
+  for (size_t i = 0; i < arg.size(); ++i) {
+    sum += sum_(arg[i]);
+  }
+  return sum;
+}
+
+struct sum_lpdf {
+  template <typename T, typename... Args>
+  inline auto operator()(std::size_t start, std::size_t end, T&& sub_slice,
+                         std::ostream* msgs, Args&&... args) const {
+    using return_type = stan::return_type_t<T, Args...>;
+
+    return sum_(sub_slice)
+           + sub_slice.size()
+                 * stan::math::sum(std::vector<return_type>{
+                       return_type(sum_(std::forward<Args>(args)))...});
+  }
+};
+
+TEST(MathMix_reduce_sum, grainsize) {
+  auto f1 = [](auto&& data) {
+    return stan::math::reduce_sum<sum_lpdf>(data, 0, msgs);
+  };
+  auto f2 = [](auto&& data) {
+    return stan::math::reduce_sum<sum_lpdf>(data, -1, msgs);
+  };
+  auto f3 = [](auto&& data) {
+    return stan::math::reduce_sum<sum_lpdf>(data, 1, msgs);
+  };
+  auto f4 = [](auto&& data) {
+    return stan::math::reduce_sum<sum_lpdf>(data, 20, msgs);
+  };
+
+  std::vector<double> data(5, 10.0);
+
+  stan::test::expect_ad(f1, data);
+  stan::test::expect_ad(f2, data);
+  stan::test::expect_ad(f3, data);
+  stan::test::expect_ad(f4, data);
+}
+
+TEST(MathMix_reduce_sum, std_vector_double_slice) {
+  auto f = [](auto&& data) {
+    return stan::math::reduce_sum<sum_lpdf>(data, 1, msgs);
+  };
+
+  std::vector<double> data(5, 10.0);
+
+  stan::test::expect_ad(f, data);
+}
+
+TEST(MathMix_reduce_sum, std_vector_std_vector_double_slice) {
+  auto f = [](auto&& data) {
+    return stan::math::reduce_sum<sum_lpdf>(data, 1, msgs);
+  };
+
+  std::vector<std::vector<double>> data(3, std::vector<double>(2, 10.0));
+
+  stan::test::expect_ad(f, data);
+}
+
+TEST(MathMix_reduce_sum, std_vector_eigen_vector_double_slice) {
+  auto f = [](auto&& data) {
+    return stan::math::reduce_sum<sum_lpdf>(data, 1, msgs);
+  };
+
+  std::vector<Eigen::VectorXd> data(3, Eigen::VectorXd::Ones(2));
+
+  stan::test::expect_ad(f, data);
+}
+
+TEST(MathMix_reduce_sum, std_vector_eigen_row_vector_double_slice) {
+  auto f = [](auto&& data) {
+    return stan::math::reduce_sum<sum_lpdf>(data, 1, msgs);
+  };
+
+  std::vector<Eigen::RowVectorXd> data(3, Eigen::RowVectorXd::Ones(2));
+
+  stan::test::expect_ad(f, data);
+}
+
+TEST(MathMix_reduce_sum, std_vector_eigen_matrix_double_slice) {
+  auto f = [](auto&& data) {
+    return stan::math::reduce_sum<sum_lpdf>(data, 1, msgs);
+  };
+
+  std::vector<Eigen::MatrixXd> data(3, Eigen::MatrixXd::Ones(2, 4));
+
+  stan::test::expect_ad(f, data);
+}
+
+TEST(MathMix_reduce_sum, std_vector_std_vector_std_vector_double_slice) {
+  auto f = [](auto&& data) {
+    return stan::math::reduce_sum<sum_lpdf>(data, 1, msgs);
+  };
+
+  std::vector<std::vector<std::vector<double>>> data(
+      3, std::vector<std::vector<double>>(2, std::vector<double>(2, 10.0)));
+
+  stan::test::expect_ad(f, data);
+}
+
+TEST(MathMix_reduce_sum, std_vector_std_vector_eigen_vector_double_slice) {
+  auto f = [](auto&& data) {
+    return stan::math::reduce_sum<sum_lpdf>(data, 1, msgs);
+  };
+
+  std::vector<std::vector<Eigen::VectorXd>> data(
+      3, std::vector<Eigen::VectorXd>(2, Eigen::VectorXd::Ones(2)));
+
+  stan::test::expect_ad(f, data);
+}
+
+TEST(MathMix_reduce_sum, std_vector_std_vector_eigen_row_vector_double_slice) {
+  auto f = [](auto&& data) {
+    return stan::math::reduce_sum<sum_lpdf>(data, 1, msgs);
+  };
+
+  std::vector<std::vector<Eigen::RowVectorXd>> data(
+      3, std::vector<Eigen::RowVectorXd>(2, Eigen::RowVectorXd::Ones(2)));
+
+  stan::test::expect_ad(f, data);
+}
+
+TEST(MathMix_reduce_sum, std_vector_std_vector_eigen_matrix_double_slice) {
+  auto f = [](auto&& data) {
+    return stan::math::reduce_sum<sum_lpdf>(data, 1, msgs);
+  };
+
+  std::vector<std::vector<Eigen::MatrixXd>> data(
+      3, std::vector<Eigen::MatrixXd>(2, Eigen::MatrixXd::Ones(2, 4)));
+
+  stan::test::expect_ad(f, data);
+}
+
+struct start_end_lpdf {
+  template <typename T1, typename T2>
+  inline auto operator()(std::size_t start, std::size_t end, T1&&,
+                         std::ostream* msgs, T2&& data) const {
+    stan::return_type_t<T1, T2> sum = 0;
+    EXPECT_GE(start, 0);
+    EXPECT_LE(end, data.size() - 1);
+    for (size_t i = start; i <= end; i++) {
+      sum += data[i];
+    }
+    return sum;
+  }
+};
+
+TEST(StanMath_reduce_sum, start_end_slice) {
+  auto start_end = [](auto&& arg) {
+    return stan::math::reduce_sum<start_end_lpdf>(arg, 1, msgs, arg);
+  };
+
+  std::vector<double> data(5, 1.0);
+
+  stan::test::expect_ad(start_end, data);
+}
+
+auto fi = [](auto&&... args) {
+  return stan::math::reduce_sum<sum_lpdf>(std::vector<int>(2, 10.0), 1, msgs,
+                                          args...);
+};
+
+auto fd = [](auto&& data, auto&&... args) {
+  return stan::math::reduce_sum<sum_lpdf>(data, 1, msgs, args...);
+};
+
+TEST(MathMix_reduce_sum, int_arg) {
+  std::vector<double> data(2, 1.0);
+  int arg = 5.0;
+
+  stan::test::expect_ad([&](auto&& data) { return fd(data, arg); }, data);
+}
+
+TEST(MathMix_reduce_sum, double_arg) {
+  std::vector<double> data(2, 10.0);
+  double arg = 5.0;
+
+  stan::test::expect_ad(fi, arg);
+  stan::test::expect_ad(fd, data, arg);
+}
+
+TEST(MathMix_reduce_sum, std_vector_int_arg) {
+  std::vector<double> data(2, 10.0);
+  std::vector<int> arg(2, 10);
+
+  stan::test::expect_ad([&](auto&& data) { return fd(data, arg); }, data);
+}
+
+TEST(MathMix_reduce_sum, std_vector_double_arg) {
+  std::vector<double> data(2, 10.0);
+  std::vector<double> arg(2, 10.0);
+
+  stan::test::expect_ad(fi, arg);
+  stan::test::expect_ad(fd, data, arg);
+}
+
+TEST(MathMix_reduce_sum, eigen_vector_arg) {
+  std::vector<double> data(2, 10.0);
+  Eigen::VectorXd arg = Eigen::VectorXd::Ones(2);
+
+  stan::test::expect_ad(fi, arg);
+  stan::test::expect_ad(fd, data, arg);
+}
+
+TEST(MathMix_reduce_sum, eigen_row_vector_arg) {
+  std::vector<double> data(2, 10.0);
+  Eigen::RowVectorXd arg = Eigen::RowVectorXd::Ones(2);
+
+  stan::test::expect_ad(fi, arg);
+  stan::test::expect_ad(fd, data, arg);
+}
+
+TEST(MathMix_reduce_sum, eigen_matrix_arg) {
+  std::vector<double> data(2, 10.0);
+  Eigen::MatrixXd arg = Eigen::MatrixXd::Ones(2, 2);
+
+  stan::test::expect_ad(fi, arg);
+  stan::test::expect_ad(fd, data, arg);
+}
+
+TEST(MathMix_reduce_sum, std_vector_std_vector_double_arg) {
+  std::vector<double> data(2, 10.0);
+  std::vector<std::vector<double>> arg(2, std::vector<double>(2, 10.0));
+
+  stan::test::expect_ad(fi, arg);
+  stan::test::expect_ad(fd, data, arg);
+}
+
+TEST(MathMix_reduce_sum, std_vector_eigen_vector_arg) {
+  std::vector<double> data(2, 10.0);
+  std::vector<Eigen::VectorXd> arg(2, Eigen::VectorXd::Ones(2));
+
+  stan::test::expect_ad(fi, arg);
+  stan::test::expect_ad(fd, data, arg);
+}
+
+TEST(MathMix_reduce_sum, std_vector_eigen_row_vector_arg) {
+  std::vector<double> data(2, 10.0);
+  std::vector<Eigen::RowVectorXd> arg(2, Eigen::RowVectorXd::Ones(2));
+
+  stan::test::expect_ad(fi, arg);
+  stan::test::expect_ad(fd, data, arg);
+}
+
+TEST(MathMix_reduce_sum, std_vector_eigen_matrix_arg) {
+  std::vector<double> data(2, 10.0);
+  std::vector<Eigen::MatrixXd> arg(2, Eigen::MatrixXd::Ones(2, 2));
+
+  stan::test::expect_ad(fi, arg);
+  stan::test::expect_ad(fd, data, arg);
+}
+
+TEST(MathMix_reduce_sum, std_vector_std_vector_std_vector_double_arg) {
+  std::vector<double> data(2, 10.0);
+  std::vector<std::vector<std::vector<double>>> arg(
+      2, std::vector<std::vector<double>>(2, std::vector<double>(2, 10.0)));
+
+  stan::test::expect_ad(fi, arg);
+  stan::test::expect_ad(fd, data, arg);
+}
+
+TEST(MathMix_reduce_sum, std_vector_std_vector_eigen_vector_arg) {
+  std::vector<double> data(2, 10.0);
+  std::vector<std::vector<Eigen::VectorXd>> arg(
+      2, std::vector<Eigen::VectorXd>(2, Eigen::VectorXd::Ones(2)));
+
+  stan::test::expect_ad(fi, arg);
+  stan::test::expect_ad(fd, data, arg);
+}
+
+TEST(MathMix_reduce_sum, std_vector_std_vector_eigen_row_vector_arg) {
+  std::vector<double> data(2, 10.0);
+  std::vector<std::vector<Eigen::RowVectorXd>> arg(
+      2, std::vector<Eigen::RowVectorXd>(2, Eigen::RowVectorXd::Ones(2)));
+
+  stan::test::expect_ad(fi, arg);
+  stan::test::expect_ad(fd, data, arg);
+}
+
+TEST(MathMix_reduce_sum, std_vector_std_vector_eigen_matrix_arg) {
+  std::vector<double> data(2, 10.0);
+  std::vector<std::vector<Eigen::MatrixXd>> arg(
+      2, std::vector<Eigen::MatrixXd>(2, Eigen::MatrixXd::Ones(2, 2)));
+
+  stan::test::expect_ad(fi, arg);
+  stan::test::expect_ad(fd, data, arg);
+}
+
+TEST(MathMix_reduce_sum, eigen_three_args1) {
+  Eigen::VectorXd arg1 = Eigen::VectorXd::Ones(2);
+  Eigen::RowVectorXd arg2 = Eigen::RowVectorXd::Ones(2);
+  Eigen::MatrixXd arg3 = Eigen::MatrixXd::Ones(2, 2);
+
+  stan::test::expect_ad(fi, arg1, arg2, arg3);
+}
+
+TEST(MathMix_reduce_sum, eigen_three_args2) {
+  double arg1 = 1.0;
+  std::vector<double> arg2(2, 1.0);
+  Eigen::MatrixXd arg3 = Eigen::MatrixXd::Ones(2, 2);
+
+  stan::test::expect_ad(fi, arg1, arg2, arg3);
+}
+
+TEST(MathMix_reduce_sum, eigen_three_args3) {
+  double arg1 = 1.0;
+  std::vector<std::vector<double>> arg2(2, std::vector<double>(2, 1.0));
+  std::vector<Eigen::MatrixXd> arg3(2, Eigen::MatrixXd::Ones(2, 2));
+
+  stan::test::expect_ad(fi, arg1, arg2, arg3);
+}
+
+TEST(MathMix_reduce_sum, eigen_three_args_with_ints1) {
+  Eigen::VectorXd arg1 = Eigen::VectorXd::Ones(2);
+  Eigen::RowVectorXd arg2 = Eigen::RowVectorXd::Ones(2);
+  Eigen::MatrixXd arg3 = Eigen::MatrixXd::Ones(2, 2);
+
+  stan::test::expect_ad(
+      [&](auto&& arg1, auto&& arg2, auto&& arg3) {
+        return fi(1, arg1, std::vector<int>{1, 2, 3}, arg2, 3, arg3);
+      },
+      arg1, arg2, arg3);
+}
+
+TEST(MathMix_reduce_sum, eigen_three_args_with_ints2) {
+  double arg1 = 1.0;
+  std::vector<double> arg2(2, 1.0);
+  Eigen::MatrixXd arg3 = Eigen::MatrixXd::Ones(2, 2);
+
+  stan::test::expect_ad(
+      [&](auto&& arg1, auto&& arg2, auto&& arg3) {
+        return fi(1, arg1, std::vector<int>{1, 2, 3}, arg2, 3, arg3);
+      },
+      arg1, arg2, arg3);
+}
+
+TEST(MathMix_reduce_sum, eigen_three_args_with_ints3) {
+  double arg1 = 1.0;
+  std::vector<std::vector<double>> arg2(2, std::vector<double>(2, 1.0));
+  std::vector<Eigen::MatrixXd> arg3(2, Eigen::MatrixXd::Ones(2, 2));
+
+  stan::test::expect_ad(
+      [&](auto&& arg1, auto&& arg2, auto&& arg3) {
+        return fi(1, arg1, std::vector<int>{1, 2, 3}, arg2, 3, arg3);
+      },
+      arg1, arg2, arg3);
+}
+
+TEST(MathMix_reduce_sum, eigen_three_args_with_doubles1) {
+  Eigen::VectorXd arg1 = Eigen::VectorXd::Ones(2);
+  Eigen::RowVectorXd arg2 = Eigen::RowVectorXd::Ones(2);
+  Eigen::MatrixXd arg3 = Eigen::MatrixXd::Ones(2, 2);
+
+  stan::test::expect_ad(
+      [&](auto&& arg1, auto&& arg2, auto&& arg3) {
+        return fi(std::vector<double>{1.0, 2.0, 3.0}, arg1, 3.0, arg2,
+                  std::vector<double>{1.0, 2.0, 3.0}, arg3);
+      },
+      arg1, arg2, arg3);
+}
+
+TEST(MathMix_reduce_sum, eigen_three_args_with_doubles2) {
+  double arg1 = 1.0;
+  std::vector<double> arg2(2, 1.0);
+  Eigen::MatrixXd arg3 = Eigen::MatrixXd::Ones(2, 2);
+
+  stan::test::expect_ad(
+      [&](auto&& arg1, auto&& arg2, auto&& arg3) {
+        return fi(std::vector<double>{1.0, 2.0, 3.0}, arg1, 3.0, arg2,
+                  std::vector<double>{1.0, 2.0, 3.0}, arg3);
+      },
+      arg1, arg2, arg3);
+}
+
+TEST(MathMix_reduce_sum, eigen_three_args_with_doubles3) {
+  double arg1 = 1.0;
+  std::vector<std::vector<double>> arg2(2, std::vector<double>(2, 1.0));
+  std::vector<Eigen::MatrixXd> arg3(2, Eigen::MatrixXd::Ones(2, 2));
+
+  stan::test::expect_ad(
+      [&](auto&& arg1, auto&& arg2, auto&& arg3) {
+        return fi(std::vector<double>{1.0, 2.0, 3.0}, arg1, 3.0, arg2,
+                  std::vector<double>{1.0, 2.0, 3.0}, arg3);
+      },
+      arg1, arg2, arg3);
+}
diff --git a/test/unit/math/prim/functor/reduce_sum_static_test.cpp b/test/unit/math/prim/functor/reduce_sum_static_test.cpp
new file mode 100644
index 00000000000..8c6ee821d54
--- /dev/null
+++ b/test/unit/math/prim/functor/reduce_sum_static_test.cpp
@@ -0,0 +1,476 @@
+#include <stan/math/prim/core/init_threadpool_tbb.hpp>
+#include <stan/math.hpp>
+#include <gtest/gtest.h>
+#include <algorithm>
+#include <sstream>
+#include <tuple>
+#include <vector>
+
+std::ostream* msgs = nullptr;
+
+// reduce functor which is the BinaryFunction
+// here we use iterators which represent integer indices
+template <typename T>
+struct count_lpdf {
+  count_lpdf() {}
+
+  // does the reduction in the sub-slice start to end
+  inline T operator()(std::size_t start, std::size_t end,
+                      const std::vector<int>& sub_slice, std::ostream* msgs,
+                      const std::vector<T>& lambda,
+                      const std::vector<int>& idata) const {
+    return stan::math::poisson_lpmf(sub_slice, lambda[0]);
+  }
+};
+
+TEST(StanMathPrim_reduce_sum_static, value) {
+  stan::math::init_threadpool_tbb();
+
+  double lambda_d = 10.0;
+  const std::size_t elems = 10000;
+  const std::size_t num_iter = 1000;
+  std::vector<int> data(elems);
+
+  for (std::size_t i = 0; i != elems; ++i)
+    data[i] = i;
+
+  std::vector<int> idata;
+  std::vector<double> vlambda_d(1, lambda_d);
+
+  double poisson_lpdf = stan::math::reduce_sum_static<count_lpdf<double>>(
+      data, 5, msgs, vlambda_d, idata);
+
+  double poisson_lpdf_ref = stan::math::poisson_lpmf(data, lambda_d);
+
+  EXPECT_FLOAT_EQ(poisson_lpdf, poisson_lpdf_ref);
+}
+
+// ********************************
+// test if nested parallelism works
+// ********************************
+
+template <typename T>
+struct nesting_count_lpdf {
+  nesting_count_lpdf() {}
+
+  // does the reduction in the sub-slice start to end
+  inline T operator()(std::size_t start, std::size_t end,
+                      const std::vector<int>& sub_slice, std::ostream* msgs,
+                      const std::vector<T>& lambda,
+                      const std::vector<int>& idata) const {
+    return stan::math::reduce_sum_static<count_lpdf<T>>(sub_slice, 5, msgs,
+                                                        lambda, idata);
+  }
+};
+
+TEST(StanMathPrim_reduce_sum_static, nesting_value) {
+  stan::math::init_threadpool_tbb();
+
+  double lambda_d = 10.0;
+  const std::size_t elems = 10000;
+  const std::size_t num_iter = 1000;
+  std::vector<int> data(elems);
+
+  for (std::size_t i = 0; i != elems; ++i)
+    data[i] = i;
+
+  std::vector<int> idata;
+  std::vector<double> vlambda_d(1, lambda_d);
+
+  double poisson_lpdf = stan::math::reduce_sum_static<count_lpdf<double>>(
+      data, 5, msgs, vlambda_d, idata);
+
+  double poisson_lpdf_ref = stan::math::poisson_lpmf(data, lambda_d);
+
+  EXPECT_FLOAT_EQ(poisson_lpdf, poisson_lpdf_ref);
+}
+
+template <typename T, stan::require_stan_scalar_t<T>* = nullptr>
+T sum_(T arg) {
+  return arg;
+}
+
+template <typename EigMat, stan::require_eigen_t<EigMat>* = nullptr>
+auto sum_(EigMat&& arg) {
+  return stan::math::sum(arg);
+}
+
+template <typename Vec, stan::require_std_vector_t<Vec>* = nullptr>
+auto sum_(Vec&& arg) {
+  stan::scalar_type_t<Vec> sum = 0;
+  for (size_t i = 0; i < arg.size(); ++i) {
+    sum += sum_(arg[i]);
+  }
+  return sum;
+}
+
+struct sum_lpdf {
+  template <typename T, typename... Args>
+  inline auto operator()(std::size_t start, std::size_t end, T&& sub_slice,
+                         std::ostream* msgs, Args&&... args) const {
+    using return_type = stan::return_type_t<T, Args...>;
+
+    return sum_(sub_slice)
+           + sub_slice.size()
+                 * stan::math::sum(std::vector<return_type>{
+                       return_type(sum_(std::forward<Args>(args)))...});
+  }
+};
+
+TEST(StanMathPrim_reduce_sum_static, grainsize) {
+  stan::math::init_threadpool_tbb();
+
+  std::vector<int> data(5, 10);
+
+  EXPECT_THROW(stan::math::reduce_sum_static<sum_lpdf>(data, 0, msgs),
+               std::domain_error);
+
+  EXPECT_THROW(stan::math::reduce_sum_static<sum_lpdf>(data, -1, msgs),
+               std::domain_error);
+
+  EXPECT_NO_THROW(stan::math::reduce_sum_static<sum_lpdf>(data, 1, msgs));
+
+  EXPECT_NO_THROW(
+      stan::math::reduce_sum_static<sum_lpdf>(data, 3 * data.size(), msgs));
+}
+
+TEST(StanMathPrim_reduce_sum_static, std_vector_int_slice) {
+  stan::math::init_threadpool_tbb();
+
+  std::vector<int> data(5, 10);
+
+  EXPECT_EQ(50, stan::math::reduce_sum_static<sum_lpdf>(data, 1, msgs));
+}
+
+TEST(StanMathPrim_reduce_sum_static, std_vector_double_slice) {
+  stan::math::init_threadpool_tbb();
+
+  std::vector<double> data(5, 10.0);
+
+  EXPECT_DOUBLE_EQ(50.0,
+                   stan::math::reduce_sum_static<sum_lpdf>(data, 1, msgs));
+}
+
+TEST(StanMathPrim_reduce_sum_static, std_vector_std_vector_double_slice) {
+  stan::math::init_threadpool_tbb();
+
+  std::vector<std::vector<double>> data(5, std::vector<double>(2, 10.0));
+
+  EXPECT_DOUBLE_EQ(100.0,
+                   stan::math::reduce_sum_static<sum_lpdf>(data, 1, msgs));
+}
+
+TEST(StanMathPrim_reduce_sum_static, std_vector_std_vector_int_slice) {
+  stan::math::init_threadpool_tbb();
+
+  std::vector<std::vector<int>> data(5, std::vector<int>(2, 10));
+
+  EXPECT_EQ(100, stan::math::reduce_sum_static<sum_lpdf>(data, 1, msgs));
+}
+
+TEST(StanMathPrim_reduce_sum_static, std_vector_eigen_vector_slice) {
+  stan::math::init_threadpool_tbb();
+
+  std::vector<Eigen::VectorXd> data(5, Eigen::VectorXd::Ones(2));
+
+  EXPECT_DOUBLE_EQ(10.0,
+                   stan::math::reduce_sum_static<sum_lpdf>(data, 1, msgs));
+}
+
+TEST(StanMathPrim_reduce_sum_static, std_vector_eigen_row_vector_slice) {
+  stan::math::init_threadpool_tbb();
+
+  std::vector<Eigen::RowVectorXd> data(5, Eigen::RowVectorXd::Ones(2));
+
+  EXPECT_DOUBLE_EQ(10.0,
+                   stan::math::reduce_sum_static<sum_lpdf>(data, 1, msgs));
+}
+
+TEST(StanMathPrim_reduce_sum_static, std_vector_eigen_matrix_slice) {
+  stan::math::init_threadpool_tbb();
+
+  std::vector<Eigen::MatrixXd> data(5, Eigen::MatrixXd::Ones(2, 2));
+
+  EXPECT_DOUBLE_EQ(20.0,
+                   stan::math::reduce_sum_static<sum_lpdf>(data, 1, msgs));
+}
+
+TEST(StanMathPrim_reduce_sum_static,
+     std_vector_std_vector_std_vector_int_slice) {
+  stan::math::init_threadpool_tbb();
+
+  std::vector<std::vector<std::vector<int>>> data(
+      5, std::vector<std::vector<int>>(2, std::vector<int>(2, 10.0)));
+
+  EXPECT_DOUBLE_EQ(200, stan::math::reduce_sum_static<sum_lpdf>(data, 1, msgs));
+}
+
+TEST(StanMathPrim_reduce_sum_static,
+     std_vector_std_vector_std_vector_double_slice) {
+  stan::math::init_threadpool_tbb();
+
+  std::vector<std::vector<std::vector<double>>> data(
+      5, std::vector<std::vector<double>>(2, std::vector<double>(2, 10.0)));
+
+  EXPECT_DOUBLE_EQ(200.0,
+                   stan::math::reduce_sum_static<sum_lpdf>(data, 1, msgs));
+}
+
+TEST(StanMathPrim_reduce_sum_static, std_vector_std_vector_eigen_vector_slice) {
+  stan::math::init_threadpool_tbb();
+
+  std::vector<std::vector<Eigen::VectorXd>> data(
+      5, std::vector<Eigen::VectorXd>(2, Eigen::VectorXd::Ones(2)));
+
+  EXPECT_DOUBLE_EQ(20.0,
+                   stan::math::reduce_sum_static<sum_lpdf>(data, 1, msgs));
+}
+
+TEST(StanMathPrim_reduce_sum_static,
+     std_vector_std_vector_eigen_row_vector_slice) {
+  stan::math::init_threadpool_tbb();
+
+  std::vector<std::vector<Eigen::RowVectorXd>> data(
+      5, std::vector<Eigen::RowVectorXd>(2, Eigen::RowVectorXd::Ones(2)));
+
+  EXPECT_DOUBLE_EQ(20.0,
+                   stan::math::reduce_sum_static<sum_lpdf>(data, 1, msgs));
+}
+
+TEST(StanMathPrim_reduce_sum_static, std_vector_std_vector_eigen_matrix_slice) {
+  stan::math::init_threadpool_tbb();
+
+  std::vector<std::vector<Eigen::MatrixXd>> data(
+      5, std::vector<Eigen::MatrixXd>(2, Eigen::MatrixXd::Ones(2, 2)));
+
+  EXPECT_DOUBLE_EQ(40.0,
+                   stan::math::reduce_sum_static<sum_lpdf>(data, 1, msgs));
+}
+
+struct start_end_lpdf {
+  inline auto operator()(std::size_t start, std::size_t end,
+                         const std::vector<int>&, std::ostream* msgs,
+                         const std::vector<int>& data) const {
+    int sum = 0;
+    EXPECT_GE(start, 0);
+    EXPECT_LE(end, data.size() - 1);
+    for (size_t i = start; i <= end; i++) {
+      sum += data[i];
+    }
+    return sum;
+  }
+};
+
+TEST(StanMathPrim_reduce_sum_static, start_end_slice) {
+  stan::math::init_threadpool_tbb();
+
+  std::vector<int> data(5, 10);
+
+  EXPECT_EQ(50,
+            stan::math::reduce_sum_static<start_end_lpdf>(data, 1, msgs, data));
+}
+
+TEST(StanMathPrim_reduce_sum_static, int_arg) {
+  stan::math::init_threadpool_tbb();
+
+  std::vector<double> data(5, 10.0);
+  int arg = 5;
+
+  EXPECT_DOUBLE_EQ(5 * (10 + 5),
+                   stan::math::reduce_sum_static<sum_lpdf>(data, 1, msgs, arg));
+}
+
+TEST(StanMathPrim_reduce_sum_static, double_arg) {
+  stan::math::init_threadpool_tbb();
+
+  std::vector<double> data(5, 10.0);
+  double arg = 5.0;
+
+  EXPECT_DOUBLE_EQ(5 * (10.0 + 5.0),
+                   stan::math::reduce_sum_static<sum_lpdf>(data, 1, msgs, arg));
+}
+
+TEST(StanMathPrim_reduce_sum_static, std_vector_int_arg) {
+  stan::math::init_threadpool_tbb();
+
+  std::vector<double> data(5, 10.0);
+  std::vector<int> arg(5, 10);
+
+  EXPECT_DOUBLE_EQ(5 * (10 + 5 * 10),
+                   stan::math::reduce_sum_static<sum_lpdf>(data, 1, msgs, arg));
+}
+
+TEST(StanMathPrim_reduce_sum_static, std_vector_double_arg) {
+  stan::math::init_threadpool_tbb();
+
+  std::vector<double> data(5, 10.0);
+  std::vector<double> arg(5, 10.0);
+
+  EXPECT_DOUBLE_EQ(5 * (10 + 5 * 10),
+                   stan::math::reduce_sum_static<sum_lpdf>(data, 1, msgs, arg));
+}
+
+TEST(StanMathPrim_reduce_sum_static, eigen_vector_arg) {
+  stan::math::init_threadpool_tbb();
+
+  std::vector<double> data(5, 10.0);
+  Eigen::VectorXd arg = Eigen::VectorXd::Ones(5);
+
+  EXPECT_DOUBLE_EQ(5 * (10 + 5),
+                   stan::math::reduce_sum_static<sum_lpdf>(data, 1, msgs, arg));
+}
+
+TEST(StanMathPrim_reduce_sum_static, eigen_row_vector_arg) {
+  stan::math::init_threadpool_tbb();
+
+  std::vector<double> data(5, 10.0);
+  Eigen::RowVectorXd arg = Eigen::RowVectorXd::Ones(5);
+
+  EXPECT_DOUBLE_EQ(5 * (10 + 5),
+                   stan::math::reduce_sum_static<sum_lpdf>(data, 1, msgs, arg));
+}
+
+TEST(StanMathPrim_reduce_sum_static, eigen_matrix_arg) {
+  stan::math::init_threadpool_tbb();
+
+  std::vector<double> data(5, 10.0);
+  Eigen::MatrixXd arg = Eigen::MatrixXd::Ones(5, 5);
+
+  EXPECT_DOUBLE_EQ(5 * (10 + 5 * 5),
+                   stan::math::reduce_sum_static<sum_lpdf>(data, 1, msgs, arg));
+}
+
+TEST(StanMathPrim_reduce_sum_static, std_vector_std_vector_double_arg) {
+  stan::math::init_threadpool_tbb();
+
+  std::vector<double> data(5, 10.0);
+  std::vector<std::vector<double>> arg(5, std::vector<double>(5, 10.0));
+
+  EXPECT_DOUBLE_EQ(5 * (10 + 250),
+                   stan::math::reduce_sum_static<sum_lpdf>(data, 1, msgs, arg));
+}
+
+TEST(StanMathPrim_reduce_sum_static, std_vector_eigen_vector_arg) {
+  stan::math::init_threadpool_tbb();
+
+  std::vector<double> data(5, 10.0);
+  std::vector<Eigen::VectorXd> arg(2, Eigen::VectorXd::Ones(5));
+
+  EXPECT_DOUBLE_EQ(5 * (10 + 10),
+                   stan::math::reduce_sum_static<sum_lpdf>(data, 1, msgs, arg));
+}
+
+TEST(StanMathPrim_reduce_sum_static, std_vector_eigen_row_vector_arg) {
+  stan::math::init_threadpool_tbb();
+
+  std::vector<double> data(5, 10.0);
+  std::vector<Eigen::RowVectorXd> arg(2, Eigen::RowVectorXd::Ones(5));
+
+  EXPECT_DOUBLE_EQ(5 * (10 + 10),
+                   stan::math::reduce_sum_static<sum_lpdf>(data, 1, msgs, arg));
+}
+
+TEST(StanMathPrim_reduce_sum_static, std_vector_eigen_matrix_arg) {
+  stan::math::init_threadpool_tbb();
+
+  std::vector<double> data(5, 10.0);
+  std::vector<Eigen::MatrixXd> arg(2, Eigen::MatrixXd::Ones(5, 5));
+
+  EXPECT_DOUBLE_EQ(5 * (10 + 50),
+                   stan::math::reduce_sum_static<sum_lpdf>(data, 1, msgs, arg));
+}
+
+TEST(StanMathPrim_reduce_sum_static,
+     std_vector_std_vector_std_vector_double_arg) {
+  stan::math::init_threadpool_tbb();
+
+  std::vector<double> data(5, 10.0);
+  std::vector<std::vector<std::vector<double>>> arg(
+      5, std::vector<std::vector<double>>(5, std::vector<double>(5, 10.0)));
+
+  EXPECT_DOUBLE_EQ(5 * (10 + 1250),
+                   stan::math::reduce_sum_static<sum_lpdf>(data, 1, msgs, arg));
+}
+
+TEST(StanMathPrim_reduce_sum_static, std_vector_std_vector_eigen_vector_arg) {
+  stan::math::init_threadpool_tbb();
+
+  std::vector<double> data(5, 10.0);
+  std::vector<std::vector<Eigen::VectorXd>> arg(
+      2, std::vector<Eigen::VectorXd>(2, Eigen::VectorXd::Ones(5)));
+
+  EXPECT_DOUBLE_EQ(5 * (10 + 20),
+                   stan::math::reduce_sum_static<sum_lpdf>(data, 1, msgs, arg));
+}
+
+TEST(StanMathPrim_reduce_sum_static,
+     std_vector_std_vector_eigen_row_vector_arg) {
+  stan::math::init_threadpool_tbb();
+
+  std::vector<double> data(5, 10.0);
+  std::vector<std::vector<Eigen::RowVectorXd>> arg(
+      2, std::vector<Eigen::RowVectorXd>(2, Eigen::RowVectorXd::Ones(5)));
+
+  EXPECT_DOUBLE_EQ(5 * (10 + 20),
+                   stan::math::reduce_sum_static<sum_lpdf>(data, 1, msgs, arg));
+}
+
+TEST(StanMathPrim_reduce_sum_static, std_vector_std_vector_eigen_matrix_arg) {
+  stan::math::init_threadpool_tbb();
+
+  std::vector<double> data(5, 10.0);
+  std::vector<std::vector<Eigen::MatrixXd>> arg(
+      2, std::vector<Eigen::MatrixXd>(2, Eigen::MatrixXd::Ones(5, 3)));
+
+  EXPECT_DOUBLE_EQ(5 * (10 + 60),
+                   stan::math::reduce_sum_static<sum_lpdf>(data, 1, msgs, arg));
+}
+
+TEST(StanMathPrim_reduce_sum_static, sum) {
+  stan::math::init_threadpool_tbb();
+
+  std::vector<double> data(5, 1.0);
+  int arg1 = 1;
+  double arg2 = 1.0;
+  std::vector<int> arg3(5, 1);
+  std::vector<double> arg4(5, 1.0);
+  Eigen::VectorXd arg5 = Eigen::VectorXd::Ones(5);
+  Eigen::RowVectorXd arg6 = Eigen::RowVectorXd::Ones(5);
+  Eigen::MatrixXd arg7 = Eigen::MatrixXd::Ones(5, 5);
+  std::vector<std::vector<double>> arg8(2, arg4);
+  std::vector<Eigen::VectorXd> arg9(2, arg5);
+
+  EXPECT_DOUBLE_EQ(
+      5 + 5 * (1 + 1 + 5 + 5 + 5 + 5 + 25 + 10 + 10),
+      stan::math::reduce_sum_static<sum_lpdf>(
+          data, 1, msgs, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9));
+}
+
+template <int grainsize>
+struct static_check_lpdf {
+  inline auto operator()(std::size_t start, std::size_t end,
+                         const std::vector<int>&, std::ostream* msgs,
+                         const std::vector<int>& data) const {
+    int sum = 0;
+    EXPECT_LE(end - start + 1, grainsize);
+    for (size_t i = start; i <= end; i++) {
+      sum += data[i];
+    }
+    return sum;
+  }
+};
+
+TEST(StanMathPrim_reduce_sum_static, static_check) {
+  stan::math::init_threadpool_tbb();
+
+  for (auto size : {10, 16, 19, 31}) {
+    std::vector<int> data(size, 10);
+
+    EXPECT_NO_THROW(stan::math::reduce_sum_static<static_check_lpdf<1>>(
+        data, 1, msgs, data));
+    EXPECT_NO_THROW(stan::math::reduce_sum_static<static_check_lpdf<2>>(
+        data, 2, msgs, data));
+    EXPECT_NO_THROW(stan::math::reduce_sum_static<static_check_lpdf<3>>(
+        data, 3, msgs, data));
+  }
+}
diff --git a/test/unit/math/prim/functor/reduce_sum_test.cpp b/test/unit/math/prim/functor/reduce_sum_test.cpp
new file mode 100644
index 00000000000..fe4473f3e7d
--- /dev/null
+++ b/test/unit/math/prim/functor/reduce_sum_test.cpp
@@ -0,0 +1,432 @@
+#include <stan/math/prim/core/init_threadpool_tbb.hpp>
+#include <stan/math.hpp>
+#include <gtest/gtest.h>
+#include <algorithm>
+#include <sstream>
+#include <tuple>
+#include <vector>
+
+std::ostream* msgs = nullptr;
+
+// reduce functor which is the BinaryFunction
+// here we use iterators which represent integer indices
+template <typename T>
+struct count_lpdf {
+  count_lpdf() {}
+
+  // does the reduction in the sub-slice start to end
+  inline T operator()(std::size_t start, std::size_t end,
+                      const std::vector<int>& sub_slice, std::ostream* msgs,
+                      const std::vector<T>& lambda,
+                      const std::vector<int>& idata) const {
+    return stan::math::poisson_lpmf(sub_slice, lambda[0]);
+  }
+};
+
+TEST(StanMathPrim_reduce_sum, value) {
+  stan::math::init_threadpool_tbb();
+
+  double lambda_d = 10.0;
+  const std::size_t elems = 10000;
+  const std::size_t num_iter = 1000;
+  std::vector<int> data(elems);
+
+  for (std::size_t i = 0; i != elems; ++i)
+    data[i] = i;
+
+  std::vector<int> idata;
+  std::vector<double> vlambda_d(1, lambda_d);
+
+  double poisson_lpdf = stan::math::reduce_sum<count_lpdf<double>>(
+      data, 5, msgs, vlambda_d, idata);
+
+  double poisson_lpdf_ref = stan::math::poisson_lpmf(data, lambda_d);
+
+  EXPECT_FLOAT_EQ(poisson_lpdf, poisson_lpdf_ref);
+}
+
+// ********************************
+// test if nested parallelism works
+// ********************************
+
+template <typename T>
+struct nesting_count_lpdf {
+  nesting_count_lpdf() {}
+
+  // does the reduction in the sub-slice start to end
+  inline T operator()(std::size_t start, std::size_t end,
+                      const std::vector<int>& sub_slice, std::ostream* msgs,
+                      const std::vector<T>& lambda,
+                      const std::vector<int>& idata) const {
+    return stan::math::reduce_sum<count_lpdf<T>>(sub_slice, 5, msgs, lambda,
+                                                 idata);
+  }
+};
+
+TEST(StanMathPrim_reduce_sum, nesting_value) {
+  stan::math::init_threadpool_tbb();
+
+  double lambda_d = 10.0;
+  const std::size_t elems = 10000;
+  const std::size_t num_iter = 1000;
+  std::vector<int> data(elems);
+
+  for (std::size_t i = 0; i != elems; ++i)
+    data[i] = i;
+
+  std::vector<int> idata;
+  std::vector<double> vlambda_d(1, lambda_d);
+
+  double poisson_lpdf = stan::math::reduce_sum<count_lpdf<double>>(
+      data, 5, msgs, vlambda_d, idata);
+
+  double poisson_lpdf_ref = stan::math::poisson_lpmf(data, lambda_d);
+
+  EXPECT_FLOAT_EQ(poisson_lpdf, poisson_lpdf_ref);
+}
+
+template <typename T, stan::require_stan_scalar_t<T>* = nullptr>
+T sum_(T arg) {
+  return arg;
+}
+
+template <typename EigMat, stan::require_eigen_t<EigMat>* = nullptr>
+auto sum_(EigMat&& arg) {
+  return stan::math::sum(arg);
+}
+
+template <typename Vec, stan::require_std_vector_t<Vec>* = nullptr>
+auto sum_(Vec&& arg) {
+  stan::scalar_type_t<Vec> sum = 0;
+  for (size_t i = 0; i < arg.size(); ++i) {
+    sum += sum_(arg[i]);
+  }
+  return sum;
+}
+
+struct sum_lpdf {
+  template <typename T, typename... Args>
+  inline auto operator()(std::size_t start, std::size_t end, T&& sub_slice,
+                         std::ostream* msgs, Args&&... args) const {
+    using return_type = stan::return_type_t<T, Args...>;
+
+    return sum_(sub_slice)
+           + sub_slice.size()
+                 * stan::math::sum(std::vector<return_type>{
+                       return_type(sum_(std::forward<Args>(args)))...});
+  }
+};
+
+TEST(StanMathPrim_reduce_sum, grainsize) {
+  stan::math::init_threadpool_tbb();
+
+  std::vector<int> data(5, 10);
+
+  EXPECT_THROW(stan::math::reduce_sum<sum_lpdf>(data, 0, msgs),
+               std::domain_error);
+
+  EXPECT_THROW(stan::math::reduce_sum<sum_lpdf>(data, -1, msgs),
+               std::domain_error);
+
+  EXPECT_NO_THROW(stan::math::reduce_sum<sum_lpdf>(data, 1, msgs));
+
+  EXPECT_NO_THROW(
+      stan::math::reduce_sum<sum_lpdf>(data, 3 * data.size(), msgs));
+}
+
+TEST(StanMathPrim_reduce_sum, std_vector_int_slice) {
+  stan::math::init_threadpool_tbb();
+
+  std::vector<int> data(5, 10);
+
+  EXPECT_EQ(50, stan::math::reduce_sum<sum_lpdf>(data, 1, msgs));
+}
+
+TEST(StanMathPrim_reduce_sum, std_vector_double_slice) {
+  stan::math::init_threadpool_tbb();
+
+  std::vector<double> data(5, 10.0);
+
+  EXPECT_DOUBLE_EQ(50.0, stan::math::reduce_sum<sum_lpdf>(data, 1, msgs));
+}
+
+TEST(StanMathPrim_reduce_sum, std_vector_std_vector_double_slice) {
+  stan::math::init_threadpool_tbb();
+
+  std::vector<std::vector<double>> data(5, std::vector<double>(2, 10.0));
+
+  EXPECT_DOUBLE_EQ(100.0, stan::math::reduce_sum<sum_lpdf>(data, 1, msgs));
+}
+
+TEST(StanMathPrim_reduce_sum, std_vector_std_vector_int_slice) {
+  stan::math::init_threadpool_tbb();
+
+  std::vector<std::vector<int>> data(5, std::vector<int>(2, 10));
+
+  EXPECT_EQ(100, stan::math::reduce_sum<sum_lpdf>(data, 1, msgs));
+}
+
+TEST(StanMathPrim_reduce_sum, std_vector_eigen_vector_slice) {
+  stan::math::init_threadpool_tbb();
+
+  std::vector<Eigen::VectorXd> data(5, Eigen::VectorXd::Ones(2));
+
+  EXPECT_DOUBLE_EQ(10.0, stan::math::reduce_sum<sum_lpdf>(data, 1, msgs));
+}
+
+TEST(StanMathPrim_reduce_sum, std_vector_eigen_row_vector_slice) {
+  stan::math::init_threadpool_tbb();
+
+  std::vector<Eigen::RowVectorXd> data(5, Eigen::RowVectorXd::Ones(2));
+
+  EXPECT_DOUBLE_EQ(10.0, stan::math::reduce_sum<sum_lpdf>(data, 1, msgs));
+}
+
+TEST(StanMathPrim_reduce_sum, std_vector_eigen_matrix_slice) {
+  stan::math::init_threadpool_tbb();
+
+  std::vector<Eigen::MatrixXd> data(5, Eigen::MatrixXd::Ones(2, 2));
+
+  EXPECT_DOUBLE_EQ(20.0, stan::math::reduce_sum<sum_lpdf>(data, 1, msgs));
+}
+
+TEST(StanMathPrim_reduce_sum, std_vector_std_vector_std_vector_int_slice) {
+  stan::math::init_threadpool_tbb();
+
+  std::vector<std::vector<std::vector<int>>> data(
+      5, std::vector<std::vector<int>>(2, std::vector<int>(2, 10.0)));
+
+  EXPECT_DOUBLE_EQ(200, stan::math::reduce_sum<sum_lpdf>(data, 1, msgs));
+}
+
+TEST(StanMathPrim_reduce_sum, std_vector_std_vector_std_vector_double_slice) {
+  stan::math::init_threadpool_tbb();
+
+  std::vector<std::vector<std::vector<double>>> data(
+      5, std::vector<std::vector<double>>(2, std::vector<double>(2, 10.0)));
+
+  EXPECT_DOUBLE_EQ(200.0, stan::math::reduce_sum<sum_lpdf>(data, 1, msgs));
+}
+
+TEST(StanMathPrim_reduce_sum, std_vector_std_vector_eigen_vector_slice) {
+  stan::math::init_threadpool_tbb();
+
+  std::vector<std::vector<Eigen::VectorXd>> data(
+      5, std::vector<Eigen::VectorXd>(2, Eigen::VectorXd::Ones(2)));
+
+  EXPECT_DOUBLE_EQ(20.0, stan::math::reduce_sum<sum_lpdf>(data, 1, msgs));
+}
+
+TEST(StanMathPrim_reduce_sum, std_vector_std_vector_eigen_row_vector_slice) {
+  stan::math::init_threadpool_tbb();
+
+  std::vector<std::vector<Eigen::RowVectorXd>> data(
+      5, std::vector<Eigen::RowVectorXd>(2, Eigen::RowVectorXd::Ones(2)));
+
+  EXPECT_DOUBLE_EQ(20.0, stan::math::reduce_sum<sum_lpdf>(data, 1, msgs));
+}
+
+TEST(StanMathPrim_reduce_sum, std_vector_std_vector_eigen_matrix_slice) {
+  stan::math::init_threadpool_tbb();
+
+  std::vector<std::vector<Eigen::MatrixXd>> data(
+      5, std::vector<Eigen::MatrixXd>(2, Eigen::MatrixXd::Ones(2, 2)));
+
+  EXPECT_DOUBLE_EQ(40.0, stan::math::reduce_sum<sum_lpdf>(data, 1, msgs));
+}
+
+struct start_end_lpdf {
+  inline auto operator()(std::size_t start, std::size_t end,
+                         const std::vector<int>&, std::ostream* msgs,
+                         const std::vector<int>& data) const {
+    int sum = 0;
+    EXPECT_GE(start, 0);
+    EXPECT_LE(end, data.size() - 1);
+    for (size_t i = start; i <= end; i++) {
+      sum += data[i];
+    }
+    return sum;
+  }
+};
+
+TEST(StanMathPrim_reduce_sum, start_end_slice) {
+  stan::math::init_threadpool_tbb();
+
+  std::vector<int> data(5, 10);
+
+  EXPECT_EQ(50, stan::math::reduce_sum<start_end_lpdf>(data, 1, msgs, data));
+}
+
+TEST(StanMathPrim_reduce_sum, int_arg) {
+  stan::math::init_threadpool_tbb();
+
+  std::vector<double> data(5, 10.0);
+  int arg = 5;
+
+  EXPECT_DOUBLE_EQ(5 * (10 + 5),
+                   stan::math::reduce_sum<sum_lpdf>(data, 1, msgs, arg));
+}
+
+TEST(StanMathPrim_reduce_sum, double_arg) {
+  stan::math::init_threadpool_tbb();
+
+  std::vector<double> data(5, 10.0);
+  double arg = 5.0;
+
+  EXPECT_DOUBLE_EQ(5 * (10.0 + 5.0),
+                   stan::math::reduce_sum<sum_lpdf>(data, 1, msgs, arg));
+}
+
+TEST(StanMathPrim_reduce_sum, std_vector_int_arg) {
+  stan::math::init_threadpool_tbb();
+
+  std::vector<double> data(5, 10.0);
+  std::vector<int> arg(5, 10);
+
+  EXPECT_DOUBLE_EQ(5 * (10 + 5 * 10),
+                   stan::math::reduce_sum<sum_lpdf>(data, 1, msgs, arg));
+}
+
+TEST(StanMathPrim_reduce_sum, std_vector_double_arg) {
+  stan::math::init_threadpool_tbb();
+
+  std::vector<double> data(5, 10.0);
+  std::vector<double> arg(5, 10.0);
+
+  EXPECT_DOUBLE_EQ(5 * (10 + 5 * 10),
+                   stan::math::reduce_sum<sum_lpdf>(data, 1, msgs, arg));
+}
+
+TEST(StanMathPrim_reduce_sum, eigen_vector_arg) {
+  stan::math::init_threadpool_tbb();
+
+  std::vector<double> data(5, 10.0);
+  Eigen::VectorXd arg = Eigen::VectorXd::Ones(5);
+
+  EXPECT_DOUBLE_EQ(5 * (10 + 5),
+                   stan::math::reduce_sum<sum_lpdf>(data, 1, msgs, arg));
+}
+
+TEST(StanMathPrim_reduce_sum, eigen_row_vector_arg) {
+  stan::math::init_threadpool_tbb();
+
+  std::vector<double> data(5, 10.0);
+  Eigen::RowVectorXd arg = Eigen::RowVectorXd::Ones(5);
+
+  EXPECT_DOUBLE_EQ(5 * (10 + 5),
+                   stan::math::reduce_sum<sum_lpdf>(data, 1, msgs, arg));
+}
+
+TEST(StanMathPrim_reduce_sum, eigen_matrix_arg) {
+  stan::math::init_threadpool_tbb();
+
+  std::vector<double> data(5, 10.0);
+  Eigen::MatrixXd arg = Eigen::MatrixXd::Ones(5, 5);
+
+  EXPECT_DOUBLE_EQ(5 * (10 + 5 * 5),
+                   stan::math::reduce_sum<sum_lpdf>(data, 1, msgs, arg));
+}
+
+TEST(StanMathPrim_reduce_sum, std_vector_std_vector_double_arg) {
+  stan::math::init_threadpool_tbb();
+
+  std::vector<double> data(5, 10.0);
+  std::vector<std::vector<double>> arg(5, std::vector<double>(5, 10.0));
+
+  EXPECT_DOUBLE_EQ(5 * (10 + 250),
+                   stan::math::reduce_sum<sum_lpdf>(data, 1, msgs, arg));
+}
+
+TEST(StanMathPrim_reduce_sum, std_vector_eigen_vector_arg) {
+  stan::math::init_threadpool_tbb();
+
+  std::vector<double> data(5, 10.0);
+  std::vector<Eigen::VectorXd> arg(2, Eigen::VectorXd::Ones(5));
+
+  EXPECT_DOUBLE_EQ(5 * (10 + 10),
+                   stan::math::reduce_sum<sum_lpdf>(data, 1, msgs, arg));
+}
+
+TEST(StanMathPrim_reduce_sum, std_vector_eigen_row_vector_arg) {
+  stan::math::init_threadpool_tbb();
+
+  std::vector<double> data(5, 10.0);
+  std::vector<Eigen::RowVectorXd> arg(2, Eigen::RowVectorXd::Ones(5));
+
+  EXPECT_DOUBLE_EQ(5 * (10 + 10),
+                   stan::math::reduce_sum<sum_lpdf>(data, 1, msgs, arg));
+}
+
+TEST(StanMathPrim_reduce_sum, std_vector_eigen_matrix_arg) {
+  stan::math::init_threadpool_tbb();
+
+  std::vector<double> data(5, 10.0);
+  std::vector<Eigen::MatrixXd> arg(2, Eigen::MatrixXd::Ones(5, 5));
+
+  EXPECT_DOUBLE_EQ(5 * (10 + 50),
+                   stan::math::reduce_sum<sum_lpdf>(data, 1, msgs, arg));
+}
+
+TEST(StanMathPrim_reduce_sum, std_vector_std_vector_std_vector_double_arg) {
+  stan::math::init_threadpool_tbb();
+
+  std::vector<double> data(5, 10.0);
+  std::vector<std::vector<std::vector<double>>> arg(
+      5, std::vector<std::vector<double>>(5, std::vector<double>(5, 10.0)));
+
+  EXPECT_DOUBLE_EQ(5 * (10 + 1250),
+                   stan::math::reduce_sum<sum_lpdf>(data, 1, msgs, arg));
+}
+
+TEST(StanMathPrim_reduce_sum, std_vector_std_vector_eigen_vector_arg) {
+  stan::math::init_threadpool_tbb();
+
+  std::vector<double> data(5, 10.0);
+  std::vector<std::vector<Eigen::VectorXd>> arg(
+      2, std::vector<Eigen::VectorXd>(2, Eigen::VectorXd::Ones(5)));
+
+  EXPECT_DOUBLE_EQ(5 * (10 + 20),
+                   stan::math::reduce_sum<sum_lpdf>(data, 1, msgs, arg));
+}
+
+TEST(StanMathPrim_reduce_sum, std_vector_std_vector_eigen_row_vector_arg) {
+  stan::math::init_threadpool_tbb();
+
+  std::vector<double> data(5, 10.0);
+  std::vector<std::vector<Eigen::RowVectorXd>> arg(
+      2, std::vector<Eigen::RowVectorXd>(2, Eigen::RowVectorXd::Ones(5)));
+
+  EXPECT_DOUBLE_EQ(5 * (10 + 20),
+                   stan::math::reduce_sum<sum_lpdf>(data, 1, msgs, arg));
+}
+
+TEST(StanMathPrim_reduce_sum, std_vector_std_vector_eigen_matrix_arg) {
+  stan::math::init_threadpool_tbb();
+
+  std::vector<double> data(5, 10.0);
+  std::vector<std::vector<Eigen::MatrixXd>> arg(
+      2, std::vector<Eigen::MatrixXd>(2, Eigen::MatrixXd::Ones(5, 3)));
+
+  EXPECT_DOUBLE_EQ(5 * (10 + 60),
+                   stan::math::reduce_sum<sum_lpdf>(data, 1, msgs, arg));
+}
+
+TEST(StanMathPrim_reduce_sum, sum) {
+  stan::math::init_threadpool_tbb();
+
+  std::vector<double> data(5, 1.0);
+  int arg1 = 1;
+  double arg2 = 1.0;
+  std::vector<int> arg3(5, 1);
+  std::vector<double> arg4(5, 1.0);
+  Eigen::VectorXd arg5 = Eigen::VectorXd::Ones(5);
+  Eigen::RowVectorXd arg6 = Eigen::RowVectorXd::Ones(5);
+  Eigen::MatrixXd arg7 = Eigen::MatrixXd::Ones(5, 5);
+  std::vector<std::vector<double>> arg8(2, arg4);
+  std::vector<Eigen::VectorXd> arg9(2, arg5);
+
+  EXPECT_DOUBLE_EQ(
+      5 + 5 * (1 + 1 + 5 + 5 + 5 + 5 + 25 + 10 + 10),
+      stan::math::reduce_sum<sum_lpdf>(data, 1, msgs, arg1, arg2, arg3, arg4,
+                                       arg5, arg6, arg7, arg8, arg9));
+}
diff --git a/test/unit/math/rev/functor/reduce_sum_test.cpp b/test/unit/math/rev/functor/reduce_sum_test.cpp
new file mode 100644
index 00000000000..810f4b2d52c
--- /dev/null
+++ b/test/unit/math/rev/functor/reduce_sum_test.cpp
@@ -0,0 +1,397 @@
+#include <stan/math/prim/core.hpp>
+#include <stan/math.hpp>
+#include <gtest/gtest.h>
+#include <algorithm>
+#include <sstream>
+#include <tuple>
+#include <vector>
+
+std::ostream* msgs = nullptr;
+
+// reduce functor which is the BinaryFunction
+// here we use iterators which represent integer indices
+template <typename T>
+struct count_lpdf {
+  count_lpdf() {}
+
+  // does the reduction in the sub-slice start to end
+  inline T operator()(std::size_t start, std::size_t end,
+                      const std::vector<int>& sub_slice, std::ostream* msgs,
+                      const std::vector<T>& lambda,
+                      const std::vector<int>& idata) const {
+    return stan::math::poisson_lpmf(sub_slice, lambda[0]);
+  }
+};
+
+TEST(StanMathRev_reduce_sum, value) {
+  stan::math::init_threadpool_tbb();
+
+  double lambda_d = 10.0;
+  const std::size_t elems = 10000;
+  const std::size_t num_iter = 1000;
+  std::vector<int> data(elems);
+
+  for (std::size_t i = 0; i != elems; ++i)
+    data[i] = i;
+
+  std::vector<int> idata;
+  std::vector<double> vlambda_d(1, lambda_d);
+
+  double poisson_lpdf = stan::math::reduce_sum<count_lpdf<double>>(
+      data, 5, msgs, vlambda_d, idata);
+
+  double poisson_lpdf_ref = stan::math::poisson_lpmf(data, lambda_d);
+
+  EXPECT_FLOAT_EQ(poisson_lpdf, poisson_lpdf_ref)
+      << "ref value of poisson lpdf : " << poisson_lpdf_ref << std::endl
+      << "value of poisson lpdf : " << poisson_lpdf << std::endl;
+}
+
+TEST(StanMathRev_reduce_sum, gradient) {
+  stan::math::init_threadpool_tbb();
+
+  double lambda_d = 10.0;
+  const std::size_t elems = 10000;
+  const std::size_t num_iter = 1000;
+  std::vector<int> data(elems);
+
+  for (std::size_t i = 0; i != elems; ++i)
+    data[i] = i;
+
+  using stan::math::var;
+
+  var lambda_v = lambda_d;
+
+  std::vector<int> idata;
+  std::vector<var> vlambda_v(1, lambda_v);
+
+  var poisson_lpdf = stan::math::reduce_sum<count_lpdf<var>>(data, 5, msgs,
+                                                             vlambda_v, idata);
+
+  var lambda_ref = lambda_d;
+  var poisson_lpdf_ref = stan::math::poisson_lpmf(data, lambda_ref);
+
+  EXPECT_FLOAT_EQ(value_of(poisson_lpdf), value_of(poisson_lpdf_ref));
+
+  stan::math::grad(poisson_lpdf_ref.vi_);
+  const double lambda_ref_adj = lambda_ref.adj();
+
+  stan::math::set_zero_all_adjoints();
+  stan::math::grad(poisson_lpdf.vi_);
+  const double lambda_adj = lambda_v.adj();
+
+  EXPECT_FLOAT_EQ(lambda_adj, lambda_ref_adj)
+      << "ref value of poisson lpdf : " << poisson_lpdf_ref.val() << std::endl
+      << "ref gradient wrt to lambda: " << lambda_ref_adj << std::endl
+      << "value of poisson lpdf : " << poisson_lpdf.val() << std::endl
+      << "gradient wrt to lambda: " << lambda_adj << std::endl;
+
+  stan::math::recover_memory();
+}
+TEST(StanMathRev_reduce_sum, grainsize) {
+  stan::math::init_threadpool_tbb();
+
+  double lambda_d = 10.0;
+  const std::size_t elems = 10000;
+  const std::size_t num_iter = 1000;
+  std::vector<int> data(elems);
+
+  for (std::size_t i = 0; i != elems; ++i)
+    data[i] = i;
+
+  using stan::math::var;
+
+  var lambda_v = lambda_d;
+
+  std::vector<int> idata;
+  std::vector<var> vlambda_v(1, lambda_v);
+
+  EXPECT_THROW(
+      stan::math::reduce_sum<count_lpdf<var>>(data, 0, msgs, vlambda_v, idata),
+      std::domain_error);
+
+  EXPECT_THROW(
+      stan::math::reduce_sum<count_lpdf<var>>(data, -1, msgs, vlambda_v, idata),
+      std::domain_error);
+
+  EXPECT_NO_THROW(
+      stan::math::reduce_sum<count_lpdf<var>>(data, 1, msgs, vlambda_v, idata));
+
+  EXPECT_NO_THROW(stan::math::reduce_sum<count_lpdf<var>>(data, 2 * elems, msgs,
+                                                          vlambda_v, idata));
+
+  stan::math::recover_memory();
+}
+
+// ********************************
+// test if nested parallelism works
+// ********************************
+template <typename T>
+struct nesting_count_lpdf {
+  nesting_count_lpdf() {}
+
+  // does the reduction in the sub-slice start to end
+  inline T operator()(std::size_t start, std::size_t end,
+                      const std::vector<int>& sub_slice, std::ostream* msgs,
+                      const std::vector<T>& lambda,
+                      const std::vector<int>& idata) const {
+    return stan::math::reduce_sum<count_lpdf<T>>(sub_slice, 5, msgs, lambda,
+                                                 idata);
+  }
+};
+
+TEST(StanMathRev_reduce_sum, nesting_gradient) {
+  stan::math::init_threadpool_tbb();
+
+  double lambda_d = 10.0;
+  const std::size_t elems = 10000;
+  const std::size_t num_iter = 1000;
+  std::vector<int> data(elems);
+
+  for (std::size_t i = 0; i != elems; ++i)
+    data[i] = i;
+
+  using stan::math::var;
+
+  var lambda_v = lambda_d;
+
+  std::vector<int> idata;
+  std::vector<var> vlambda_v(1, lambda_v);
+
+  var poisson_lpdf = stan::math::reduce_sum<nesting_count_lpdf<var>>(
+      data, 5, msgs, vlambda_v, idata);
+
+  var lambda_ref = lambda_d;
+  var poisson_lpdf_ref = stan::math::poisson_lpmf(data, lambda_ref);
+
+  EXPECT_FLOAT_EQ(value_of(poisson_lpdf), value_of(poisson_lpdf_ref));
+
+  stan::math::grad(poisson_lpdf_ref.vi_);
+  const double lambda_ref_adj = lambda_ref.adj();
+
+  stan::math::set_zero_all_adjoints();
+  stan::math::grad(poisson_lpdf.vi_);
+  const double lambda_adj = lambda_v.adj();
+
+  EXPECT_FLOAT_EQ(lambda_adj, lambda_ref_adj)
+      << "ref value of poisson lpdf : " << poisson_lpdf_ref.val() << std::endl
+      << "ref gradient wrt to lambda: " << lambda_ref_adj << std::endl
+      << "value of poisson lpdf : " << poisson_lpdf.val() << std::endl
+      << "gradient wrt to lambda: " << lambda_adj << std::endl;
+
+  stan::math::recover_memory();
+}
+
+// ********************************
+// basic performance test for a hierarchical model
+// ********************************
+
+template <typename T>
+struct grouped_count_lpdf {
+  grouped_count_lpdf() {}
+
+  // does the reduction in the sub-slice start to end
+  template <typename VecInt1, typename VecT, typename VecInt2>
+  inline T operator()(std::size_t start, std::size_t end, VecInt1&& sub_slice,
+                      std::ostream* msgs, VecT&& lambda, VecInt2&& gidx) const {
+    const std::size_t num_terms = end - start + 1;
+    // std::cout << "sub-slice " << start << " - " << end << "; num_terms = " <<
+    // num_terms << "; size = " << sub_slice.size() << std::endl;
+    std::decay_t<VecT> lambda_slice(num_terms);
+    for (std::size_t i = 0; i != num_terms; ++i)
+      lambda_slice[i] = lambda[gidx[start + i]];
+
+    return stan::math::poisson_lpmf(sub_slice, lambda_slice);
+  }
+};
+
+TEST(StanMathRev_reduce_sum, grouped_gradient) {
+  stan::math::init_threadpool_tbb();
+
+  double lambda_d = 10.0;
+  const std::size_t groups = 10;
+  const std::size_t elems_per_group = 1000;
+  const std::size_t elems = groups * elems_per_group;
+  const std::size_t num_iter = 1000;
+
+  std::vector<int> data(elems);
+  std::vector<int> gidx(elems);
+
+  for (std::size_t i = 0; i != elems; ++i) {
+    data[i] = i;
+    gidx[i] = i / elems_per_group;
+  }
+
+  using stan::math::var;
+
+  std::vector<var> vlambda_v;
+
+  for (std::size_t i = 0; i != groups; ++i)
+    vlambda_v.push_back(i + 0.2);
+
+  var lambda_v = vlambda_v[0];
+
+  var poisson_lpdf = stan::math::reduce_sum<grouped_count_lpdf<var>>(
+      data, 5, msgs, vlambda_v, gidx);
+
+  std::vector<var> vref_lambda_v;
+  for (std::size_t i = 0; i != elems; ++i) {
+    vref_lambda_v.push_back(vlambda_v[gidx[i]]);
+  }
+  var lambda_ref = vlambda_v[0];
+  var poisson_lpdf_ref = stan::math::poisson_lpmf(data, vref_lambda_v);
+
+  EXPECT_FLOAT_EQ(value_of(poisson_lpdf), value_of(poisson_lpdf_ref));
+
+  stan::math::grad(poisson_lpdf_ref.vi_);
+  const double lambda_ref_adj = lambda_ref.adj();
+
+  stan::math::set_zero_all_adjoints();
+  stan::math::grad(poisson_lpdf.vi_);
+  const double lambda_adj = lambda_v.adj();
+
+  EXPECT_FLOAT_EQ(lambda_adj, lambda_ref_adj)
+      << "ref value of poisson lpdf : " << poisson_lpdf_ref.val() << std::endl
+      << "ref gradient wrt to lambda: " << lambda_ref_adj << std::endl
+      << "value of poisson lpdf : " << poisson_lpdf.val() << std::endl
+      << "gradient wrt to lambda: " << lambda_adj << std::endl;
+
+  stan::math::recover_memory();
+}
+
+TEST(StanMathRev_reduce_sum, grouped_gradient_eigen) {
+  stan::math::init_threadpool_tbb();
+
+  double lambda_d = 10.0;
+  const std::size_t groups = 10;
+  const std::size_t elems_per_group = 1000;
+  const std::size_t elems = groups * elems_per_group;
+  const std::size_t num_iter = 1000;
+
+  std::vector<int> data(elems);
+  std::vector<int> gidx(elems);
+
+  for (std::size_t i = 0; i != elems; ++i) {
+    data[i] = i;
+    gidx[i] = i / elems_per_group;
+  }
+  using stan::math::var;
+
+  Eigen::Matrix<var, -1, 1> vlambda_v(groups);
+
+  for (std::size_t i = 0; i != groups; ++i)
+    vlambda_v[i] = i + 0.2;
+  var lambda_v = vlambda_v[0];
+
+  var poisson_lpdf = stan::math::reduce_sum<grouped_count_lpdf<var>>(
+      data, 5, msgs, vlambda_v, gidx);
+
+  std::vector<var> vref_lambda_v;
+  for (std::size_t i = 0; i != elems; ++i) {
+    vref_lambda_v.push_back(vlambda_v[gidx[i]]);
+  }
+  var lambda_ref = vlambda_v[0];
+
+  var poisson_lpdf_ref = stan::math::poisson_lpmf(data, vref_lambda_v);
+
+  EXPECT_FLOAT_EQ(value_of(poisson_lpdf), value_of(poisson_lpdf_ref));
+
+  stan::math::grad(poisson_lpdf_ref.vi_);
+  const double lambda_ref_adj = lambda_ref.adj();
+
+  stan::math::set_zero_all_adjoints();
+  stan::math::grad(poisson_lpdf.vi_);
+  const double lambda_adj = lambda_v.adj();
+
+  EXPECT_FLOAT_EQ(lambda_adj, lambda_ref_adj)
+      << "ref value of poisson lpdf : " << poisson_lpdf_ref.val() << std::endl
+      << "ref gradient wrt to lambda: " << lambda_ref_adj << std::endl
+      << "value of poisson lpdf : " << poisson_lpdf.val() << std::endl
+      << "gradient wrt to lambda: " << lambda_adj << std::endl;
+
+  stan::math::recover_memory();
+}
+
+// ********************************
+// slice over the grouping variable which is a var
+// ********************************
+
+template <typename T>
+struct slice_group_count_lpdf {
+  slice_group_count_lpdf() {}
+
+  // does the reduction in the sub-slice start to end
+  inline T operator()(std::size_t start, std::size_t end,
+                      const std::vector<T>& lambda_slice, std::ostream* msgs,
+                      const std::vector<int>& y,
+                      const std::vector<int>& gsidx) const {
+    const std::size_t num_groups = end - start + 1;
+    T result = 0.0;
+    for (std::size_t i = 0; i != num_groups; ++i) {
+      std::vector<int> y_group(y.begin() + gsidx[start + i],
+                               y.begin() + gsidx[start + i + 1]);
+      result += stan::math::poisson_lpmf(y_group, lambda_slice[i]);
+    }
+    return result;
+  }
+};
+
+TEST(StanMathRev_reduce_sum, slice_group_gradient) {
+  stan::math::init_threadpool_tbb();
+
+  double lambda_d = 10.0;
+  const std::size_t groups = 10;
+  const std::size_t elems_per_group = 1000;
+  const std::size_t elems = groups * elems_per_group;
+  const std::size_t num_iter = 1000;
+
+  std::vector<int> data(elems);
+  std::vector<int> gidx(elems);
+  std::vector<int> gsidx(groups + 1);
+
+  for (std::size_t i = 0, k = 0; i != groups; ++i) {
+    gsidx[i] = k;
+    for (std::size_t j = 0; j != elems_per_group; ++j, ++k) {
+      data[k] = k;
+      gidx[k] = i;
+    }
+    gsidx[i + 1] = k;
+  }
+
+  using stan::math::var;
+
+  std::vector<var> vlambda_v;
+
+  for (std::size_t i = 0; i != groups; ++i)
+    vlambda_v.push_back(i + 0.2);
+
+  var lambda_v = vlambda_v[0];
+
+  var poisson_lpdf = stan::math::reduce_sum<slice_group_count_lpdf<var>>(
+      vlambda_v, 5, msgs, data, gsidx);
+
+  std::vector<var> vref_lambda_v;
+  for (std::size_t i = 0; i != elems; ++i) {
+    vref_lambda_v.push_back(vlambda_v[gidx[i]]);
+  }
+  var lambda_ref = vlambda_v[0];
+
+  var poisson_lpdf_ref = stan::math::poisson_lpmf(data, vref_lambda_v);
+
+  EXPECT_FLOAT_EQ(value_of(poisson_lpdf), value_of(poisson_lpdf_ref));
+
+  stan::math::grad(poisson_lpdf_ref.vi_);
+  const double lambda_ref_adj = lambda_ref.adj();
+
+  stan::math::set_zero_all_adjoints();
+  stan::math::grad(poisson_lpdf.vi_);
+  const double lambda_adj = lambda_v.adj();
+
+  EXPECT_FLOAT_EQ(lambda_adj, lambda_ref_adj)
+      << "ref value of poisson lpdf : " << poisson_lpdf_ref.val() << std::endl
+      << "ref gradient wrt to lambda: " << lambda_ref_adj << std::endl
+      << "value of poisson lpdf : " << poisson_lpdf.val() << std::endl
+      << "gradient wrt to lambda: " << lambda_adj << std::endl;
+
+  stan::math::recover_memory();
+}