diff --git a/.gitignore b/.gitignore index 6db11dcac..13f267d9d 100644 --- a/.gitignore +++ b/.gitignore @@ -13,5 +13,6 @@ *.orig __pycache__/ view +*.cache* /_serac_build_and_test* build-linux-*-*-* diff --git a/.gitlab/build_blueos.yml b/.gitlab/build_blueos.yml index eff30dd57..acca63b60 100644 --- a/.gitlab/build_blueos.yml +++ b/.gitlab/build_blueos.yml @@ -45,7 +45,7 @@ blueos-clang_10_0_1-src: HOST_CONFIG: "lassen-blueos_3_ppc64le_ib_p9-${COMPILER}_cuda.cmake" EXTRA_CMAKE_OPTIONS: "-DENABLE_BENCHMARKS=ON -DENABLE_DOCS=OFF" ALLOC_NODES: "1" - ALLOC_TIME: "30" + ALLOC_TIME: "45" extends: [.src_build_on_blueos, .with_cuda] # Note: to reduce duplication SPEC is not defined here, if we move to more than one diff --git a/.gitlab/build_toss4.yml b/.gitlab/build_toss4.yml index c4f83eabc..0b46b6e22 100644 --- a/.gitlab/build_toss4.yml +++ b/.gitlab/build_toss4.yml @@ -2,7 +2,7 @@ # This is the shared configuration of jobs for toss4 .on_toss4: variables: - SCHEDULER_PARAMETERS: "--res=ci --exclusive=user --deadline=now+1hour -N ${ALLOC_NODES} -t ${ALLOC_TIME} -A ${ALLOC_BANK}" + SCHEDULER_PARAMETERS: "--res=ci --exclusive=user --deadline=now+90minutes -N ${ALLOC_NODES} -t ${ALLOC_TIME} -A ${ALLOC_BANK}" tags: - batch - ruby @@ -40,7 +40,7 @@ toss4-clang_14_0_6-src: EXTRA_CMAKE_OPTIONS: "-DENABLE_BENCHMARKS=ON" DO_INTEGRATION_TESTS: "yes" ALLOC_NODES: "2" - ALLOC_TIME: "30" + ALLOC_TIME: "45" extends: .src_build_on_toss4 toss4-gcc_10_3_1-src: @@ -49,7 +49,7 @@ toss4-gcc_10_3_1-src: HOST_CONFIG: "ruby-toss_4_x86_64_ib-${COMPILER}.cmake" EXTRA_CMAKE_OPTIONS: "-DENABLE_BENCHMARKS=ON" ALLOC_NODES: "1" - ALLOC_TIME: "30" + ALLOC_TIME: "45" extends: .src_build_on_toss4 toss4-gcc_10_3_1-src-no-tribol: @@ -58,7 +58,7 @@ toss4-gcc_10_3_1-src-no-tribol: HOST_CONFIG: "ruby-toss_4_x86_64_ib-${COMPILER}.cmake" EXTRA_CMAKE_OPTIONS: "-DENABLE_BENCHMARKS=ON -UTRIBOL_DIR" ALLOC_NODES: "1" - ALLOC_TIME: "30" + ALLOC_TIME: "45" extends: .src_build_on_toss4 toss4-gcc_10_3_1-src-no-optional-solvers: @@ -67,7 +67,7 @@ toss4-gcc_10_3_1-src-no-optional-solvers: HOST_CONFIG: "ruby-toss_4_x86_64_ib-${COMPILER}.cmake" EXTRA_CMAKE_OPTIONS: "-DENABLE_BENCHMARKS=ON -USUNDIALS_DIR -UPETSC_DIR" ALLOC_NODES: "1" - ALLOC_TIME: "20" + ALLOC_TIME: "45" extends: .src_build_on_toss4 toss4-clang_14_0_6-full: diff --git a/src/serac/infrastructure/accelerator.cpp b/src/serac/infrastructure/accelerator.cpp index cdce5eba4..cfa670dd3 100644 --- a/src/serac/infrastructure/accelerator.cpp +++ b/src/serac/infrastructure/accelerator.cpp @@ -8,8 +8,6 @@ #include -#include "mfem.hpp" - #include "serac/infrastructure/logger.hpp" namespace serac { @@ -21,19 +19,23 @@ namespace { std::unique_ptr device; } // namespace -void initializeDevice() +void initializeDevice(ExecutionSpace exec) { SLIC_ERROR_ROOT_IF(device, "serac::accelerator::initializeDevice cannot be called more than once"); device = std::make_unique(); -#if defined(MFEM_USE_CUDA) && defined(SERAC_USE_CUDA_KERNEL_EVALUATION) - device->Configure("cuda"); + if (exec == ExecutionSpace::GPU) { +#if defined(MFEM_USE_CUDA) + device->Configure("cuda"); #endif + } } -void terminateDevice() +void terminateDevice(ExecutionSpace exec) { // Idempotent, no adverse affects if called multiple times - device.reset(); + if (exec == ExecutionSpace::GPU) { + device.reset(); + } } } // namespace accelerator diff --git a/src/serac/infrastructure/accelerator.hpp b/src/serac/infrastructure/accelerator.hpp index 180416d0d..b5c007d61 100644 --- a/src/serac/infrastructure/accelerator.hpp +++ 
b/src/serac/infrastructure/accelerator.hpp
@@ -12,7 +12,6 @@
  */
 #pragma once
-
 #if defined(__CUDACC__)
 #define SERAC_HOST_DEVICE __host__ __device__
 #define SERAC_HOST __host__
@@ -49,7 +48,7 @@
  */
 #define SERAC_SUPPRESS_NVCC_HOSTDEVICE_WARNING
 #endif
-
+#include "RAJA/RAJA.hpp"
 #include
 #include "axom/core.hpp"
@@ -72,6 +71,40 @@ enum class ExecutionSpace
 {
   Dynamic  // Corresponds to execution that can "legally" happen on either the host or device
 };
+/**
+ * @brief Helper struct used for deducing RAJA execution policies from serac::ExecutionSpace
+ */
+template
+struct EvaluationSpacePolicy;
+
+/// @overload
+template <>
+struct EvaluationSpacePolicy {
+  /// @brief Alias for loop policy.
+  using threads_t = RAJA::LoopPolicy;
+  /// @brief Alias for number of teams for CPU kernel launches.
+  using teams_t = RAJA::LoopPolicy;
+  /// @brief Alias for CPU kernel launch policy.
+  using launch_t = RAJA::LaunchPolicy;
+  /// @brief Alias for forall policy.
+  using forall_t = RAJA::seq_exec;
+};
+
+#if defined(__CUDACC__)
+/// @overload
+template <>
+struct EvaluationSpacePolicy {
+  /// @brief Alias for loop policy.
+  using threads_t = RAJA::LoopPolicy;
+  /// @brief Alias for number of teams for GPU kernel launches.
+  using teams_t = RAJA::LoopPolicy;
+  /// @brief Alias for GPU kernel launch policy.
+  using launch_t = RAJA::LaunchPolicy>;
+  /// @brief Alias for forall policy.
+  using forall_t = RAJA::cuda_exec<128>;
+};
+#endif
+
 /**
  * @brief The default execution space for serac builds
  */
@@ -88,25 +121,33 @@ struct execution_to_memory {
   static constexpr axom::MemorySpace value = axom::MemorySpace::Dynamic;
 };
-#ifdef SERAC_USE_UMPIRE
+/// @brief This helper is needed to suppress -Werror compilation errors caused by the
+/// explicit captures in the main execution lambdas.
+template
+SERAC_HOST_DEVICE void suppress_capture_warnings(T...)
+{ +} + /// @overload template <> struct execution_to_memory { + /// @brief axom memory space corresponding to ExecutionSpace static constexpr axom::MemorySpace value = axom::MemorySpace::Host; }; /// @overload template <> struct execution_to_memory { + /// @brief axom memory space corresponding to ExecutionSpace static constexpr axom::MemorySpace value = axom::MemorySpace::Device; }; /// @overload template <> struct execution_to_memory { + /// @brief axom memory space corresponding to ExecutionSpace static constexpr axom::MemorySpace value = axom::MemorySpace::Unified; }; -#endif /// @brief Helper template for @p execution_to_memory trait template @@ -121,7 +162,7 @@ void zero_out(axom::Array& arr) #ifdef __CUDACC__ /// @overload template -void zero_out(axom::Array>& arr) +void zero_out(axom::Array& arr) { cudaMemset(arr.data(), 0, static_cast(arr.size()) * sizeof(T)); } @@ -191,12 +232,12 @@ namespace accelerator { * * @note This function should only be called once */ -void initializeDevice(); +void initializeDevice(ExecutionSpace exec = ExecutionSpace::CPU); /** * @brief Cleans up the device, if applicable */ -void terminateDevice(); +void terminateDevice(ExecutionSpace exec = ExecutionSpace::CPU); #if defined(__CUDACC__) diff --git a/src/serac/infrastructure/debug_print.hpp b/src/serac/infrastructure/debug_print.hpp index 485f058af..edc618517 100644 --- a/src/serac/infrastructure/debug_print.hpp +++ b/src/serac/infrastructure/debug_print.hpp @@ -131,16 +131,18 @@ void printCUDAMemUsage() { int deviceCount = 0; cudaGetDeviceCount(&deviceCount); - int i = 0; - cudaSetDevice(i); + for (int i = 0; i < deviceCount; ++i) { + cudaSetDevice(i); - size_t freeBytes, totalBytes; - cudaMemGetInfo(&freeBytes, &totalBytes); - size_t usedBytes = totalBytes - freeBytes; + size_t freeBytes, totalBytes; + cudaMemGetInfo(&freeBytes, &totalBytes); + size_t usedBytes = totalBytes - freeBytes; - std::cout << "Device Number: " << i << std::endl; - std::cout << " Total Memory (MB): " << (totalBytes / 1024.0 / 1024.0) << std::endl; - std::cout << " Free Memory (MB): " << (freeBytes / 1024.0 / 1024.0) << std::endl; - std::cout << " Used Memory (MB): " << (usedBytes / 1024.0 / 1024.0) << std::endl; + std::cout << "Device Number: " << i << std::endl; + std::cout << " Total Memory (MB): " << (totalBytes / 1024.0 / 1024.0) << std::endl; + std::cout << " Free Memory (MB): " << (freeBytes / 1024.0 / 1024.0) << std::endl; + std::cout << " Used Memory (MB): " << (usedBytes / 1024.0 / 1024.0) << std::endl; + } + cudaSetDevice(0); } #endif diff --git a/src/serac/numerics/functional/CMakeLists.txt b/src/serac/numerics/functional/CMakeLists.txt index bd2f124f4..9d7f03e70 100644 --- a/src/serac/numerics/functional/CMakeLists.txt +++ b/src/serac/numerics/functional/CMakeLists.txt @@ -33,10 +33,9 @@ set(functional_headers tuple_tensor_dual_functions.hpp ) -set(functional_sources - domain.cpp - element_restriction.cpp - geometric_factors.cpp +set(functional_sources + domain.cpp + element_restriction.cpp quadrature_data.cpp) set(functional_detail_headers @@ -66,7 +65,7 @@ blt_list_append(TO functional_headers ELEMENTS ${functional_cuda_headers} IF ENA blt_add_library( NAME serac_functional - HEADERS ${functional_headers} ${functional_detail_headers} + HEADERS ${functional_headers} ${functional_detail_headers} SOURCES ${functional_sources} DEPENDS_ON ${functional_depends} ) diff --git a/src/serac/numerics/functional/boundary_integral_kernels.hpp b/src/serac/numerics/functional/boundary_integral_kernels.hpp index 
9af85ba8d..e896f60c4 100644 --- a/src/serac/numerics/functional/boundary_integral_kernels.hpp +++ b/src/serac/numerics/functional/boundary_integral_kernels.hpp @@ -5,12 +5,16 @@ // SPDX-License-Identifier: (BSD-3-Clause) #pragma once +#include #include +#include "serac/infrastructure/accelerator.hpp" #include "serac/serac_config.hpp" #include "serac/numerics/functional/quadrature_data.hpp" #include "serac/numerics/functional/differentiate_wrt.hpp" +#include "RAJA/RAJA.hpp" + namespace serac { namespace boundary_integral { @@ -156,7 +160,7 @@ SERAC_HOST_DEVICE auto batch_apply_qf(lambda qf, double t, const tensor void evaluation_kernel_impl(trial_element_type trial_elements, test_element, double t, const std::vector& inputs, double* outputs, const double* positions, @@ -165,43 +169,89 @@ void evaluation_kernel_impl(trial_element_type trial_elements, test_element, dou { // mfem provides this information as opaque arrays of doubles, // so we reinterpret the pointer with - constexpr int dim = dimension_of(geom) + 1; - constexpr int nqp = num_quadrature_points(geom, Q); - auto J = reinterpret_cast*>(jacobians); - auto x = reinterpret_cast*>(positions); - auto r = reinterpret_cast(outputs); - static constexpr TensorProductQuadratureRule rule{}; + constexpr int dim = dimension_of(geom) + 1; + constexpr int nqp = num_quadrature_points(geom, Q); + using X_Type = const tensor; + using J_Type = const tensor; + auto J = reinterpret_cast(jacobians); + auto x = reinterpret_cast(positions); + auto r = reinterpret_cast(outputs); + constexpr TensorProductQuadratureRule rule{}; - static constexpr int qpts_per_elem = num_quadrature_points(geom, Q); + constexpr int qpts_per_elem = num_quadrature_points(geom, Q); [[maybe_unused]] tuple u = { reinterpret_cast(trial_elements))::dof_type*>(inputs[indices])...}; + using interpolate_out_type = decltype(tuple{get(trial_elements).template interpolate_output_helper()...}); + using qf_inputs_type = decltype(tuple{promote_each_to_dual_when_output_helper( + get(trial_elements).template interpolate_output_helper())...}); + // Determine allocation destination based on exec parameter + std::string device_name = exec == ExecutionSpace::GPU ? 
"DEVICE" : "HOST"; + auto& rm = umpire::ResourceManager::getInstance(); + auto allocator = rm.getAllocator(device_name); + // Perform allocations + qf_inputs_type* qf_inputs = static_cast(allocator.allocate(sizeof(qf_inputs_type) * num_elements)); + interpolate_out_type* interpolate_result = + static_cast(allocator.allocate(sizeof(interpolate_out_type) * num_elements)); + // Zero out memory + rm.memset(qf_inputs, 0); + rm.memset(interpolate_result, 0); + + auto e_range = RAJA::TypedRangeSegment(0, num_elements); // for each element in the domain - for (uint32_t e = 0; e < num_elements; e++) { - // load the jacobians and positions for each quadrature point in this element - auto J_e = J[e]; - auto x_e = x[e]; - - // batch-calculate values / derivatives of each trial space, at each quadrature point - [[maybe_unused]] tuple qf_inputs = {promote_each_to_dual_when( - get(trial_elements).interpolate(get(u)[elements[e]], rule))...}; - - // (batch) evalute the q-function at each quadrature point - auto qf_outputs = batch_apply_qf(qf, t, x_e, J_e, get(qf_inputs)...); - - // write out the q-function derivatives after applying the - // physical_to_parent transformation, so that those transformations - // won't need to be applied in the action_of_gradient and element_gradient kernels - if constexpr (differentiation_index != serac::NO_DIFFERENTIATION) { - for (int q = 0; q < leading_dimension(qf_outputs); q++) { - qf_derivatives[e * qpts_per_elem + uint32_t(q)] = get_gradient(qf_outputs[q]); - } - } - - // (batch) integrate the material response against the test-space basis functions - test_element::integrate(get_value(qf_outputs), rule, &r[elements[e]]); - } + RAJA::launch::launch_t>( + RAJA::LaunchParams(RAJA::Teams(static_cast(num_elements)), RAJA::Threads(BLOCK_SZ)), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + RAJA::loop::teams_t>( + ctx, e_range, + // The explicit capture list is needed here because the capture occurs in a function + // template with a variadic non-type parameter. + [&ctx, t, J, x, u, qf, trial_elements, qpts_per_elem, rule, r, elements, qf_derivatives, qf_inputs, + interpolate_result](uint32_t e) { + // These casts are needed to suppres -Werror compilation errors + // caused by the explicit capture above. 
+ (void)qpts_per_elem; + detail::suppress_capture_warnings(qf_derivatives, qpts_per_elem, trial_elements, qf_inputs, + interpolate_result, u); + + // batch-calculate values / derivatives of each trial space, at each quadrature point + (get(trial_elements) + .interpolate(get(u)[elements[e]], rule, &get(interpolate_result[e]), ctx), + ...); + + ctx.teamSync(); + + (promote_each_to_dual_when(get(interpolate_result[e]), + &get(qf_inputs[e]), ctx), + ...); + + ctx.teamSync(); + + // (batch) evalute the q-function at each quadrature point + RAJA_TEAM_SHARED decltype(batch_apply_qf(qf, t, x[e], J[e], get(qf_inputs[e])...)) qf_outputs; + qf_outputs = batch_apply_qf(qf, t, x[e], J[e], get(qf_inputs[e])...); + + ctx.teamSync(); + + // write out the q-function derivatives after applying the + // physical_to_parent transformation, so that those transformations + // won't need to be applied in the action_of_gradient and element_gradient kernels + if constexpr (differentiation_index != serac::NO_DIFFERENTIATION) { + RAJA::RangeSegment x_range(0, leading_dimension(qf_outputs)); + RAJA::loop::threads_t>(ctx, x_range, [&](int q) { + qf_derivatives[e * qpts_per_elem + uint32_t(q)] = get_gradient(qf_outputs[q]); + }); + } + + ctx.teamSync(); + + // (batch) integrate the material response against the test-space basis functions + test_element::integrate(get_value(qf_outputs), rule, &r[elements[e]], ctx); + }); + }); + allocator.deallocate(qf_inputs); + allocator.deallocate(interpolate_result); } //clang-format off @@ -213,14 +263,14 @@ SERAC_HOST_DEVICE auto chain_rule(const S& dfdx, const T& dx) } //clang-format on -template -SERAC_HOST_DEVICE auto batch_apply_chain_rule(derivative_type* qf_derivatives, const tensor& inputs) +template +SERAC_HOST_DEVICE auto batch_apply_chain_rule( + derivative_type* qf_derivatives, const tensor& inputs, + tensor, n>& outputs, const RAJA::LaunchContext& ctx) { - using return_type = decltype(chain_rule(derivative_type{}, T{})); - tensor, n> outputs{}; - for (int i = 0; i < n; i++) { - get<0>(outputs[i]) = chain_rule(qf_derivatives[i], inputs[i]); - } + RAJA::RangeSegment i_range(0, n); + RAJA::loop::threads_t>( + ctx, i_range, [&](int i) { get<0>(outputs[i]) = chain_rule(qf_derivatives[i], inputs[i]); }); return outputs; } @@ -249,31 +299,53 @@ SERAC_HOST_DEVICE auto batch_apply_chain_rule(derivative_type* qf_derivatives, c * @see mfem::GeometricFactors * @param[in] num_elements The number of elements in the mesh */ -template +template void action_of_gradient_kernel(const double* dU, double* dR, derivatives_type* qf_derivatives, const int* elements, std::size_t num_elements) { - using test_element = finite_element; - using trial_element = finite_element; + using test_element = finite_element; + using trial_element = finite_element; + using qf_inputs_type = decltype(trial_element::template interpolate_output_helper()); // mfem provides this information in 1D arrays, so we reshape it // into strided multidimensional arrays before using - constexpr int nqp = num_quadrature_points(geom, Q); - auto du = reinterpret_cast(dU); - auto dr = reinterpret_cast(dR); - static constexpr TensorProductQuadratureRule rule{}; + constexpr int nqp = num_quadrature_points(geom, Q); + std::string device_name = exec == ExecutionSpace::GPU ? 
"DEVICE" : "HOST"; + constexpr TensorProductQuadratureRule rule{}; + + auto du = reinterpret_cast(dU); + auto dr = reinterpret_cast(dR); + auto e_range = RAJA::TypedRangeSegment(0, num_elements); + + auto& rm = umpire::ResourceManager::getInstance(); + auto allocator = rm.getAllocator(device_name); + qf_inputs_type* qf_inputs = static_cast(allocator.allocate(sizeof(qf_inputs_type) * num_elements)); + rm.memset(qf_inputs, 0); + // This typedef is needed to declare qf_outputs in shared memory. + using qf_outputs_type = + tensor, nqp>; // for each element in the domain - for (uint32_t e = 0; e < num_elements; e++) { - // (batch) interpolate each quadrature point's value - auto qf_inputs = trial_element::interpolate(du[elements[e]], rule); - - // (batch) evalute the q-function at each quadrature point - auto qf_outputs = batch_apply_chain_rule(qf_derivatives + e * nqp, qf_inputs); - - // (batch) integrate the material response against the test-space basis functions - test_element::integrate(qf_outputs, rule, &dr[elements[e]]); - } + RAJA::launch::launch_t>( + RAJA::LaunchParams(RAJA::Teams(static_cast(num_elements)), RAJA::Threads(BLOCK_SZ)), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + RAJA::loop::teams_t>(ctx, e_range, [&](int e) { + // (batch) interpolate each quadrature point's value + trial_element::interpolate(du[elements[e]], rule, &(qf_inputs[e]), ctx); + ctx.teamSync(); + + // (batch) evalute the q-function at each quadrature point + RAJA_TEAM_SHARED qf_outputs_type qf_outputs; + batch_apply_chain_rule(qf_derivatives + e * nqp, qf_inputs[e], qf_outputs, ctx); + ctx.teamSync(); + + // (batch) integrate the material response against the test-space basis functions + test_element::integrate(qf_outputs, rule, &dr[elements[e]], ctx); + ctx.teamSync(); + }); + }); + rm.deallocate(qf_inputs); } /** @@ -297,65 +369,86 @@ void action_of_gradient_kernel(const double* dU, double* dR, derivatives_type* q * @see mfem::GeometricFactors * @param[in] num_elements The number of elements in the mesh */ -template -void element_gradient_kernel(ExecArrayView dK, derivatives_type* qf_derivatives, - const int* elements, std::size_t num_elements) +template +void element_gradient_kernel(ExecArrayView dK, derivatives_type* qf_derivatives, const int* elements, + std::size_t num_elements) { - using test_element = finite_element; - using trial_element = finite_element; + using test_element = finite_element; + using trial_element = finite_element; - constexpr int nquad = num_quadrature_points(g, Q); + std::string device_name = exec == ExecutionSpace::GPU ? 
"DEVICE" : "HOST"; + constexpr bool is_QOI = test::family == Family::QOI; + using padded_derivative_type [[maybe_unused]] = + std::conditional_t, derivatives_type>; - static constexpr TensorProductQuadratureRule rule{}; + RAJA::TypedRangeSegment elements_range(0, num_elements); + constexpr int nquad = num_quadrature_points(g, Q); + constexpr TensorProductQuadratureRule rule{}; // for each element in the domain - for (uint32_t e = 0; e < num_elements; e++) { - auto* output_ptr = reinterpret_cast(&dK(elements[e], 0, 0)); - - tensor derivatives{}; - for (int q = 0; q < nquad; q++) { - derivatives(q) = qf_derivatives[e * nquad + uint32_t(q)]; - } - - for (int J = 0; J < trial_element::ndof; J++) { - auto source_and_flux = trial_element::batch_apply_shape_fn(J, derivatives, rule); - test_element::integrate(source_and_flux, rule, output_ptr + J, trial_element::ndof); - } - } + RAJA::launch::launch_t>( + RAJA::LaunchParams(RAJA::Teams(static_cast(num_elements)), RAJA::Threads(BLOCK_SZ)), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + RAJA::loop::teams_t>( + ctx, elements_range, [&ctx, dK, elements, qf_derivatives, nquad, rule](uint32_t e) { + (void)nquad; + auto* output_ptr = reinterpret_cast(&dK(elements[e], 0, 0)); + + RAJA_TEAM_SHARED tensor derivatives; + RAJA::RangeSegment x_range(0, nquad); + RAJA::loop::threads_t>( + ctx, x_range, [&](int q) { derivatives(q) = qf_derivatives[e * nquad + uint32_t(q)]; }); + + ctx.teamSync(); + + RAJA_TEAM_SHARED + typename trial_element::template batch_apply_shape_fn_output::type source_and_flux; + for (int J = 0; J < trial_element::ndof; J++) { + trial_element::batch_apply_shape_fn(J, derivatives, &source_and_flux, rule, ctx); + ctx.teamSync(); + + test_element::integrate(source_and_flux, rule, output_ptr + J, ctx, trial_element::ndof); + ctx.teamSync(); + } + }); + }); } -template auto evaluation_kernel(signature s, lambda_type qf, const double* positions, const double* jacobians, std::shared_ptr qf_derivatives, const int* elements, uint32_t num_elements) { - auto trial_elements = trial_elements_tuple(s); - auto test_element = get_test_element(s); + auto trial_elements = trial_elements_tuple(s); + auto test_element = get_test_element(s); return [=](double time, const std::vector& inputs, double* outputs, bool /* update state */) { - evaluation_kernel_impl(trial_elements, test_element, time, inputs, outputs, positions, jacobians, qf, - qf_derivatives.get(), elements, num_elements, s.index_seq); + evaluation_kernel_impl(trial_elements, test_element, time, inputs, outputs, positions, + jacobians, qf, qf_derivatives.get(), elements, num_elements, + s.index_seq); }; } -template +template std::function jacobian_vector_product_kernel( signature, std::shared_ptr qf_derivatives, const int* elements, uint32_t num_elements) { return [=](const double* du, double* dr) { using test_space = typename signature::return_type; using trial_space = typename std::tuple_element::type; - action_of_gradient_kernel(du, dr, qf_derivatives.get(), elements, num_elements); + action_of_gradient_kernel(du, dr, qf_derivatives.get(), elements, + num_elements); }; } -template -std::function)> element_gradient_kernel( +template +std::function)> element_gradient_kernel( signature, std::shared_ptr qf_derivatives, const int* elements, uint32_t num_elements) { - return [=](ExecArrayView K_elem) { + return [=](ExecArrayView K_elem) { using test_space = typename signature::return_type; using trial_space = typename std::tuple_element::type; - element_gradient_kernel(K_elem, 
qf_derivatives.get(), elements, num_elements); + element_gradient_kernel(K_elem, qf_derivatives.get(), elements, + num_elements); }; } diff --git a/src/serac/numerics/functional/detail/hexahedron_H1.inl b/src/serac/numerics/functional/detail/hexahedron_H1.inl index c28b15fa0..10a538368 100644 --- a/src/serac/numerics/functional/detail/hexahedron_H1.inl +++ b/src/serac/numerics/functional/detail/hexahedron_H1.inl @@ -10,14 +10,18 @@ * @brief Specialization of finite_element for H1 on hexahedron geometry */ +#include "RAJA/RAJA.hpp" +#include "serac/infrastructure/accelerator.hpp" +#include "serac/numerics/functional/tensor.hpp" + // this specialization defines shape functions (and their gradients) that // interpolate at Gauss-Lobatto nodes for the appropriate polynomial order // // note: mfem assumes the parent element domain is [0,1]x[0,1]x[0,1] // for additional information on the finite_element concept requirements, see finite_element.hpp /// @cond -template -struct finite_element > { +template +struct finite_element, exec> { static constexpr auto geometry = mfem::Geometry::CUBE; static constexpr auto family = Family::H1; static constexpr int components = c; @@ -31,11 +35,18 @@ struct finite_element > { using dof_type = tensor; - using value_type = typename std::conditional >::type; + using value_type = typename std::conditional>::type; using derivative_type = - typename std::conditional, tensor >::type; + typename std::conditional, tensor>::type; using qf_input_type = tuple; + template + struct batch_apply_shape_fn_output { + using source_t = decltype(get<0>(get<0>(in_t{})) + dot(get<1>(get<0>(in_t{})), tensor{})); + using flux_t = decltype(get<0>(get<1>(in_t{})) + dot(get<1>(get<1>(in_t{})), tensor{})); + using type = tensor, q * q * q>; + }; + SERAC_HOST_DEVICE static constexpr tensor shape_functions(tensor xi) { auto N_xi = GaussLobattoInterpolation

(xi[0]); @@ -72,7 +83,7 @@ struct finite_element > { for (int j = 0; j < p + 1; j++) { for (int i = 0; i < p + 1; i++) { dN[count++] = { - dN_xi[i] * N_eta[j] * N_zeta[k], + dN_xi[i] * N_eta[j] * N_zeta[k], N_xi[i] * dN_eta[j] * N_zeta[k], N_xi[i] * N_eta[j] * dN_zeta[k] }; @@ -130,7 +141,9 @@ struct finite_element > { } template - static auto batch_apply_shape_fn(int j, tensor input, const TensorProductQuadratureRule&) + static void RAJA_HOST_DEVICE batch_apply_shape_fn(int j, tensor input, + typename batch_apply_shape_fn_output::type* output, + const TensorProductQuadratureRule&, RAJA::LaunchContext ctx) { static constexpr bool apply_weights = false; static constexpr auto B = calculate_B(); @@ -140,34 +153,34 @@ struct finite_element > { int jy = (j % (n * n)) / n; int jz = j / (n * n); - using source_t = decltype(get<0>(get<0>(in_t{})) + dot(get<1>(get<0>(in_t{})), tensor{})); - using flux_t = decltype(get<0>(get<1>(in_t{})) + dot(get<1>(get<1>(in_t{})), tensor{})); - - tensor, q * q * q> output; + auto x_range = RAJA::RangeSegment(0, q * q * q); + RAJA::loop::threads_t>(ctx, x_range, [&](int Q) { + int qx = Q % q; + int qy = (Q / q) % q; + int qz = (Q / (q * q)); - for (int qz = 0; qz < q; qz++) { - for (int qy = 0; qy < q; qy++) { - for (int qx = 0; qx < q; qx++) { - double phi_j = B(qx, jx) * B(qy, jy) * B(qz, jz); - tensor dphi_j_dxi = {G(qx, jx) * B(qy, jy) * B(qz, jz), B(qx, jx) * G(qy, jy) * B(qz, jz), - B(qx, jx) * B(qy, jy) * G(qz, jz)}; + double phi_j = B(qx, jx) * B(qy, jy) * B(qz, jz); + tensor dphi_j_dxi = {G(qx, jx) * B(qy, jy) * B(qz, jz), B(qx, jx) * G(qy, jy) * B(qz, jz), + B(qx, jx) * B(qy, jy) * G(qz, jz)}; - int Q = (qz * q + qy) * q + qx; - auto& d00 = get<0>(get<0>(input(Q))); - auto& d01 = get<1>(get<0>(input(Q))); - auto& d10 = get<0>(get<1>(input(Q))); - auto& d11 = get<1>(get<1>(input(Q))); + auto& d00 = get<0>(get<0>(input(Q))); + auto& d01 = get<1>(get<0>(input(Q))); + auto& d10 = get<0>(get<1>(input(Q))); + auto& d11 = get<1>(get<1>(input(Q))); - output[Q] = {d00 * phi_j + dot(d01, dphi_j_dxi), d10 * phi_j + dot(d11, dphi_j_dxi)}; - } - } - } + (*output)[Q] = {d00 * phi_j + dot(d01, dphi_j_dxi), d10 * phi_j + dot(d11, dphi_j_dxi)}; + }); + } - return output; + template + static auto interpolate_output_helper() + { + return tensor{}; } template - SERAC_HOST_DEVICE static auto interpolate(const dof_type& X, const TensorProductQuadratureRule&) + SERAC_HOST_DEVICE static void interpolate(const dof_type& X, const TensorProductQuadratureRule&, + tensor* output_ptr, RAJA::LaunchContext ctx) { // we want to compute the following: // @@ -176,7 +189,7 @@ struct finite_element > { // where // X_q(u, v, w) are the quadrature-point values at position {u, v, w}, // B(u, i) is the i^{th} 1D interpolation/differentiation (shape) function, - // evaluated at the u^{th} 1D quadrature point, and + // evaluated at the u^{th} 1D quadrature point, andintegrate // X_e(i, j, k) are the values at node {i, j, k} to be interpolated // // this algorithm carries out the above calculation in 3 steps: @@ -184,97 +197,224 @@ struct finite_element > { // A1(dz, dy, qx) := B(qx, dx) * X_e(dz, dy, dx) // A2(dz, qy, qx) := B(qy, dy) * A1(dz, dy, qx) // X_q(qz, qy, qx) := B(qz, dz) * A2(dz, qy, qx) + using threads_t = typename EvaluationSpacePolicy::threads_t; static constexpr bool apply_weights = false; - static constexpr auto B = calculate_B(); - static constexpr auto G = calculate_G(); - - tensor value{}; - tensor gradient{}; + RAJA::RangeSegment x_range(0, BLOCK_SZ); + RAJA_TEAM_SHARED 
tensor value; + RAJA_TEAM_SHARED tensor gradient; + constexpr auto B = calculate_B(); + constexpr auto G = calculate_G(); + RAJA::loop(ctx, x_range, [&](int tid) { + int qx = tid % BLOCK_X; + int qy = (tid / BLOCK_X) % BLOCK_Y; + int qz = tid / (BLOCK_X * BLOCK_Y); + + memset_tensor(value, 0.0, qx, qy, qz); + memset_tensor(gradient, 0.0, qx, qy, qz); + }); + ctx.teamSync(); for (int i = 0; i < c; i++) { - auto A10 = contract<2, 1>(X[i], B); - auto A11 = contract<2, 1>(X[i], G); - - auto A20 = contract<1, 1>(A10, B); - auto A21 = contract<1, 1>(A11, B); - auto A22 = contract<1, 1>(A10, G); - - value(i) = contract<0, 1>(A20, B); - gradient(i, 0) = contract<0, 1>(A21, B); - gradient(i, 1) = contract<0, 1>(A22, B); - gradient(i, 2) = contract<0, 1>(A20, G); + RAJA_TEAM_SHARED decltype(deduce_contract_return_type<2, 1>(X[i], B)) A10; + RAJA_TEAM_SHARED decltype(deduce_contract_return_type<2, 1>(X[i], G)) A11; + RAJA_TEAM_SHARED decltype(deduce_contract_return_type<1, 1>(A10, B)) A20; + RAJA_TEAM_SHARED decltype(deduce_contract_return_type<1, 1>(A11, B)) A21; + RAJA_TEAM_SHARED decltype(deduce_contract_return_type<1, 1>(A10, G)) A22; + + RAJA::loop(ctx, x_range, [&](int tid) { + int qx = tid % BLOCK_X; + int qy = (tid / BLOCK_X) % BLOCK_Y; + int qz = tid / (BLOCK_X * BLOCK_Y); + memset_tensor(A20, 0.0, qx, qy, qz); + memset_tensor(A21, 0.0, qx, qy, qz); + memset_tensor(A22, 0.0, qx, qy, qz); + memset_tensor(A10, 0.0, qx, qy, qz); + memset_tensor(A11, 0.0, qx, qy, qz); + ctx.teamSync(); + }); + RAJA::loop(ctx, x_range, [&](int tid) { + int qx = tid % BLOCK_X; + int qy = (tid / BLOCK_X) % BLOCK_Y; + int qz = tid / (BLOCK_X * BLOCK_Y); + // Perform actual contractions + contract<2, 1>(X[i], B, &A10, qx, qy, qz); + contract<2, 1>(X[i], G, &A11, qx, qy, qz); + ctx.teamSync(); + }); + + RAJA::loop(ctx, x_range, [&](int tid) { + int qx = tid % BLOCK_X; + int qy = (tid / BLOCK_X) % BLOCK_Y; + int qz = tid / (BLOCK_X * BLOCK_Y); + + contract<1, 1>(A10, B, &A20, qx, qy, qz); + contract<1, 1>(A11, B, &A21, qx, qy, qz); + contract<1, 1>(A10, G, &A22, qx, qy, qz); + ctx.teamSync(); + }); + + RAJA::loop(ctx, x_range, [&](int tid) { + int qx = tid % BLOCK_X; + int qy = (tid / BLOCK_X) % BLOCK_Y; + int qz = tid / (BLOCK_X * BLOCK_Y); + + contract<0, 1>(A20, B, &value(i), qx, qy, qz); + contract<0, 1>(A21, B, &gradient(i, 0), qx, qy, qz); + contract<0, 1>(A22, B, &gradient(i, 1), qx, qy, qz); + contract<0, 1>(A20, G, &gradient(i, 2), qx, qy, qz); + ctx.teamSync(); + }); } // transpose the quadrature data into a flat tensor of tuples + + RAJA_TEAM_SHARED union { - tensor one_dimensional; - tensor, tensor >, q, q, q> three_dimensional; + tensor one_dimensional; + tensor, tensor>, q, q, q> three_dimensional; } output; - for (int qz = 0; qz < q; qz++) { - for (int qy = 0; qy < q; qy++) { - for (int qx = 0; qx < q; qx++) { - for (int i = 0; i < c; i++) { - get(output.three_dimensional(qz, qy, qx))[i] = value(i, qz, qy, qx); - for (int j = 0; j < dim; j++) { - get(output.three_dimensional(qz, qy, qx))[i][j] = gradient(i, j, qz, qy, qx); - } - } + RAJA::loop(ctx, x_range, [&](int tid) { + if (tid >= q * q * q) { + return; + } + int qx = tid % q; + int qy = (tid / q) % q; + int qz = tid / (q * q); + + for (int i = 0; i < c; i++) { + get(output.three_dimensional(qz, qy, qx))[i] = value(i, qz, qy, qx); + for (int j = 0; j < dim; j++) { + get(output.three_dimensional(qz, qy, qx))[i][j] = gradient(i, j, qz, qy, qx); } } + }); + ctx.teamSync(); + if (output_ptr) { + RAJA::loop(ctx, x_range, [&](int tid) { + if (tid < 
serac::size(output.one_dimensional)) { + ((*output_ptr))[tid] = output.one_dimensional[tid]; + } + }); } - - return output.one_dimensional; } template SERAC_HOST_DEVICE static void integrate(const tensor, q * q * q>& qf_output, const TensorProductQuadratureRule&, dof_type* element_residual, - int step = 1) + RAJA::LaunchContext ctx, int step = 1) { if constexpr (is_zero{} && is_zero{}) { return; } constexpr int ntrial = std::max(size(source_type{}), size(flux_type{}) / dim) / c; + using threads_t = typename EvaluationSpacePolicy::threads_t; + using s_buffer_type = std::conditional_t{}, zero, tensor>; + using f_buffer_type = std::conditional_t{}, zero, tensor>; - using s_buffer_type = std::conditional_t{}, zero, tensor >; - using f_buffer_type = std::conditional_t{}, zero, tensor >; + /*static*/ constexpr bool apply_weights = true; - static constexpr bool apply_weights = true; - static constexpr auto B = calculate_B(); - static constexpr auto G = calculate_G(); + RAJA::RangeSegment x_range(0, BLOCK_SZ); + + constexpr auto B = calculate_B(); + constexpr auto G = calculate_G(); + RAJA_TEAM_SHARED s_buffer_type source; + RAJA_TEAM_SHARED f_buffer_type flux; + + // The binary is_zero_and type will select which of its binary operands is nonzero, and return it + using t1 = decltype(deduce_contract_return_type<2, 0>(source, B)); + using t2 = decltype(deduce_contract_return_type<2, 0>(flux(0), G)); + + RAJA_TEAM_SHARED typename is_zero_and::type A20; + RAJA_TEAM_SHARED decltype(deduce_contract_return_type<2, 0>(flux(1), B)) A21; + RAJA_TEAM_SHARED decltype(deduce_contract_return_type<2, 0>(flux(2), B)) A22; + + using t3 = decltype(deduce_contract_return_type<1, 0>(A20, B)); + using t4 = decltype(deduce_contract_return_type<1, 0>(A21, G)); + RAJA_TEAM_SHARED typename is_zero_and::type A10; + RAJA_TEAM_SHARED decltype(deduce_contract_return_type<1, 0>(A22, B)) A11; for (int j = 0; j < ntrial; j++) { for (int i = 0; i < c; i++) { - s_buffer_type source; - f_buffer_type flux; - - for (int qz = 0; qz < q; qz++) { - for (int qy = 0; qy < q; qy++) { - for (int qx = 0; qx < q; qx++) { - int Q = (qz * q + qy) * q + qx; - if constexpr (!is_zero{}) { - source(qz, qy, qx) = reinterpret_cast(&get(qf_output[Q]))[i * ntrial + j]; - } - if constexpr (!is_zero{}) { - for (int k = 0; k < dim; k++) { - flux(k, qz, qy, qx) = - reinterpret_cast(&get(qf_output[Q]))[(i * dim + k) * ntrial + j]; - } - } - } + ctx.teamSync(); + RAJA::loop(ctx, x_range, [&](int tid) { + if (tid >= q * q * q) { + return; + } + int qx = tid % q; + int qy = (tid / q) % q; + int qz = tid / (q * q); + if constexpr (!is_zero{}) { + source(qz, qy, qx) = reinterpret_cast(&get(qf_output[tid]))[i * ntrial + j]; } + }); + for (int k = 0; k < dim; k++) { + ctx.teamSync(); + RAJA::loop(ctx, x_range, [&](int tid) { + if (tid >= q * q * q) { + return; + } + int qx = tid % q; + int qy = (tid / q) % q; + int qz = tid / (q * q); + int Q = (qz * q + qy) * q + qx; + if constexpr (!is_zero{}) { + flux(k, qz, qy, qx) = + reinterpret_cast(&get(qf_output[Q]))[(i * dim + k) * ntrial + j]; + } + }); } - - auto A20 = contract<2, 0>(source, B) + contract<2, 0>(flux(0), G); - auto A21 = contract<2, 0>(flux(1), B); - auto A22 = contract<2, 0>(flux(2), B); - - auto A10 = contract<1, 0>(A20, B) + contract<1, 0>(A21, G); - auto A11 = contract<1, 0>(A22, B); - - element_residual[j * step](i) += contract<0, 0>(A10, B) + contract<0, 0>(A11, G); + ctx.teamSync(); + + RAJA::loop(ctx, x_range, [&](int tid) { + int qx = tid % BLOCK_X; + int qy = (tid / BLOCK_X) % BLOCK_Y; + int 
qz = tid / (BLOCK_X * BLOCK_Y); + + memset_tensor(A20, 0.0, qx, qy, qz); + memset_tensor(A21, 0.0, qx, qy, qz); + memset_tensor(A22, 0.0, qx, qy, qz); + memset_tensor(A10, 0.0, qx, qy, qz); + memset_tensor(A11, 0.0, qx, qy, qz); + ctx.teamSync(); + }); + + RAJA::loop(ctx, x_range, [&](int tid) { + int qx = tid % BLOCK_X; + int qy = (tid / BLOCK_X) % BLOCK_Y; + int qz = tid / (BLOCK_X * BLOCK_Y); + contract<2, 0>(source, B, &A20, qx, qy, qz); + ctx.teamSync(); + contract<2, 0>(flux(0), G, &A20, qx, qy, qz, true); + ctx.teamSync(); + + contract<2, 0>(flux(1), B, &A21, qx, qy, qz); + contract<2, 0>(flux(2), B, &A22, qx, qy, qz); + ctx.teamSync(); + }); + + RAJA::loop(ctx, x_range, [&](int tid) { + int qx = tid % BLOCK_X; + int qy = (tid / BLOCK_X) % BLOCK_Y; + int qz = tid / (BLOCK_X * BLOCK_Y); + contract<1, 0>(A21, G, &A10, qx, qy, qz); + ctx.teamSync(); + contract<1, 0>(A20, B, &A10, qx, qy, qz, true); + ctx.teamSync(); + contract<1, 0>(A22, B, &A11, qx, qy, qz); + ctx.teamSync(); + }); + + RAJA::loop(ctx, x_range, [&](int tid) { + int qx = tid % BLOCK_X; + int qy = (tid / BLOCK_X) % BLOCK_Y; + int qz = tid / (BLOCK_X * BLOCK_Y); + contract<0, 0>(A10, B, &(element_residual[j * step](i)), qx, qy, qz, true); + ctx.teamSync(); + contract<0, 0>(A11, G, &(element_residual[j * step](i)), qx, qy, qz, true); + ctx.teamSync(); + }); } } } @@ -282,7 +422,7 @@ struct finite_element > { #if 0 template - static SERAC_DEVICE auto interpolate(const dof_type& X, const tensor& J, + static SERAC_DEVICE void interpolate(const dof_type& X, const tensor& J, const TensorProductQuadratureRule& rule, cache_type& cache) { // we want to compute the following: diff --git a/src/serac/numerics/functional/detail/hexahedron_Hcurl.inl b/src/serac/numerics/functional/detail/hexahedron_Hcurl.inl index 36fdfd3d2..a4c3a8859 100644 --- a/src/serac/numerics/functional/detail/hexahedron_Hcurl.inl +++ b/src/serac/numerics/functional/detail/hexahedron_Hcurl.inl @@ -10,6 +10,8 @@ * @brief Specialization of finite_element for Hcurl on hexahedron geometry */ +#include "RAJA/RAJA.hpp" + // this specialization defines shape functions (and their curls) that // interpolate at Gauss-Lobatto nodes for closed intervals, and Gauss-Legendre // nodes for open intervals. 
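The element specializations and integral kernels in this patch all follow the same team-per-element RAJA pattern: one team per element, BLOCK_SZ threads per team, team-shared scratch storage, and ctx.teamSync() between stages. A minimal host-only sketch of that pattern is shown below; example_kernel, BLOCK, and the sequential policy aliases are illustrative placeholders (not the policies EvaluationSpacePolicy actually selects), shown only to make the launch/loop/teamSync structure concrete.

    #include "RAJA/RAJA.hpp"

    // Sequential stand-ins for the policies an EvaluationSpacePolicy-like trait would provide.
    using launch_policy  = RAJA::LaunchPolicy<RAJA::seq_launch_t>;
    using teams_policy   = RAJA::LoopPolicy<RAJA::seq_exec>;
    using threads_policy = RAJA::LoopPolicy<RAJA::seq_exec>;

    constexpr int BLOCK = 64;  // stand-in for BLOCK_SZ

    void example_kernel(double* per_element_sums, const double* qp_values, int num_elements)
    {
      RAJA::launch<launch_policy>(
          RAJA::LaunchParams(RAJA::Teams(num_elements), RAJA::Threads(BLOCK)),
          [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) {
            // one team per element
            RAJA::loop<teams_policy>(ctx, RAJA::RangeSegment(0, num_elements), [&](int e) {
              RAJA_TEAM_SHARED double scratch[BLOCK];  // visible to every thread in the team

              // stage 1: each thread stages one quadrature-point value
              RAJA::loop<threads_policy>(ctx, RAJA::RangeSegment(0, BLOCK), [&](int tid) {
                scratch[tid] = qp_values[e * BLOCK + tid];
              });

              ctx.teamSync();  // all of scratch must be written before it is read

              // stage 2: one thread reduces the shared buffer and writes the result
              RAJA::loop<threads_policy>(ctx, RAJA::RangeSegment(0, 1), [&](int) {
                double sum = 0.0;
                for (int t = 0; t < BLOCK; ++t) sum += scratch[t];
                per_element_sums[e] = sum;
              });

              ctx.teamSync();
            });
          });
    }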
@@ -19,8 +21,8 @@ // see quadrilateral_hcurl.inl for more information // for additional information on the finite_element concept requirements, see finite_element.hpp /// @cond -template -struct finite_element> { +template +struct finite_element, exec> { static constexpr auto geometry = mfem::Geometry::CUBE; static constexpr auto family = Family::HCURL; static constexpr int dim = 3; @@ -35,6 +37,14 @@ struct finite_element> { using residual_type = typename std::conditional, tensor>::type; + template + struct batch_apply_shape_fn_output { + using source_t = decltype(dot(get<0>(get<0>(in_t{})), vec3{}) + dot(get<1>(get<0>(in_t{})), vec3{})); + using flux_t = decltype(dot(get<0>(get<1>(in_t{})), vec3{}) + dot(get<1>(get<1>(in_t{})), vec3{})); + + using type = tensor, q * q * q>; + }; + // this is how mfem provides the data to us for these elements struct dof_type { tensor x; @@ -255,7 +265,9 @@ struct finite_element> { } template - static auto batch_apply_shape_fn(int j, tensor input, const TensorProductQuadratureRule&) + static auto RAJA_HOST_DEVICE batch_apply_shape_fn(int j, tensor input, + typename batch_apply_shape_fn_output::type* output, + const TensorProductQuadratureRule&, RAJA::LaunchContext) { constexpr bool apply_weights = false; constexpr tensor B1 = calculate_B1(); @@ -287,11 +299,6 @@ struct finite_element> { break; } - using source_t = decltype(dot(get<0>(get<0>(in_t{})), vec3{}) + dot(get<1>(get<0>(in_t{})), vec3{})); - using flux_t = decltype(dot(get<0>(get<1>(in_t{})), vec3{}) + dot(get<1>(get<1>(in_t{})), vec3{})); - - tensor, q * q * q> output; - for (int qz = 0; qz < q; qz++) { for (int qy = 0; qy < q; qy++) { for (int qx = 0; qx < q; qx++) { @@ -333,7 +340,15 @@ struct finite_element> { } template - SERAC_HOST_DEVICE static auto interpolate(const dof_type& element_values, const TensorProductQuadratureRule&) + static auto interpolate_output_helper() + { + return tensor, tensor>, q * q * q>{}; + } + + template + SERAC_HOST_DEVICE static void interpolate(const dof_type& element_values, const TensorProductQuadratureRule&, + tensor, tensor>, q * q * q>* output_ptr, + RAJA::LaunchContext) { constexpr bool apply_weights = false; constexpr tensor B1 = calculate_B1(); @@ -375,28 +390,24 @@ struct finite_element> { } // clang-format on - tensor, tensor>, q * q * q> qf_inputs; - int count = 0; for (int qz = 0; qz < q; qz++) { for (int qy = 0; qy < q; qy++) { for (int qx = 0; qx < q; qx++) { for (int i = 0; i < 3; i++) { - get(qf_inputs(count))[i] = value[i](qz, qy, qx); - get(qf_inputs(count))[i] = curl[i](qz, qy, qx); + get((*output_ptr)(count))[i] = value[i](qz, qy, qx); + get((*output_ptr)(count))[i] = curl[i](qz, qy, qx); } count++; } } } - - return qf_inputs; } template SERAC_HOST_DEVICE static void integrate(const tensor, q * q * q>& qf_output, const TensorProductQuadratureRule&, dof_type* element_residual, - [[maybe_unused]] int step = 1) + RAJA::LaunchContext, [[maybe_unused]] int step = 1) { constexpr bool apply_weights = true; constexpr tensor B1 = calculate_B1(); @@ -444,8 +455,8 @@ struct finite_element> { element_residual[0].y += contract< y, 0 >(A1, B1); } - // r(2, dz, dy, dx) = s(2, qz, qy, qx) * B1(qz, dz) * B2(qy, dy) * B2(qx, dx) - // + f(0, qz, qy, qx) * B1(qz, dz) * G2(qy, dy) * B2(qx, dx) + // r(2, dz, dy, dx) = s(2, qz, qy, qx) * B1(qz, dz) * B2(qy, dy) * B2(qx, dx) + // + f(0, qz, qy, qx) * B1(qz, dz) * G2(qy, dy) * B2(qx, dx) // - f(1, qz, qy, qx) * B1(qz, dz) * B2(qy, dy) * G2(qx, dx); { auto A20 = contract< y, 0 >(source[2], B2) + contract< y, 0 
>(flux[0], G2); @@ -461,7 +472,7 @@ struct finite_element> { #if 0 template - static SERAC_DEVICE auto interpolate(const dof_type& element_values, const tensor& J, + static SERAC_DEVICE void interpolate(const dof_type& element_values, const tensor& J, const TensorProductQuadratureRule& rule, cache_type& cache) { int tidx = threadIdx.x % q; diff --git a/src/serac/numerics/functional/detail/hexahedron_L2.inl b/src/serac/numerics/functional/detail/hexahedron_L2.inl index 4a75d77b3..7a51f6f7c 100644 --- a/src/serac/numerics/functional/detail/hexahedron_L2.inl +++ b/src/serac/numerics/functional/detail/hexahedron_L2.inl @@ -10,14 +10,16 @@ * @brief Specialization of finite_element for L2 on hexahedron geometry */ +#include "RAJA/RAJA.hpp" + // this specialization defines shape functions (and their gradients) that // interpolate at Gauss-Lobatto nodes for the appropriate polynomial order // // note: mfem assumes the parent element domain is [0,1]x[0,1]x[0,1] // for additional information on the finite_element concept requirements, see finite_element.hpp /// @cond -template -struct finite_element > { +template +struct finite_element, exec> { static constexpr auto geometry = mfem::Geometry::CUBE; static constexpr auto family = Family::L2; static constexpr int components = c; @@ -31,15 +33,23 @@ struct finite_element > { // TODO: remove this using residual_type = - typename std::conditional, tensor >::type; + typename std::conditional, tensor>::type; using dof_type = tensor; - using value_type = typename std::conditional >::type; + using value_type = typename std::conditional>::type; using derivative_type = - typename std::conditional, tensor >::type; + typename std::conditional, tensor>::type; using qf_input_type = tuple; + template + struct batch_apply_shape_fn_output { + using source_t = decltype(get<0>(get<0>(in_t{})) + dot(get<1>(get<0>(in_t{})), tensor{})); + using flux_t = decltype(get<0>(get<1>(in_t{})) + dot(get<1>(get<1>(in_t{})), tensor{})); + + using type = tensor, q * q * q>; + }; + SERAC_HOST_DEVICE static constexpr tensor shape_functions(tensor xi) { auto N_xi = GaussLobattoInterpolation

(xi[0]); @@ -76,7 +86,7 @@ struct finite_element > { for (int j = 0; j < p + 1; j++) { for (int i = 0; i < p + 1; i++) { dN[count++] = { - dN_xi[i] * N_eta[j] * N_zeta[k], + dN_xi[i] * N_eta[j] * N_zeta[k], N_xi[i] * dN_eta[j] * N_zeta[k], N_xi[i] * N_eta[j] * dN_zeta[k] }; @@ -134,7 +144,9 @@ struct finite_element > { } template - static auto batch_apply_shape_fn(int j, tensor input, const TensorProductQuadratureRule&) + static void RAJA_HOST_DEVICE batch_apply_shape_fn(int j, tensor input, + typename batch_apply_shape_fn_output::type* output, + const TensorProductQuadratureRule&, RAJA::LaunchContext) { static constexpr bool apply_weights = false; static constexpr auto B = calculate_B(); @@ -144,11 +156,6 @@ struct finite_element > { int jy = (j % (n * n)) / n; int jz = j / (n * n); - using source_t = decltype(get<0>(get<0>(in_t{})) + dot(get<1>(get<0>(in_t{})), tensor{})); - using flux_t = decltype(get<0>(get<1>(in_t{})) + dot(get<1>(get<1>(in_t{})), tensor{})); - - tensor, q * q * q> output; - for (int qz = 0; qz < q; qz++) { for (int qy = 0; qy < q; qy++) { for (int qx = 0; qx < q; qx++) { @@ -162,16 +169,21 @@ struct finite_element > { auto& d10 = get<0>(get<1>(input(Q))); auto& d11 = get<1>(get<1>(input(Q))); - output[Q] = {d00 * phi_j + dot(d01, dphi_j_dxi), d10 * phi_j + dot(d11, dphi_j_dxi)}; + (*output)[Q] = {d00 * phi_j + dot(d01, dphi_j_dxi), d10 * phi_j + dot(d11, dphi_j_dxi)}; } } } + } - return output; + template + static auto interpolate_output_helper() + { + return tensor{}; } template - SERAC_HOST_DEVICE static auto interpolate(const dof_type& X, const TensorProductQuadratureRule&) + SERAC_HOST_DEVICE static void interpolate(const dof_type& X, const TensorProductQuadratureRule&, + tensor* output_ptr, RAJA::LaunchContext ctx) { // we want to compute the following: // @@ -211,8 +223,8 @@ struct finite_element > { // transpose the quadrature data into a flat tensor of tuples union { - tensor one_dimensional; - tensor, tensor >, q, q, q> three_dimensional; + tensor one_dimensional; + tensor, tensor>, q, q, q> three_dimensional; } output; for (int qz = 0; qz < q; qz++) { @@ -228,13 +240,20 @@ struct finite_element > { } } - return output.one_dimensional; + RAJA::TypedRangeSegment x_range(0, BLOCK_SZ); + if (output_ptr) { + RAJA::loop>(ctx, x_range, [&](int tid) { + if (tid < serac::size(output.one_dimensional)) { + ((*output_ptr))[tid] = output.one_dimensional[tid]; + } + }); + } } template SERAC_HOST_DEVICE static void integrate(const tensor, q * q * q>& qf_output, const TensorProductQuadratureRule&, dof_type* element_residual, - int step = 1) + RAJA::LaunchContext, int step = 1) { if constexpr (is_zero{} && is_zero{}) { return; @@ -242,8 +261,8 @@ struct finite_element > { constexpr int ntrial = std::max(size(source_type{}), size(flux_type{}) / dim) / c; - using s_buffer_type = std::conditional_t{}, zero, tensor >; - using f_buffer_type = std::conditional_t{}, zero, tensor >; + using s_buffer_type = std::conditional_t{}, zero, tensor>; + using f_buffer_type = std::conditional_t{}, zero, tensor>; static constexpr bool apply_weights = true; static constexpr auto B = calculate_B(); @@ -286,7 +305,7 @@ struct finite_element > { #if 0 template - static SERAC_DEVICE auto interpolate(const dof_type& X, const tensor& J, + static SERAC_DEVICE void interpolate(const dof_type& X, const tensor& J, const TensorProductQuadratureRule& rule, cache_type& cache) { // we want to compute the following: diff --git a/src/serac/numerics/functional/detail/qoi.inl 
b/src/serac/numerics/functional/detail/qoi.inl index 33e747ea4..3b0fb9510 100644 --- a/src/serac/numerics/functional/detail/qoi.inl +++ b/src/serac/numerics/functional/detail/qoi.inl @@ -9,9 +9,13 @@ * * @brief Specialization of finite_element for expressing quantities of interest on any geometry */ + +#include "RAJA/RAJA.hpp" +#include "serac/infrastructure/accelerator.hpp" + /// @cond -template -struct finite_element { +template +struct finite_element { static constexpr auto geometry = g; static constexpr auto family = Family::QOI; static constexpr int components = 1; @@ -25,14 +29,14 @@ struct finite_element { template SERAC_HOST_DEVICE static void integrate(const tensor&, const TensorProductQuadratureRule&, dof_type*, - [[maybe_unused]] int step = 1) + RAJA::LaunchContext, [[maybe_unused]] int step = 1) { return; // integrating zeros is a no-op } template SERAC_HOST_DEVICE static void integrate(const tensor& qf_output, const TensorProductQuadratureRule&, - dof_type* element_total, [[maybe_unused]] int step = 1) + dof_type* element_total, RAJA::LaunchContext, [[maybe_unused]] int step = 1) { if constexpr (geometry == mfem::Geometry::SEGMENT) { static_assert(Q == q); @@ -79,7 +83,7 @@ struct finite_element { template SERAC_HOST_DEVICE static void integrate(const tensor, Q>& qf_output, const TensorProductQuadratureRule&, dof_type* element_total, - [[maybe_unused]] int step = 1) + RAJA::LaunchContext, [[maybe_unused]] int step = 1) { if constexpr (is_zero{}) { return; diff --git a/src/serac/numerics/functional/detail/quadrilateral_H1.inl b/src/serac/numerics/functional/detail/quadrilateral_H1.inl index 312f8e9c7..c9ed1390f 100644 --- a/src/serac/numerics/functional/detail/quadrilateral_H1.inl +++ b/src/serac/numerics/functional/detail/quadrilateral_H1.inl @@ -10,14 +10,17 @@ * @brief Specialization of finite_element for H1 on quadrilateral geometry */ +#include "RAJA/RAJA.hpp" + // this specialization defines shape functions (and their gradients) that // interpolate at Gauss-Lobatto nodes for the appropriate polynomial order // // note: mfem assumes the parent element domain is [0,1]x[0,1] // for additional information on the finite_element concept requirements, see finite_element.hpp /// @cond -template -struct finite_element > { + +template +struct finite_element, exec> { static constexpr auto geometry = mfem::Geometry::SQUARE; static constexpr auto family = Family::H1; static constexpr int components = c; @@ -30,15 +33,23 @@ struct finite_element > { static constexpr int SOURCE = 0, FLUX = 1; using residual_type = - typename std::conditional, tensor >::type; + typename std::conditional, tensor>::type; using dof_type = tensor; - using value_type = typename std::conditional >::type; + using value_type = typename std::conditional>::type; using derivative_type = - typename std::conditional, tensor >::type; + typename std::conditional, tensor>::type; using qf_input_type = tuple; + template + struct batch_apply_shape_fn_output { + using source_t = decltype(get<0>(get<0>(in_t{})) + dot(get<1>(get<0>(in_t{})), tensor{})); + using flux_t = decltype(get<0>(get<1>(in_t{})) + dot(get<1>(get<1>(in_t{})), tensor{})); + + using type = tensor, q * q>; + }; + /* interpolation nodes and their associated numbering: @@ -155,7 +166,9 @@ struct finite_element > { } template - static auto batch_apply_shape_fn(int j, tensor input, const TensorProductQuadratureRule&) + static void RAJA_HOST_DEVICE batch_apply_shape_fn(int j, tensor input, + typename batch_apply_shape_fn_output::type* output, + const 
TensorProductQuadratureRule&, RAJA::LaunchContext) { static constexpr bool apply_weights = false; static constexpr auto B = calculate_B(); @@ -164,11 +177,6 @@ struct finite_element > { int jx = j % n; int jy = j / n; - using source_t = decltype(get<0>(get<0>(in_t{})) + dot(get<1>(get<0>(in_t{})), tensor{})); - using flux_t = decltype(get<0>(get<1>(in_t{})) + dot(get<1>(get<1>(in_t{})), tensor{})); - - tensor, q * q> output; - for (int qy = 0; qy < q; qy++) { for (int qx = 0; qx < q; qx++) { double phi_j = B(qx, jx) * B(qy, jy); @@ -180,11 +188,15 @@ struct finite_element > { auto& d10 = get<0>(get<1>(input(Q))); auto& d11 = get<1>(get<1>(input(Q))); - output[Q] = {d00 * phi_j + dot(d01, dphi_j_dxi), d10 * phi_j + dot(d11, dphi_j_dxi)}; + (*output)[Q] = {d00 * phi_j + dot(d01, dphi_j_dxi), d10 * phi_j + dot(d11, dphi_j_dxi)}; } } + } - return output; + template + static auto interpolate_output_helper() + { + return tensor{}; } // we want to compute the following: @@ -202,7 +214,8 @@ struct finite_element > { // A(dy, qx) := B(qx, dx) * X_e(dy, dx) // X_q(qy, qx) := B(qy, dy) * A(dy, qx) template - SERAC_HOST_DEVICE static auto interpolate(const dof_type& X, const TensorProductQuadratureRule&) + SERAC_HOST_DEVICE static void interpolate(const dof_type& X, const TensorProductQuadratureRule&, + tensor* output_ptr, RAJA::LaunchContext ctx) { static constexpr bool apply_weights = false; static constexpr auto B = calculate_B(); @@ -223,8 +236,8 @@ struct finite_element > { // transpose the quadrature data into a flat tensor of tuples union { - tensor one_dimensional; - tensor, tensor >, q, q> two_dimensional; + tensor one_dimensional; + tensor, tensor>, q, q> two_dimensional; } output; for (int qy = 0; qy < q; qy++) { @@ -237,8 +250,14 @@ struct finite_element > { } } } - - return output.one_dimensional; + RAJA::TypedRangeSegment x_range(0, BLOCK_SZ); + if (output_ptr) { + RAJA::loop>(ctx, x_range, [&](int tid) { + if (tid < serac::size(output.one_dimensional)) { + ((*output_ptr))[tid] = output.one_dimensional[tid]; + } + }); + } } // source can be one of: {zero, double, tensor, tensor} @@ -247,7 +266,7 @@ struct finite_element > { template SERAC_HOST_DEVICE static void integrate(const tensor, q * q>& qf_output, const TensorProductQuadratureRule&, dof_type* element_residual, - int step = 1) + RAJA::LaunchContext, int step = 1) { if constexpr (is_zero{} && is_zero{}) { return; @@ -255,8 +274,8 @@ struct finite_element > { constexpr int ntrial = std::max(size(source_type{}), size(flux_type{}) / dim) / c; - using s_buffer_type = std::conditional_t{}, zero, tensor >; - using f_buffer_type = std::conditional_t{}, zero, tensor >; + using s_buffer_type = std::conditional_t{}, zero, tensor>; + using f_buffer_type = std::conditional_t{}, zero, tensor>; static constexpr bool apply_weights = true; static constexpr auto B = calculate_B(); @@ -293,7 +312,7 @@ struct finite_element > { #if 0 template - static SERAC_DEVICE auto interpolate(const dof_type& X, const tensor& J, + static SERAC_DEVICE void interpolate(const dof_type& X, const tensor& J, const TensorProductQuadratureRule& rule, cache_type& A) { int tidx = threadIdx.x % q; diff --git a/src/serac/numerics/functional/detail/quadrilateral_Hcurl.inl b/src/serac/numerics/functional/detail/quadrilateral_Hcurl.inl index 815b5b72e..4ff075cb6 100644 --- a/src/serac/numerics/functional/detail/quadrilateral_Hcurl.inl +++ b/src/serac/numerics/functional/detail/quadrilateral_Hcurl.inl @@ -10,6 +10,8 @@ * @brief Specialization of finite_element for Hcurl on 
quadrilateral geometry */ +#include "RAJA/RAJA.hpp" + // this specialization defines shape functions (and their curls) that // interpolate at Gauss-Lobatto nodes for closed intervals, and Gauss-Legendre // nodes for open intervals. @@ -21,8 +23,8 @@ // rather than 3D vector along the z-direction. // for additional information on the finite_element concept requirements, see finite_element.hpp /// @cond -template -struct finite_element > { +template +struct finite_element, exec> { static constexpr auto geometry = mfem::Geometry::SQUARE; static constexpr auto family = Family::HCURL; static constexpr int dim = 2; @@ -36,6 +38,14 @@ struct finite_element > { using residual_type = typename std::conditional, tensor >::type; + template + struct batch_apply_shape_fn_output { + using source_t = decltype(dot(get<0>(get<0>(in_t{})), tensor{}) + get<1>(get<0>(in_t{})) * double{}); + using flux_t = decltype(dot(get<0>(get<1>(in_t{})), tensor{}) + get<1>(get<1>(in_t{})) * double{}); + + using type = tensor, q * q>; + }; + // this is how mfem provides the data to us for these elements // if, instead, it was stored as simply tensor< double, 2, p + 1, p >, // the interpolation/integrate implementation would be considerably shorter @@ -243,7 +253,9 @@ struct finite_element > { } template - static auto batch_apply_shape_fn(int j, tensor input, const TensorProductQuadratureRule&) + static void RAJA_HOST_DEVICE batch_apply_shape_fn(int j, tensor input, + typename batch_apply_shape_fn_output::type* output, + const TensorProductQuadratureRule&, RAJA::LaunchContext) { constexpr bool apply_weights = false; constexpr tensor B1 = calculate_B1(); @@ -260,11 +272,6 @@ struct finite_element > { jy = (j % ((p + 1) * p)) / n; } - using source_t = decltype(dot(get<0>(get<0>(in_t{})), tensor{}) + get<1>(get<0>(in_t{})) * double{}); - using flux_t = decltype(dot(get<0>(get<1>(in_t{})), tensor{}) + get<1>(get<1>(in_t{})) * double{}); - - tensor, q * q> output; - for (int qy = 0; qy < q; qy++) { for (int qx = 0; qx < q; qx++) { tensor phi_j{(dir == 0) * B1(qx, jx) * B2(qy, jy), (dir == 1) * B1(qy, jy) * B2(qx, jx)}; @@ -277,15 +284,21 @@ struct finite_element > { auto& d10 = get<0>(get<1>(input(Q))); auto& d11 = get<1>(get<1>(input(Q))); - output[Q] = {dot(d00, phi_j) + d01 * curl_phi_j, dot(d10, phi_j) + d11 * curl_phi_j}; + (*output)[Q] = {dot(d00, phi_j) + d01 * curl_phi_j, dot(d10, phi_j) + d11 * curl_phi_j}; } } - - return output; } template - SERAC_HOST_DEVICE static auto interpolate(const dof_type& element_values, const TensorProductQuadratureRule&) + static auto interpolate_output_helper() + { + return tensor, double>, q * q>{}; + }; + + template + SERAC_HOST_DEVICE static void interpolate(const dof_type& element_values, const TensorProductQuadratureRule&, + tensor, double>, q * q>* output_ptr, + RAJA::LaunchContext) { constexpr bool apply_weights = false; constexpr tensor B1 = calculate_B1(); @@ -312,20 +325,18 @@ struct finite_element > { for (int qy = 0; qy < q; qy++) { for (int qx = 0; qx < q; qx++) { for (int i = 0; i < dim; i++) { - get(qf_inputs(count))[i] = value[i](qy, qx); + get((*output_ptr)(count))[i] = value[i](qy, qx); } - get(qf_inputs(count)) = curl(qy, qx); + get((*output_ptr)(count)) = curl(qy, qx); count++; } } - - return qf_inputs; } template SERAC_HOST_DEVICE static void integrate(const tensor, q * q>& qf_output, const TensorProductQuadratureRule&, dof_type* element_residual, - [[maybe_unused]] int step = 1) + RAJA::LaunchContext, [[maybe_unused]] int step = 1) { constexpr bool apply_weights = 
true; constexpr tensor B1 = calculate_B1(); @@ -359,7 +370,7 @@ struct finite_element > { #if 0 template - static SERAC_DEVICE auto interpolate(const dof_type& element_values, const tensor& J, + static SERAC_DEVICE void interpolate(const dof_type& element_values, const tensor& J, const TensorProductQuadratureRule& rule, cache_type& A) { int tidx = threadIdx.x % q; diff --git a/src/serac/numerics/functional/detail/quadrilateral_L2.inl b/src/serac/numerics/functional/detail/quadrilateral_L2.inl index 48ba045f0..68eeb15ae 100644 --- a/src/serac/numerics/functional/detail/quadrilateral_L2.inl +++ b/src/serac/numerics/functional/detail/quadrilateral_L2.inl @@ -10,14 +10,16 @@ * @brief Specialization of finite_element for L2 on quadrilateral geometry */ +#include "RAJA/RAJA.hpp" + // this specialization defines shape functions (and their gradients) that // interpolate at Gauss-Lobatto nodes for the appropriate polynomial order // // note: mfem assumes the parent element domain is [0,1]x[0,1] // for additional information on the finite_element concept requirements, see finite_element.hpp /// @cond -template -struct finite_element > { +template +struct finite_element, exec> { static constexpr auto geometry = mfem::Geometry::SQUARE; static constexpr auto family = Family::L2; static constexpr int components = c; @@ -29,15 +31,23 @@ struct finite_element > { static constexpr int SOURCE = 0, FLUX = 1; using residual_type = - typename std::conditional, tensor >::type; + typename std::conditional, tensor>::type; using dof_type = tensor; - using value_type = typename std::conditional >::type; + using value_type = typename std::conditional>::type; using derivative_type = - typename std::conditional, tensor >::type; + typename std::conditional, tensor>::type; using qf_input_type = tuple; + template + struct batch_apply_shape_fn_output { + using source_t = decltype(get<0>(get<0>(in_t{})) + dot(get<1>(get<0>(in_t{})), tensor{})); + using flux_t = decltype(get<0>(get<1>(in_t{})) + dot(get<1>(get<1>(in_t{})), tensor{})); + + using type = tensor, q * q>; + }; + /* interpolation nodes and their associated numbering: @@ -154,7 +164,9 @@ struct finite_element > { } template - static auto batch_apply_shape_fn(int j, tensor input, const TensorProductQuadratureRule&) + static void RAJA_HOST_DEVICE batch_apply_shape_fn(int j, tensor input, + typename batch_apply_shape_fn_output::type* output, + const TensorProductQuadratureRule&, RAJA::LaunchContext) { static constexpr bool apply_weights = false; static constexpr auto B = calculate_B(); @@ -163,11 +175,6 @@ struct finite_element > { int jx = j % n; int jy = j / n; - using source_t = decltype(get<0>(get<0>(in_t{})) + dot(get<1>(get<0>(in_t{})), tensor{})); - using flux_t = decltype(get<0>(get<1>(in_t{})) + dot(get<1>(get<1>(in_t{})), tensor{})); - - tensor, q * q> output; - for (int qy = 0; qy < q; qy++) { for (int qx = 0; qx < q; qx++) { double phi_j = B(qx, jx) * B(qy, jy); @@ -179,11 +186,15 @@ struct finite_element > { auto& d10 = get<0>(get<1>(input(Q))); auto& d11 = get<1>(get<1>(input(Q))); - output[Q] = {d00 * phi_j + dot(d01, dphi_j_dxi), d10 * phi_j + dot(d11, dphi_j_dxi)}; + (*output)[Q] = {d00 * phi_j + dot(d01, dphi_j_dxi), d10 * phi_j + dot(d11, dphi_j_dxi)}; } } + } - return output; + template + static auto interpolate_output_helper() + { + return tensor{}; } // we want to compute the following: @@ -201,7 +212,8 @@ struct finite_element > { // A(dy, qx) := B(qx, dx) * X_e(dy, dx) // X_q(qy, qx) := B(qy, dy) * A(dy, qx) template - SERAC_HOST_DEVICE static 
auto interpolate(const dof_type& X, const TensorProductQuadratureRule&) + SERAC_HOST_DEVICE static void interpolate(const dof_type& X, const TensorProductQuadratureRule&, + tensor* output_ptr, RAJA::LaunchContext ctx) { static constexpr bool apply_weights = false; static constexpr auto B = calculate_B(); @@ -222,8 +234,8 @@ struct finite_element > { // transpose the quadrature data into a flat tensor of tuples union { - tensor one_dimensional; - tensor, tensor >, q, q> two_dimensional; + tensor one_dimensional; + tensor, tensor>, q, q> two_dimensional; } output; for (int qy = 0; qy < q; qy++) { @@ -237,7 +249,14 @@ struct finite_element > { } } - return output.one_dimensional; + RAJA::TypedRangeSegment x_range(0, BLOCK_SZ); + if (output_ptr) { + RAJA::loop>(ctx, x_range, [&](int tid) { + if (tid < serac::size(output.one_dimensional)) { + ((*output_ptr))[tid] = output.one_dimensional[tid]; + } + }); + } } // source can be one of: {zero, double, tensor, tensor} @@ -246,7 +265,7 @@ struct finite_element > { template SERAC_HOST_DEVICE static void integrate(const tensor, q * q>& qf_output, const TensorProductQuadratureRule&, dof_type* element_residual, - int step = 1) + RAJA::LaunchContext, int step = 1) { if constexpr (is_zero{} && is_zero{}) { return; @@ -254,8 +273,8 @@ struct finite_element > { constexpr int ntrial = std::max(size(source_type{}), size(flux_type{}) / dim) / c; - using s_buffer_type = std::conditional_t{}, zero, tensor >; - using f_buffer_type = std::conditional_t{}, zero, tensor >; + using s_buffer_type = std::conditional_t{}, zero, tensor>; + using f_buffer_type = std::conditional_t{}, zero, tensor>; static constexpr bool apply_weights = true; static constexpr auto B = calculate_B(); @@ -291,7 +310,7 @@ struct finite_element > { #if 0 template - static SERAC_DEVICE auto interpolate(const dof_type& X, const tensor& J, + static SERAC_DEVICE void interpolate(const dof_type& X, const tensor& J, const TensorProductQuadratureRule& rule, cache_type& A) { int tidx = threadIdx.x % q; @@ -352,8 +371,6 @@ struct finite_element > { } get<1>(qf_input) = dot(get<1>(qf_input), inv(J)); - - return qf_input; } template diff --git a/src/serac/numerics/functional/detail/segment_H1.inl b/src/serac/numerics/functional/detail/segment_H1.inl index 11fe30bc2..3a9430516 100644 --- a/src/serac/numerics/functional/detail/segment_H1.inl +++ b/src/serac/numerics/functional/detail/segment_H1.inl @@ -10,14 +10,16 @@ * @brief Specialization of finite_element for H1 on segment geometry */ +#include "RAJA/RAJA.hpp" + // this specialization defines shape functions (and their gradients) that // interpolate at Gauss-Lobatto nodes for the appropriate polynomial order // // note: mfem assumes the parent element domain is [0,1] // for additional information on the finite_element concept requirements, see finite_element.hpp /// @cond -template -struct finite_element > { +template +struct finite_element, exec> { static constexpr auto geometry = mfem::Geometry::SEGMENT; static constexpr auto family = Family::H1; static constexpr int components = c; @@ -37,6 +39,14 @@ struct finite_element > { using residual_type = typename std::conditional, tensor >::type; + template + struct batch_apply_shape_fn_output { + using source_t = decltype(get<0>(get<0>(T{})) + get<1>(get<0>(T{}))); + using flux_t = decltype(get<0>(get<1>(T{})) + get<1>(get<1>(T{}))); + + using type = tensor, q>; + }; + SERAC_HOST_DEVICE static constexpr tensor shape_functions(double xi) { return GaussLobattoInterpolation(xi); @@ -94,17 +104,14 @@ struct 
finite_element > { } template - SERAC_HOST_DEVICE static auto batch_apply_shape_fn(int jx, tensor input, const TensorProductQuadratureRule&) + RAJA_HOST_DEVICE static auto RAJA_HOST_DEVICE + batch_apply_shape_fn(int jx, tensor input, typename batch_apply_shape_fn_output::type* output, + const TensorProductQuadratureRule&, RAJA::LaunchContext) { static constexpr bool apply_weights = false; static constexpr auto B = calculate_B(); static constexpr auto G = calculate_G(); - using source_t = decltype(get<0>(get<0>(T{})) + get<1>(get<0>(T{}))); - using flux_t = decltype(get<0>(get<1>(T{})) + get<1>(get<1>(T{}))); - - tensor, q> output; - for (int qx = 0; qx < q; qx++) { double phi_j = B(qx, jx); double dphi_j_dxi = G(qx, jx); @@ -114,14 +121,19 @@ struct finite_element > { auto& d10 = get<0>(get<1>(input(qx))); auto& d11 = get<1>(get<1>(input(qx))); - output[qx] = {d00 * phi_j + d01 * dphi_j_dxi, d10 * phi_j + d11 * dphi_j_dxi}; + (*output)[qx] = {d00 * phi_j + d01 * dphi_j_dxi, d10 * phi_j + d11 * dphi_j_dxi}; } + } - return output; + template + static auto interpolate_output_helper() + { + return tensor{}; } template - SERAC_HOST_DEVICE static auto interpolate(const dof_type& X, const TensorProductQuadratureRule&) + SERAC_HOST_DEVICE static void interpolate(const dof_type& X, const TensorProductQuadratureRule&, + tensor* output_ptr, RAJA::LaunchContext) { static constexpr bool apply_weights = false; static constexpr auto B = calculate_B(); @@ -137,27 +149,23 @@ struct finite_element > { } // transpose the quadrature data into a tensor of tuples - tensor output; - for (int qx = 0; qx < q; qx++) { if constexpr (c == 1) { - get(output(qx)) = value(0, qx); - get(output(qx)) = gradient(0, qx); + get((*output_ptr)(qx)) = value(0, qx); + get((*output_ptr)(qx)) = gradient(0, qx); } else { for (int i = 0; i < c; i++) { - get(output(qx))[i] = value(i, qx); - get(output(qx))[i] = gradient(i, qx); + get((*output_ptr)(qx))[i] = value(i, qx); + get((*output_ptr)(qx))[i] = gradient(i, qx); } } } - - return output; } template SERAC_HOST_DEVICE static void integrate(const tensor, q>& qf_output, const TensorProductQuadratureRule&, dof_type* element_residual, - [[maybe_unused]] int step = 1) + RAJA::LaunchContext, [[maybe_unused]] int step = 1) { if constexpr (is_zero{} && is_zero{}) { return; diff --git a/src/serac/numerics/functional/detail/segment_Hcurl.inl b/src/serac/numerics/functional/detail/segment_Hcurl.inl index 98677f684..9e1986aa7 100644 --- a/src/serac/numerics/functional/detail/segment_Hcurl.inl +++ b/src/serac/numerics/functional/detail/segment_Hcurl.inl @@ -18,8 +18,8 @@ // note: mfem assumes the parent element domain is [0,1] // for additional information on the finite_element concept requirements, see finite_element.hpp /// @cond -template -struct finite_element > { +template +struct finite_element, exec> { static constexpr auto geometry = mfem::Geometry::SEGMENT; static constexpr auto family = Family::HCURL; static constexpr int components = c; diff --git a/src/serac/numerics/functional/detail/segment_L2.inl b/src/serac/numerics/functional/detail/segment_L2.inl index d14603793..fffd55724 100644 --- a/src/serac/numerics/functional/detail/segment_L2.inl +++ b/src/serac/numerics/functional/detail/segment_L2.inl @@ -10,14 +10,16 @@ * @brief Specialization of finite_element for L2 on segment geometry */ +#include "RAJA/RAJA.hpp" + // this specialization defines shape functions (and their gradients) that // interpolate at Gauss-Lobatto nodes for the appropriate polynomial order // // note: mfem 
assumes the parent element domain is [0,1] // for additional information on the finite_element concept requirements, see finite_element.hpp /// @cond -template -struct finite_element > { +template +struct finite_element, exec> { static constexpr auto geometry = mfem::Geometry::SEGMENT; static constexpr auto family = Family::L2; static constexpr int components = c; @@ -37,6 +39,14 @@ struct finite_element > { using residual_type = typename std::conditional, tensor >::type; + template + struct batch_apply_shape_fn_output { + using source_t = decltype(get<0>(get<0>(T{})) + get<1>(get<0>(T{}))); + using flux_t = decltype(get<0>(get<1>(T{})) + get<1>(get<1>(T{}))); + + using type = tensor, q>; + }; + SERAC_HOST_DEVICE static constexpr tensor shape_functions(double xi) { return GaussLobattoInterpolation(xi); @@ -94,17 +104,14 @@ struct finite_element > { } template - static auto batch_apply_shape_fn(int jx, tensor input, const TensorProductQuadratureRule&) + static void RAJA_HOST_DEVICE batch_apply_shape_fn(int jx, tensor input, + typename batch_apply_shape_fn_output::type* output, + const TensorProductQuadratureRule&, RAJA::LaunchContext) { static constexpr bool apply_weights = false; static constexpr auto B = calculate_B(); static constexpr auto G = calculate_G(); - using source_t = decltype(get<0>(get<0>(T{})) + get<1>(get<0>(T{}))); - using flux_t = decltype(get<0>(get<1>(T{})) + get<1>(get<1>(T{}))); - - tensor, q> output; - for (int qx = 0; qx < q; qx++) { double phi_j = B(qx, jx); double dphi_j_dxi = G(qx, jx); @@ -114,14 +121,19 @@ struct finite_element > { auto& d10 = get<0>(get<1>(input(qx))); auto& d11 = get<1>(get<1>(input(qx))); - output[qx] = {d00 * phi_j + d01 * dphi_j_dxi, d10 * phi_j + d11 * dphi_j_dxi}; + (*output)[qx] = {d00 * phi_j + d01 * dphi_j_dxi, d10 * phi_j + d11 * dphi_j_dxi}; } + } - return output; + template + static auto interpolate_output_helper() + { + return tensor{}; } template - SERAC_HOST_DEVICE static auto interpolate(const dof_type& X, const TensorProductQuadratureRule&) + SERAC_HOST_DEVICE static void interpolate(const dof_type& X, const TensorProductQuadratureRule&, + tensor* output_ptr, RAJA::LaunchContext) { static constexpr bool apply_weights = false; static constexpr auto B = calculate_B(); @@ -136,28 +148,23 @@ struct finite_element > { gradient(i) = dot(G, X[i]); } - // transpose the quadrature data into a tensor of tuples - tensor output; - for (int qx = 0; qx < q; qx++) { if constexpr (c == 1) { - get(output(qx)) = value(0, qx); - get(output(qx)) = gradient(0, qx); + get((*output_ptr)(qx)) = value(0, qx); + get((*output_ptr)(qx)) = gradient(0, qx); } else { for (int i = 0; i < c; i++) { - get(output(qx))[i] = value(i, qx); - get(output(qx))[i] = gradient(i, qx); + get((*output_ptr)(qx))[i] = value(i, qx); + get((*output_ptr)(qx))[i] = gradient(i, qx); } } } - - return output; } template SERAC_HOST_DEVICE static void integrate(const tensor, q>& qf_output, const TensorProductQuadratureRule&, dof_type* element_residual, - [[maybe_unused]] int step = 1) + RAJA::LaunchContext, [[maybe_unused]] int step = 1) { if constexpr (is_zero{} && is_zero{}) { return; diff --git a/src/serac/numerics/functional/detail/tetrahedron_H1.inl b/src/serac/numerics/functional/detail/tetrahedron_H1.inl index 2a7613cee..df229336d 100644 --- a/src/serac/numerics/functional/detail/tetrahedron_H1.inl +++ b/src/serac/numerics/functional/detail/tetrahedron_H1.inl @@ -10,6 +10,8 @@ * @brief Specialization of finite_element for H1 on tetrahedron geometry */ +#include "RAJA/RAJA.hpp" 
+ // this specialization defines shape functions (and their gradients) that // interpolate at Gauss-Lobatto nodes for the appropriate polynomial order // @@ -19,8 +21,8 @@ // for detailed information about node locations for different polynomial orders, see // simplex_basis_function_unit_tests.cpp /// @cond -template -struct finite_element > { +template +struct finite_element, exec> { static constexpr auto geometry = mfem::Geometry::TETRAHEDRON; static constexpr auto family = Family::H1; static constexpr int components = c; @@ -34,15 +36,22 @@ struct finite_element > { static constexpr int SOURCE = 0, FLUX = 1; using residual_type = - typename std::conditional, tensor >::type; + typename std::conditional, tensor>::type; using dof_type = tensor; - using value_type = typename std::conditional >::type; + using value_type = typename std::conditional>::type; using derivative_type = - typename std::conditional, tensor >::type; + typename std::conditional, tensor>::type; using qf_input_type = tuple; + template + struct batch_apply_shape_fn_output { + using source_t = decltype(get<0>(get<0>(in_t{})) + dot(get<1>(get<0>(in_t{})), tensor{})); + using flux_t = decltype(get<0>(get<1>(in_t{})) + dot(get<1>(get<1>(in_t{})), tensor{})); + using type = tensor, nqpts(q)>; + }; + SERAC_HOST_DEVICE static constexpr double shape_function([[maybe_unused]] tensor xi, int i) { if constexpr (p == 1) { @@ -336,16 +345,14 @@ struct finite_element > { } template - static auto batch_apply_shape_fn(int j, tensor input, const TensorProductQuadratureRule&) + static auto RAJA_HOST_DEVICE batch_apply_shape_fn(int j, tensor input, + typename batch_apply_shape_fn_output::type* output, + const TensorProductQuadratureRule&, RAJA::LaunchContext& ctx) { - using source_t = decltype(get<0>(get<0>(in_t{})) + dot(get<1>(get<0>(in_t{})), tensor{})); - using flux_t = decltype(get<0>(get<1>(in_t{})) + dot(get<1>(get<1>(in_t{})), tensor{})); - constexpr auto xi = GaussLegendreNodes(); - tensor, nqpts(q)> output; - - for (int i = 0; i < nqpts(q); i++) { + auto x_range = RAJA::RangeSegment(0, nqpts(q)); + RAJA::loop::threads_t>(ctx, x_range, [&](int i) { double phi_j = shape_function(xi[i], j); tensor dphi_j_dxi = shape_function_gradient(xi[i], j); @@ -354,21 +361,28 @@ struct finite_element > { auto& d10 = get<0>(get<1>(input(i))); auto& d11 = get<1>(get<1>(input(i))); - output[i] = {d00 * phi_j + dot(d01, dphi_j_dxi), d10 * phi_j + dot(d11, dphi_j_dxi)}; - } + (*output)[i] = {d00 * phi_j + dot(d01, dphi_j_dxi), d10 * phi_j + dot(d11, dphi_j_dxi)}; + }); return output; } template - SERAC_HOST_DEVICE static auto interpolate(const tensor& X, const TensorProductQuadratureRule&) + static auto interpolate_output_helper() + { + return tensor{}; + } + + template + SERAC_HOST_DEVICE static void interpolate(const tensor& X, const TensorProductQuadratureRule&, + tensor* output_ptr, RAJA::LaunchContext ctx) { constexpr auto xi = GaussLegendreNodes(); // transpose the quadrature data into a flat tensor of tuples union { - tensor, tensor >, nqpts(q)> unflattened; - tensor flattened; + tensor, tensor>, nqpts(q)> unflattened; + tensor flattened; } output{}; for (int i = 0; i < c; i++) { @@ -380,20 +394,27 @@ struct finite_element > { } } - return output.flattened; + RAJA::TypedRangeSegment x_range(0, BLOCK_SZ); + if (output_ptr) { + RAJA::loop>(ctx, x_range, [&](int tid) { + if (tid < serac::size(output.flattened)) { + (*output_ptr)[tid] = output.flattened[tid]; + } + }); + } } template SERAC_HOST_DEVICE static void integrate(const tensor, nqpts(q)>& 
qf_output, const TensorProductQuadratureRule&, - tensor* element_residual, int step = 1) + tensor* element_residual, RAJA::LaunchContext, int step = 1) { if constexpr (is_zero{} && is_zero{}) { return; } using source_component_type = std::conditional_t{}, zero, double>; - using flux_component_type = std::conditional_t{}, zero, tensor >; + using flux_component_type = std::conditional_t{}, zero, tensor>; constexpr int ntrial = std::max(size(source_type{}), size(flux_type{}) / dim) / c; constexpr auto integration_points = GaussLegendreNodes(); diff --git a/src/serac/numerics/functional/detail/tetrahedron_L2.inl b/src/serac/numerics/functional/detail/tetrahedron_L2.inl index 25bc8ffd3..092ca905b 100644 --- a/src/serac/numerics/functional/detail/tetrahedron_L2.inl +++ b/src/serac/numerics/functional/detail/tetrahedron_L2.inl @@ -10,6 +10,8 @@ * @brief Specialization of finite_element for L2 on tetrahedron geometry */ +#include "RAJA/RAJA.hpp" + // this specialization defines shape functions (and their gradients) that // interpolate at Gauss-Lobatto nodes for the appropriate polynomial order // @@ -18,8 +20,8 @@ // // for exact positions of nodes for different polynomial orders, see simplex_basis_function_unit_tests.cpp /// @cond -template -struct finite_element > { +template +struct finite_element, exec> { static constexpr auto geometry = mfem::Geometry::TETRAHEDRON; static constexpr auto family = Family::L2; static constexpr int components = c; @@ -32,15 +34,22 @@ struct finite_element > { static constexpr int SOURCE = 0, FLUX = 1; using residual_type = - typename std::conditional, tensor >::type; + typename std::conditional, tensor>::type; using dof_type = tensor; - using value_type = typename std::conditional >::type; + using value_type = typename std::conditional>::type; using derivative_type = - typename std::conditional, tensor >::type; + typename std::conditional, tensor>::type; using qf_input_type = tuple; + template + struct batch_apply_shape_fn_output { + using source_t = decltype(get<0>(get<0>(in_t{})) + dot(get<1>(get<0>(in_t{})), tensor{})); + using flux_t = decltype(get<0>(get<1>(in_t{})) + dot(get<1>(get<1>(in_t{})), tensor{})); + using type = tensor, nqpts(q)>; + }; + SERAC_HOST_DEVICE static constexpr double shape_function([[maybe_unused]] tensor xi, [[maybe_unused]] int i) { @@ -341,15 +350,12 @@ struct finite_element > { } template - static auto batch_apply_shape_fn(int j, tensor input, const TensorProductQuadratureRule&) + static auto RAJA_HOST_DEVICE batch_apply_shape_fn(int j, tensor input, + typename batch_apply_shape_fn_output::type* output, + const TensorProductQuadratureRule&, RAJA::LaunchContext) { - using source_t = decltype(get<0>(get<0>(in_t{})) + dot(get<1>(get<0>(in_t{})), tensor{})); - using flux_t = decltype(get<0>(get<1>(in_t{})) + dot(get<1>(get<1>(in_t{})), tensor{})); - constexpr auto xi = GaussLegendreNodes(); - tensor, nqpts(q)> output; - for (int i = 0; i < nqpts(q); i++) { double phi_j = shape_function(xi[i], j); tensor dphi_j_dxi = shape_function_gradient(xi[i], j); @@ -359,21 +365,28 @@ struct finite_element > { auto& d10 = get<0>(get<1>(input(i))); auto& d11 = get<1>(get<1>(input(i))); - output[i] = {d00 * phi_j + dot(d01, dphi_j_dxi), d10 * phi_j + dot(d11, dphi_j_dxi)}; + (*output)[i] = {d00 * phi_j + dot(d01, dphi_j_dxi), d10 * phi_j + dot(d11, dphi_j_dxi)}; } return output; } template - SERAC_HOST_DEVICE static auto interpolate(const tensor& X, const TensorProductQuadratureRule&) + static auto interpolate_output_helper() + { + return tensor{}; + 
} + + template + SERAC_HOST_DEVICE static void interpolate(const tensor& X, const TensorProductQuadratureRule&, + tensor* output_ptr, RAJA::LaunchContext ctx) { constexpr auto xi = GaussLegendreNodes(); // transpose the quadrature data into a flat tensor of tuples union { - tensor, tensor >, nqpts(q)> unflattened; - tensor flattened; + tensor, tensor>, nqpts(q)> unflattened; + tensor flattened; } output{}; for (int i = 0; i < c; i++) { @@ -385,20 +398,28 @@ struct finite_element > { } } - return output.flattened; + RAJA::TypedRangeSegment x_range(0, BLOCK_SZ); + if (output_ptr) { + RAJA::loop>(ctx, x_range, [&](int tid) { + if (tid < serac::size(output.flattened)) { + get(((*output_ptr))[tid]) = get(output.flattened[tid]); + get(((*output_ptr))[tid]) = get(output.flattened[tid]); + } + }); + } } template SERAC_HOST_DEVICE static void integrate(const tensor, nqpts(q)>& qf_output, const TensorProductQuadratureRule&, - tensor* element_residual, int step = 1) + tensor* element_residual, RAJA::LaunchContext, int step = 1) { if constexpr (is_zero{} && is_zero{}) { return; } using source_component_type = std::conditional_t{}, zero, double>; - using flux_component_type = std::conditional_t{}, zero, tensor >; + using flux_component_type = std::conditional_t{}, zero, tensor>; constexpr int ntrial = std::max(size(source_type{}), size(flux_type{}) / dim) / c; constexpr auto integration_points = GaussLegendreNodes(); diff --git a/src/serac/numerics/functional/detail/triangle_H1.inl b/src/serac/numerics/functional/detail/triangle_H1.inl index 37c1600fa..6b4b5623d 100644 --- a/src/serac/numerics/functional/detail/triangle_H1.inl +++ b/src/serac/numerics/functional/detail/triangle_H1.inl @@ -10,6 +10,8 @@ * @brief Specialization of finite_element for H1 on triangle geometry */ +#include "RAJA/RAJA.hpp" + // this specialization defines shape functions (and their gradients) that // interpolate at Gauss-Lobatto nodes for the appropriate polynomial order // @@ -19,8 +21,8 @@ // for detailed information about node locations for different polynomial orders, see // simplex_basis_function_unit_tests.cpp /// @cond -template -struct finite_element > { +template +struct finite_element, exec> { static constexpr auto geometry = mfem::Geometry::TRIANGLE; static constexpr auto family = Family::H1; static constexpr int components = c; @@ -33,15 +35,23 @@ struct finite_element > { static constexpr int SOURCE = 0, FLUX = 1; using residual_type = - typename std::conditional, tensor >::type; + typename std::conditional, tensor>::type; using dof_type = tensor; - using value_type = typename std::conditional >::type; + using value_type = typename std::conditional>::type; using derivative_type = - typename std::conditional, tensor >::type; + typename std::conditional, tensor>::type; using qf_input_type = tuple; + template + struct batch_apply_shape_fn_output { + using source_t = decltype(get<0>(get<0>(in_t{})) + dot(get<1>(get<0>(in_t{})), tensor{})); + using flux_t = decltype(get<0>(get<1>(in_t{})) + dot(get<1>(get<1>(in_t{})), tensor{})); + static constexpr int Q = q * (q + 1) / 2; + using type = tensor, Q>; + }; + /* interpolation nodes and their associated numbering: @@ -240,15 +250,12 @@ struct finite_element > { } template - static auto batch_apply_shape_fn(int j, tensor input, const TensorProductQuadratureRule&) + static void RAJA_HOST_DEVICE batch_apply_shape_fn(int j, tensor input, + typename batch_apply_shape_fn_output::type* output, + const TensorProductQuadratureRule&, RAJA::LaunchContext) { - using source_t = 
decltype(get<0>(get<0>(in_t{})) + dot(get<1>(get<0>(in_t{})), tensor{})); - using flux_t = decltype(get<0>(get<1>(in_t{})) + dot(get<1>(get<1>(in_t{})), tensor{})); - - constexpr auto xi = GaussLegendreNodes(); - - static constexpr int Q = q * (q + 1) / 2; - tensor, Q> output; + constexpr auto xi = GaussLegendreNodes(); + static constexpr int Q = q * (q + 1) / 2; for (int i = 0; i < Q; i++) { double phi_j = shape_function(xi[i], j); @@ -259,22 +266,27 @@ struct finite_element > { auto& d10 = get<0>(get<1>(input(i))); auto& d11 = get<1>(get<1>(input(i))); - output[i] = {d00 * phi_j + dot(d01, dphi_j_dxi), d10 * phi_j + dot(d11, dphi_j_dxi)}; + (*output)[i] = {d00 * phi_j + dot(d01, dphi_j_dxi), d10 * phi_j + dot(d11, dphi_j_dxi)}; } + } - return output; + template + static auto interpolate_output_helper() + { + return tensor{}; } template - SERAC_HOST_DEVICE static auto interpolate(const tensor& X, const TensorProductQuadratureRule&) + SERAC_HOST_DEVICE static void interpolate(const tensor& X, const TensorProductQuadratureRule&, + tensor* output_ptr, RAJA::LaunchContext ctx) { constexpr auto xi = GaussLegendreNodes(); static constexpr int num_quadrature_points = q * (q + 1) / 2; // transpose the quadrature data into a flat tensor of tuples union { - tensor, tensor >, num_quadrature_points> unflattened; - tensor flattened; + tensor, tensor>, num_quadrature_points> unflattened; + tensor flattened; } output{}; for (int i = 0; i < c; i++) { @@ -286,20 +298,29 @@ struct finite_element > { } } - return output.flattened; + RAJA::TypedRangeSegment x_range(0, BLOCK_SZ); + + if (output_ptr) { + RAJA::loop>(ctx, x_range, [&](int tid) { + if (tid < serac::size(output.flattened)) { + get(((*output_ptr))[tid]) = get(output.flattened[tid]); + get(((*output_ptr))[tid]) = get(output.flattened[tid]); + } + }); + } } template SERAC_HOST_DEVICE static void integrate(const tensor, q*(q + 1) / 2>& qf_output, const TensorProductQuadratureRule&, - tensor* element_residual, int step = 1) + tensor* element_residual, RAJA::LaunchContext, int step = 1) { if constexpr (is_zero{} && is_zero{}) { return; } using source_component_type = std::conditional_t{}, zero, double>; - using flux_component_type = std::conditional_t{}, zero, tensor >; + using flux_component_type = std::conditional_t{}, zero, tensor>; constexpr int num_quadrature_points = q * (q + 1) / 2; constexpr int ntrial = std::max(size(source_type{}), size(flux_type{}) / dim) / c; diff --git a/src/serac/numerics/functional/detail/triangle_L2.inl b/src/serac/numerics/functional/detail/triangle_L2.inl index 2030008da..2005ca150 100644 --- a/src/serac/numerics/functional/detail/triangle_L2.inl +++ b/src/serac/numerics/functional/detail/triangle_L2.inl @@ -10,6 +10,8 @@ * @brief Specialization of finite_element for L2 on triangle geometry */ +#include "RAJA/RAJA.hpp" + // this specialization defines shape functions (and their gradients) that // interpolate at Gauss-Lobatto nodes for the appropriate polynomial order // @@ -18,8 +20,8 @@ // // for exact positions of nodes for different polynomial orders, see simplex_basis_function_unit_tests.cpp /// @cond -template -struct finite_element > { +template +struct finite_element, exec> { static constexpr auto geometry = mfem::Geometry::TRIANGLE; static constexpr auto family = Family::L2; static constexpr int components = c; @@ -31,15 +33,23 @@ struct finite_element > { static constexpr int SOURCE = 0, FLUX = 1; using residual_type = - typename std::conditional, tensor >::type; + typename std::conditional, tensor>::type; 
using dof_type = tensor; - using value_type = typename std::conditional >::type; + using value_type = typename std::conditional>::type; using derivative_type = - typename std::conditional, tensor >::type; + typename std::conditional, tensor>::type; using qf_input_type = tuple; + template + struct batch_apply_shape_fn_output { + using source_t = decltype(get<0>(get<0>(in_t{})) + dot(get<1>(get<0>(in_t{})), tensor{})); + using flux_t = decltype(get<0>(get<1>(in_t{})) + dot(get<1>(get<1>(in_t{})), tensor{})); + static constexpr int Q = q * (q + 1) / 2; + using type = tensor, Q>; + }; + /* interpolation nodes and their associated numbering: @@ -249,16 +259,12 @@ struct finite_element > { } template - static auto batch_apply_shape_fn(int j, tensor input, const TensorProductQuadratureRule&) + static auto RAJA_HOST_DEVICE batch_apply_shape_fn(int j, tensor input, + typename batch_apply_shape_fn_output::type* output, + const TensorProductQuadratureRule&, RAJA::LaunchContext) { - using source_t = decltype(get<0>(get<0>(in_t{})) + dot(get<1>(get<0>(in_t{})), tensor{})); - using flux_t = decltype(get<0>(get<1>(in_t{})) + dot(get<1>(get<1>(in_t{})), tensor{})); - - constexpr auto xi = GaussLegendreNodes(); - - static constexpr int Q = q * (q + 1) / 2; - tensor, Q> output; - + constexpr auto xi = GaussLegendreNodes(); + static constexpr int Q = q * (q + 1) / 2; for (int i = 0; i < Q; i++) { double phi_j = shape_function(xi[i], j); tensor dphi_j_dxi = shape_function_gradient(xi[i], j); @@ -268,22 +274,27 @@ struct finite_element > { auto& d10 = get<0>(get<1>(input(i))); auto& d11 = get<1>(get<1>(input(i))); - output[i] = {d00 * phi_j + dot(d01, dphi_j_dxi), d10 * phi_j + dot(d11, dphi_j_dxi)}; + (*output)[i] = {d00 * phi_j + dot(d01, dphi_j_dxi), d10 * phi_j + dot(d11, dphi_j_dxi)}; } + } - return output; + template + static auto interpolate_output_helper() + { + return tensor{}; } template - SERAC_HOST_DEVICE static auto interpolate(const tensor& X, const TensorProductQuadratureRule&) + SERAC_HOST_DEVICE static void interpolate(const tensor& X, const TensorProductQuadratureRule&, + tensor* output_ptr, RAJA::LaunchContext ctx) { constexpr auto xi = GaussLegendreNodes(); static constexpr int num_quadrature_points = q * (q + 1) / 2; // transpose the quadrature data into a flat tensor of tuples union { - tensor, tensor >, num_quadrature_points> unflattened; - tensor flattened; + tensor, tensor>, num_quadrature_points> unflattened; + tensor flattened; } output{}; for (int i = 0; i < c; i++) { @@ -295,20 +306,28 @@ struct finite_element > { } } - return output.flattened; + RAJA::TypedRangeSegment x_range(0, BLOCK_SZ); + if (output_ptr) { + RAJA::loop>(ctx, x_range, [&](int tid) { + if (tid < serac::size(output.flattened)) { + get(((*output_ptr))[tid]) = get(output.flattened[tid]); + get(((*output_ptr))[tid]) = get(output.flattened[tid]); + } + }); + } } template SERAC_HOST_DEVICE static void integrate(const tensor, q*(q + 1) / 2>& qf_output, const TensorProductQuadratureRule&, - tensor* element_residual, int step = 1) + tensor* element_residual, RAJA::LaunchContext, int step = 1) { if constexpr (is_zero{} && is_zero{}) { return; } using source_component_type = std::conditional_t{}, zero, double>; - using flux_component_type = std::conditional_t{}, zero, tensor >; + using flux_component_type = std::conditional_t{}, zero, tensor>; constexpr int num_quadrature_points = q * (q + 1) / 2; constexpr int ntrial = std::max(size(source_type{}), size(flux_type{}) / dim) / c; diff --git 
a/src/serac/numerics/functional/dof_numbering.hpp b/src/serac/numerics/functional/dof_numbering.hpp index 089e3065c..ba607c5d1 100644 --- a/src/serac/numerics/functional/dof_numbering.hpp +++ b/src/serac/numerics/functional/dof_numbering.hpp @@ -333,6 +333,7 @@ struct GradientAssemblyLookupTables { col_ind.resize(nnz); row_ptr[0] = 0; + col_ind[0] = int(entries[0].column); for (uint32_t i = 1; i < nnz; i++) { diff --git a/src/serac/numerics/functional/domain.cpp b/src/serac/numerics/functional/domain.cpp index 3d39e2d0a..74ade55ec 100644 --- a/src/serac/numerics/functional/domain.cpp +++ b/src/serac/numerics/functional/domain.cpp @@ -240,10 +240,10 @@ static Domain domain_of_elems(const mfem::Mesh& mfem::Vector vertices; mesh.GetVertices(vertices); - int tri_id = 0; - int quad_id = 0; - int tet_id = 0; - int hex_id = 0; + [[maybe_unused]] int tri_id = 0; + [[maybe_unused]] int quad_id = 0; + [[maybe_unused]] int tet_id = 0; + [[maybe_unused]] int hex_id = 0; // elements that satisfy the predicate are added to the domain int num_elems = mesh.GetNE(); diff --git a/src/serac/numerics/functional/domain_integral_kernels.hpp b/src/serac/numerics/functional/domain_integral_kernels.hpp index c218025fb..d18bdf62b 100644 --- a/src/serac/numerics/functional/domain_integral_kernels.hpp +++ b/src/serac/numerics/functional/domain_integral_kernels.hpp @@ -12,6 +12,8 @@ #include "serac/numerics/functional/differentiate_wrt.hpp" #include "RAJA/RAJA.hpp" +#include +#include #include #include @@ -64,6 +66,41 @@ struct QFunctionArgument, Dimension<3>> { using type = tuple, tensor>; ///< what will be passed to the q-function }; +/** + * @tparam lambda the qfunction type + * @tparam state_type describes state type, if there is one + * @tparam dim describes whether the problem is 1D, 2D, or 3D + * @tparam n length of output tensor + * @tparam T... extra args + * + * @brief a struct used to deduce the return type of a q-function. + */ +template +struct QFunctionOutput { + /// @brief Typedef for the position argument of a qf func + using position_type = serac::tuple, tensor>; + /// @brief static method for deducing the return type of a given qf function. + /// Many objects of type lambda are actually functors that aren't trivially + /// constructable with closed brace constructor {}, so qf must be passed in + /// to a function. + static auto get_type(const lambda& qf) + { + state_type state_l_value{}; + return tensor{}; + } +}; + +/// @overload +template +struct QFunctionOutput { + /// @brief Typedef for the position argument of a qf func + using position_type = serac::tuple, tensor>; + /// @brief This method is identical to the above get_type method but doesn't + /// trivially construct a state, because this partial specialization is for + /// Nothing type states. + static auto get_type(const lambda& qf) { return tensor{}; } +}; + /// @brief layer of indirection needed to unpack the entries of the argument tuple SERAC_SUPPRESS_NVCC_HOSTDEVICE_WARNING template @@ -102,14 +139,14 @@ auto get_derivative_type(const lambda& qf, qpt_data_type qpt_data) make_dual_wrt(qf_arguments{}))); }; -template -SERAC_HOST_DEVICE auto batch_apply_qf_no_qdata(const lambda& qf, double t, const tensor& x, - const tensor& J, const T&... inputs) +template +SERAC_HOST_DEVICE auto batch_apply_qf_no_qdata( + const lambda& qf, double t, const tensor x, const tensor J, + decltype(QFunctionOutput::get_type(qf))* qf_output, RAJA::LaunchContext ctx, + const T&... 
inputs) { - using position_t = serac::tuple, tensor>; - using return_type = decltype(qf(double{}, position_t{}, T{}[0]...)); - tensor outputs{}; - for (int i = 0; i < n; i++) { + RAJA::RangeSegment x_range(0, n); + RAJA::loop::threads_t>(ctx, x_range, [&](int i) { tensor x_q; tensor J_q; for (int j = 0; j < dim; j++) { @@ -118,20 +155,18 @@ SERAC_HOST_DEVICE auto batch_apply_qf_no_qdata(const lambda& qf, double t, const } x_q[j] = x(j, i); } - outputs[i] = qf(t, serac::tuple{x_q, J_q}, inputs[i]...); - } - return outputs; + (*qf_output)[i] = qf(t, serac::tuple{x_q, J_q}, inputs[i]...); + }); } -template -SERAC_HOST_DEVICE auto batch_apply_qf(const lambda& qf, double t, const tensor& x, - const tensor& J, qpt_data_type* qpt_data, bool update_state, - const T&... inputs) +template +SERAC_HOST_DEVICE auto batch_apply_qf( + const lambda& qf, double t, const tensor x, const tensor J, + decltype(QFunctionOutput::get_type(qf))* qf_output, qpt_data_type* qpt_data, + bool update_state, RAJA::LaunchContext ctx, const T&... inputs) { - using position_t = serac::tuple, tensor>; - using return_type = decltype(qf(double{}, position_t{}, qpt_data[0], T{}[0]...)); - tensor outputs{}; - for (int i = 0; i < n; i++) { + RAJA::RangeSegment x_range(0, n); + RAJA::loop::threads_t>(ctx, x_range, [&](int i) { tensor x_q; tensor J_q; for (int j = 0; j < dim; j++) { @@ -140,19 +175,18 @@ SERAC_HOST_DEVICE auto batch_apply_qf(const lambda& qf, double t, const tensor -void evaluation_kernel_impl(trial_element_tuple trial_elements, test_element, double t, +template +void evaluation_kernel_impl(trial_element_tuple_type trial_elements, test_element_type, double t, const std::vector& inputs, double* outputs, const double* positions, const double* jacobians, lambda_type qf, [[maybe_unused]] axom::ArrayView qf_state, @@ -161,62 +195,117 @@ void evaluation_kernel_impl(trial_element_tuple trial_elements, test_element, do { // mfem provides this information as opaque arrays of doubles, // so we reinterpret the pointer with - auto r = reinterpret_cast(outputs); - auto x = reinterpret_cast::type*>(positions); - auto J = reinterpret_cast::type*>(jacobians); + using X_Type = typename batched_position::type; + using J_Type = typename batched_jacobian::type; + auto r = reinterpret_cast(outputs); + auto x = const_cast(reinterpret_cast(positions)); + auto J = const_cast(reinterpret_cast(jacobians)); + TensorProductQuadratureRule rule{}; - [[maybe_unused]] auto qpts_per_elem = num_quadrature_points(geom, Q); + auto qpts_per_elem = num_quadrature_points(geom, Q); + constexpr int dim = dimension<0>(X_Type{}); + constexpr int n = dimension<1>(X_Type{}); [[maybe_unused]] tuple u = { reinterpret_cast(trial_elements))::dof_type*>(inputs[indices])...}; - - // for each element in the domain - for (uint32_t e = 0; e < num_elements; ++e) { - // load the jacobians and positions for each quadrature point in this element - auto J_e = J[e]; - auto x_e = x[e]; - - //[[maybe_unused]] static constexpr trial_element_tuple trial_element_tuple{}; - // batch-calculate values / derivatives of each trial space, at each quadrature point - [[maybe_unused]] tuple qf_inputs = {promote_each_to_dual_when( - get(trial_elements).interpolate(get(u)[elements[e]], rule))...}; - - // use J_e to transform values / derivatives on the parent element - // to the to the corresponding values / derivatives on the physical element - (parent_to_physical(trial_elements).family>(get(qf_inputs), J_e), ...); - - // (batch) evalute the q-function at each quadrature point - // - // 
note: the weird immediately-invoked lambda expression is - // a workaround for a bug in GCC(<12.0) where it fails to - // decide which function overload to use, and crashes - auto qf_outputs = [&]() { - if constexpr (std::is_same_v) { - return batch_apply_qf_no_qdata(qf, t, x_e, J_e, get(qf_inputs)...); - } else { - return batch_apply_qf(qf, t, x_e, J_e, &qf_state(e, 0), update_state, get(qf_inputs)...); - } - }(); - - // use J to transform sources / fluxes on the physical element - // back to the corresponding sources / fluxes on the parent element - physical_to_parent(qf_outputs, J_e); - - // write out the q-function derivatives after applying the - // physical_to_parent transformation, so that those transformations - // won't need to be applied in the action_of_gradient and element_gradient kernels - if constexpr (differentiation_index != serac::NO_DIFFERENTIATION) { - for (int q = 0; q < leading_dimension(qf_outputs); q++) { - qf_derivatives[e * uint32_t(qpts_per_elem) + uint32_t(q)] = get_gradient(qf_outputs[q]); - } - } - - // (batch) integrate the material response against the test-space basis functions - test_element::integrate(get_value(qf_outputs), rule, &r[elements[e]]); - } - - return; + // Type definitions for the intermediate data structures used during kernel evalutaion + using interpolate_out_type = decltype(tuple{get(trial_elements).template interpolate_output_helper()...}); + using qf_inputs_type = decltype(tuple{promote_each_to_dual_when_output_helper( + get(trial_elements).template interpolate_output_helper())...}); + using qf_outputs_type = + decltype(QFunctionOutput(qf_inputs_type{}))...>::get_type( + qf)); + + // Determine allocation destination based on exec parameter + std::string device_name = exec == ExecutionSpace::GPU ? "DEVICE" : "HOST"; + auto& rm = umpire::ResourceManager::getInstance(); + auto allocator = rm.getAllocator(device_name); + // Perform allocations + qf_inputs_type* qf_inputs = static_cast(allocator.allocate(sizeof(qf_inputs_type) * num_elements)); + interpolate_out_type* interpolate_result = + static_cast(allocator.allocate(sizeof(interpolate_out_type) * num_elements)); + qf_outputs_type* qf_outputs = + static_cast(allocator.allocate(sizeof(qf_outputs_type) * num_elements)); + // Zero out memory + rm.memset(qf_inputs, 0); + rm.memset(qf_outputs, 0); + rm.memset(interpolate_result, 0); + + // Launch the execution kernel. On GPU hardware this kernel dispatches one element per thread block and executes + // as many computations in shared memory as possible. + auto e_range = RAJA::TypedRangeSegment(0, num_elements); + RAJA::launch::launch_t>( + RAJA::LaunchParams(RAJA::Teams(static_cast(num_elements)), RAJA::Threads(BLOCK_SZ)), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + RAJA::loop::teams_t>( + ctx, e_range, + // The explicit capture list is needed here because the capture occurs in a function template with a + // variadic non-type parameter. 
+ [&ctx, t, J, x, u, trial_elements, qf, qpts_per_elem, rule, r, qf_state, elements, qf_derivatives, + qf_inputs, qf_outputs, interpolate_result, update_state](uint32_t e) { + detail::suppress_capture_warnings(qf_state, qf_derivatives, update_state, qpts_per_elem, trial_elements, + qf_inputs, interpolate_result, u); + + [[maybe_unused]] static constexpr trial_element_tuple_type empty_trial_element{}; + // batch-calculate values / derivatives of each trial space, at each quadrature point + (get(trial_elements) + .interpolate(get(u)[elements[e]], rule, &get(interpolate_result[e]), ctx), + ...); + ctx.teamSync(); + + (promote_each_to_dual_when(get(interpolate_result[e]), + &get(qf_inputs[e]), ctx), + ...); + ctx.teamSync(); + + // use J_e to transform values / derivatives on the parent element + // to the to the corresponding values / derivatives on the physical element + (parent_to_physical(empty_trial_element).family, exec>(get(qf_inputs[e]), J, e, + ctx), + ...); + ctx.teamSync(); + + // (batch) evalute the q-function at each quadrature point + // + // note: the weird immediately-invoked lambda expression is + // a workaround for a bug in GCC(<12.0) where it fails to + // decide which function overload to use, and crashes + if constexpr (std::is_same_v) { + batch_apply_qf_no_qdata(qf, t, x[e], J[e], &(qf_outputs[e]), ctx, get(qf_inputs[e])...); + } + + if constexpr (!std::is_same_v) { + batch_apply_qf(qf, t, x[e], J[e], &(qf_outputs[e]), &qf_state(e, 0), update_state, ctx, + get(qf_inputs[e])...); + } + ctx.teamSync(); + + // use J to transform sources / fluxes on the physical element + // back to the corresponding sources / fluxes on the parent element + physical_to_parent(qf_outputs[e], J, e, ctx); + ctx.teamSync(); + + // write out the q-function derivatives after applying the + // physical_to_parent transformation, so that those transformations + // won't need to be applied in the action_of_gradient and element_gradient kernels + if constexpr (differentiation_index != serac::NO_DIFFERENTIATION) { + RAJA::RangeSegment x_range(0, leading_dimension(qf_outputs[e])); + RAJA::loop::threads_t>(ctx, x_range, [&](int q) { + qf_derivatives[e * uint32_t(qpts_per_elem) + uint32_t(q)] = get_gradient(qf_outputs[e][q]); + }); + } + ctx.teamSync(); + + // (batch) integrate the material response against the test-space basis functions + test_element_type::integrate(get_value(qf_outputs[e]), rule, &r[elements[e]], ctx); + ctx.teamSync(); + }); + }); + + allocator.deallocate(qf_inputs); + allocator.deallocate(interpolate_result); + allocator.deallocate(qf_outputs); } //clang-format off @@ -237,15 +326,14 @@ SERAC_HOST_DEVICE auto chain_rule(const S& dfdx, const T& dx) } //clang-format on -template -SERAC_HOST_DEVICE auto batch_apply_chain_rule(derivative_type* qf_derivatives, const tensor& inputs) +template +SERAC_HOST_DEVICE void batch_apply_chain_rule(derivative_type* qf_derivatives, const tensor& inputs, + tensor(derivative_type{}, T{})), n>& outputs, + const RAJA::LaunchContext& ctx) { - using return_type = decltype(chain_rule(derivative_type{}, T{})); - tensor outputs{}; - for (int i = 0; i < n; i++) { - outputs[i] = chain_rule(qf_derivatives[i], inputs[i]); - } - return outputs; + RAJA::RangeSegment i_range(0, n); + RAJA::loop::threads_t>( + ctx, i_range, [&](int i) { outputs[i] = chain_rule(qf_derivatives[i], inputs[i]); }); } /** @@ -274,33 +362,55 @@ SERAC_HOST_DEVICE auto batch_apply_chain_rule(derivative_type* qf_derivatives, c * @param[in] num_elements The number of elements in the mesh */ 
-template +template void action_of_gradient_kernel(const double* dU, double* dR, derivatives_type* qf_derivatives, const int* elements, std::size_t num_elements) { - using test_element = finite_element; - using trial_element = finite_element; + std::string device_name = exec == ExecutionSpace::GPU ? "DEVICE" : "HOST"; + + using test_element = finite_element; + using trial_element = finite_element; + using qf_inputs_type = decltype(trial_element::template interpolate_output_helper()); constexpr bool is_QOI = (test::family == Family::QOI); constexpr int num_qpts = num_quadrature_points(g, Q); - // mfem provides this information in 1D arrays, so we reshape it - // into strided multidimensional arrays before using + // mfem provides this information in 1D arrays, so we reshape it into strided multidimensional arrays before using auto du = reinterpret_cast(dU); auto dr = reinterpret_cast(dR); constexpr TensorProductQuadratureRule rule{}; - // for each element in the domain - for (uint32_t e = 0; e < num_elements; e++) { - // (batch) interpolate each quadrature point's value - auto qf_inputs = trial_element::interpolate(du[elements[e]], rule); - - // (batch) evalute the q-function at each quadrature point - auto qf_outputs = batch_apply_chain_rule(qf_derivatives + e * num_qpts, qf_inputs); - - // (batch) integrate the material response against the test-space basis functions - test_element::integrate(qf_outputs, rule, &dr[elements[e]]); - } + // Determine allocation destination based on exec parameter + auto& rm = umpire::ResourceManager::getInstance(); + // Perform allocations + auto allocator = rm.getAllocator(device_name); + qf_inputs_type* qf_inputs = static_cast(allocator.allocate(sizeof(qf_inputs_type) * num_elements)); + // Zero out memory + rm.memset(qf_inputs, 0); + + // This typedef is needed to declare qf_outputs in shared memory. + using chain_rule_dtype = decltype(chain_rule(derivatives_type{}, typename trial_element::qf_input_type{})); + using qf_outputs_type = tensor; + // Launch the execution kernel. On GPU hardware this kernel dispatches one element per thread block and executes + // as many computations in shared memory as possible. 
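+  // One team is launched per element, with BLOCK_SZ threads cooperating inside each team:
+  // the interpolated quadrature values are staged in the Umpire-allocated qf_inputs buffer,
+  // the chain-rule result lives in RAJA_TEAM_SHARED storage, and ctx.teamSync() separates
+  // the interpolate / chain-rule / integrate phases so no thread reads partially written data.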
+ auto e_range = RAJA::TypedRangeSegment(0, num_elements); + RAJA::launch::launch_t>( + RAJA::LaunchParams(RAJA::Teams(static_cast(num_elements)), RAJA::Threads(BLOCK_SZ)), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + RAJA::loop::teams_t>(ctx, e_range, [&](int e) { + // (batch) interpolate each quadrature point's value + trial_element::interpolate(du[elements[e]], rule, &(qf_inputs[e]), ctx); + ctx.teamSync(); + + RAJA_TEAM_SHARED qf_outputs_type qf_outputs; + batch_apply_chain_rule(qf_derivatives + e * num_qpts, qf_inputs[e], qf_outputs, ctx); + ctx.teamSync(); + + test_element::integrate(qf_outputs, rule, &dr[elements[e]], ctx); + ctx.teamSync(); + }); + }); + rm.deallocate(qf_inputs); } /** @@ -324,76 +434,102 @@ void action_of_gradient_kernel(const double* dU, double* dR, derivatives_type* q * @see mfem::GeometricFactors * @param[in] num_elements The number of elements in the mesh */ -template -void element_gradient_kernel(ExecArrayView dK, derivatives_type* qf_derivatives, - const int* elements, std::size_t num_elements) +template +void element_gradient_kernel(ExecArrayView dK, derivatives_type* qf_derivatives, const int* elements, + std::size_t num_elements) { // quantities of interest have no flux term, so we pad the derivative // tuple with a "zero" type in the second position to treat it like the standard case - constexpr bool is_QOI = test::family == Family::QOI; + constexpr bool is_QOI = test::family == Family::QOI; + constexpr int nquad = num_quadrature_points(g, Q); + std::string device_name = exec == ExecutionSpace::GPU ? "DEVICE" : "HOST"; + constexpr TensorProductQuadratureRule rule{}; using padded_derivative_type = std::conditional_t, derivatives_type>; - - using test_element = finite_element; - using trial_element = finite_element; - - constexpr int nquad = num_quadrature_points(g, Q); - - static constexpr TensorProductQuadratureRule rule{}; - - // for each element in the domain - for (uint32_t e = 0; e < num_elements; e++) { - auto* output_ptr = reinterpret_cast(&dK(elements[e], 0, 0)); - - tensor derivatives{}; - for (int q = 0; q < nquad; q++) { - if constexpr (is_QOI) { - get<0>(derivatives(q)) = qf_derivatives[e * nquad + uint32_t(q)]; - } else { - derivatives(q) = qf_derivatives[e * nquad + uint32_t(q)]; - } - } - - for (int J = 0; J < trial_element::ndof; J++) { - auto source_and_flux = trial_element::batch_apply_shape_fn(J, derivatives, rule); - test_element::integrate(source_and_flux, rule, output_ptr + J, trial_element::ndof); - } - } + using test_element = finite_element; + using trial_element = finite_element; + + auto& rm = umpire::ResourceManager::getInstance(); + auto allocator = rm.getAllocator(device_name); + // TODO(cuda): Use a dynamic shared memory buffer to avoid allocating device memory here. + tensor* derivatives = static_cast*>( + allocator.allocate(sizeof(tensor) * num_elements)); + + // Launch the execution kernel. On GPU hardware this kernel dispatches one element per thread block and executes + // as many computations in shared memory as possible. 
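+  // One team per element here as well: the threads of a team first stage that element's
+  // qf_derivatives into the scratch `derivatives` buffer, then loop over the trial dofs J,
+  // applying the shape functions into the RAJA_TEAM_SHARED source_and_flux tensor and
+  // integrating it against the test basis, with ctx.teamSync() between the two steps.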
+ RAJA::TypedRangeSegment elements_range(0, num_elements); + RAJA::launch::launch_t>( + RAJA::LaunchParams(RAJA::Teams(static_cast(num_elements)), RAJA::Threads(BLOCK_SZ)), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + RAJA::loop::teams_t>(ctx, elements_range, [&](uint32_t e) { + static constexpr bool is_QOI_2 = test::family == Family::QOI; + [[maybe_unused]] auto* output_ptr = + reinterpret_cast(&dK(elements[e], 0, 0)); + + RAJA::RangeSegment x_range(0, nquad); + RAJA::loop::threads_t>( + ctx, x_range, [&derivatives, qf_derivatives, nquad, e](int q) { + detail::suppress_capture_warnings(nquad); + (void)nquad; + if constexpr (is_QOI_2) { + get<0>(derivatives[e](q)) = qf_derivatives[e * nquad + uint32_t(q)]; + } + if constexpr (!is_QOI_2) { + derivatives[e](q) = qf_derivatives[e * nquad + uint32_t(q)]; + } + }); + + ctx.teamSync(); + + RAJA_TEAM_SHARED typename trial_element::template batch_apply_shape_fn_output::type + source_and_flux; + for (int J = 0; J < trial_element::ndof; ++J) { + trial_element::batch_apply_shape_fn(J, derivatives[e], &source_and_flux, rule, ctx); + ctx.teamSync(); + + test_element::integrate(source_and_flux, rule, output_ptr + J, ctx, trial_element::ndof); + ctx.teamSync(); + } + }); + }); + rm.deallocate(derivatives); } -template +template auto evaluation_kernel(signature s, const lambda_type& qf, const double* positions, const double* jacobians, std::shared_ptr> qf_state, std::shared_ptr qf_derivatives, const int* elements, uint32_t num_elements) { - auto trial_elements = trial_elements_tuple(s); - auto test_element = get_test_element(s); + auto trial_element_tuple = trial_elements_tuple(s); + auto test_element = get_test_element(s); return [=](double time, const std::vector& inputs, double* outputs, bool update_state) { - domain_integral::evaluation_kernel_impl( - trial_elements, test_element, time, inputs, outputs, positions, jacobians, qf, (*qf_state)[geom], + domain_integral::evaluation_kernel_impl( + trial_element_tuple, test_element, time, inputs, outputs, positions, jacobians, qf, (*qf_state)[geom], qf_derivatives.get(), elements, num_elements, update_state, s.index_seq); }; } -template +template std::function jacobian_vector_product_kernel( signature, std::shared_ptr qf_derivatives, const int* elements, uint32_t num_elements) { return [=](const double* du, double* dr) { using test_space = typename signature::return_type; using trial_space = typename std::tuple_element::type; - action_of_gradient_kernel(du, dr, qf_derivatives.get(), elements, num_elements); + action_of_gradient_kernel(du, dr, qf_derivatives.get(), elements, + num_elements); }; } -template -std::function)> element_gradient_kernel( +template +std::function)> element_gradient_kernel( signature, std::shared_ptr qf_derivatives, const int* elements, uint32_t num_elements) { - return [=](ExecArrayView K_elem) { + return [=](ExecArrayView K_elem) { using test_space = typename signature::return_type; using trial_space = typename std::tuple_element::type; - element_gradient_kernel(K_elem, qf_derivatives.get(), elements, num_elements); + element_gradient_kernel(K_elem, qf_derivatives.get(), elements, + num_elements); }; } diff --git a/src/serac/numerics/functional/element_restriction.cpp b/src/serac/numerics/functional/element_restriction.cpp index 9068da405..4fe638f35 100644 --- a/src/serac/numerics/functional/element_restriction.cpp +++ b/src/serac/numerics/functional/element_restriction.cpp @@ -1,10 +1,18 @@ +// Copyright (c) 2019-2023, Lawrence Livermore National Security, LLC and +// other 
Serac Project Developers. See the top-level LICENSE file for +// details. +// +// SPDX-License-Identifier: (BSD-3-Clause) + +#include "serac/serac_config.hpp" +#include "serac/infrastructure/accelerator.hpp" #include "serac/numerics/functional/element_restriction.hpp" #include "mfem.hpp" #include "serac/numerics/functional/geometry.hpp" -std::vector > lexicographic_permutations(int p) +std::vector> lexicographic_permutations(int p) { // p == 0 is admissible for L2 spaces, but lexicographic permutations // aren't needed in that corner case @@ -12,7 +20,7 @@ std::vector > lexicographic_permutations(int p) return {}; } - std::vector > output(mfem::Geometry::Type::NUM_GEOMETRIES); + std::vector> output(mfem::Geometry::Type::NUM_GEOMETRIES); { auto P = mfem::H1_SegmentElement(p).GetLexicographicOrdering(); @@ -126,7 +134,7 @@ Array2D face_permutations(mfem::Geometry::Type geom, int p) exit(1); } -std::vector > geom_local_face_dofs(int p) +std::vector> geom_local_face_dofs(int p) { // FullSimplify[InterpolatingPolynomial[{ // {{0, 2}, (p + 1) + p}, @@ -153,7 +161,7 @@ std::vector > geom_local_face_dofs(int p) auto hex_id = [p](int x, int y, int z) { return (p + 1) * ((p + 1) * z + y) + x; }; - std::vector > output(mfem::Geometry::Type::NUM_GEOMETRIES); + std::vector> output(mfem::Geometry::Type::NUM_GEOMETRIES); Array2D tris(3, p + 1); for (int k = 0; k <= p; k++) { @@ -220,8 +228,8 @@ axom::Array GetElementRestriction(const mfem::F mfem::Mesh* mesh = fes->GetMesh(); // note: this assumes that all the elements are the same polynomial order - int p = fes->GetElementOrder(0); - std::vector > lex_perm = lexicographic_permutations(p); + int p = fes->GetElementOrder(0); + std::vector> lex_perm = lexicographic_permutations(p); uint64_t n = 0; @@ -284,10 +292,10 @@ axom::Array GetFaceDofs(const mfem::FiniteEleme mfem::Table* face_to_elem = mesh->GetFaceToElementTable(); // note: this assumes that all the elements are the same polynomial order - int p = fes->GetElementOrder(0); - Array2D face_perm = face_permutations(face_geom, p); - std::vector > local_face_dofs = geom_local_face_dofs(p); - std::vector > lex_perm = lexicographic_permutations(p); + int p = fes->GetElementOrder(0); + Array2D face_perm = face_permutations(face_geom, p); + std::vector> local_face_dofs = geom_local_face_dofs(p); + std::vector> lex_perm = lexicographic_permutations(p); uint64_t n = 0; @@ -440,28 +448,11 @@ void ElementRestriction::GetElementVDofs(int i, std::vector& vdofs) const } } -void ElementRestriction::Gather(const mfem::Vector& L_vector, mfem::Vector& E_vector) const +void ElementRestriction::GetElementVDofs(int i, DoF* vdofs) const { - for (uint64_t i = 0; i < num_elements; i++) { - for (uint64_t c = 0; c < components; c++) { - for (uint64_t j = 0; j < nodes_per_elem; j++) { - uint64_t E_id = (i * components + c) * nodes_per_elem + j; - uint64_t L_id = GetVDof(dof_info(i, j), c).index(); - E_vector[int(E_id)] = L_vector[int(L_id)]; - } - } - } -} - -void ElementRestriction::ScatterAdd(const mfem::Vector& E_vector, mfem::Vector& L_vector) const -{ - for (uint64_t i = 0; i < num_elements; i++) { - for (uint64_t c = 0; c < components; c++) { - for (uint64_t j = 0; j < nodes_per_elem; j++) { - uint64_t E_id = (i * components + c) * nodes_per_elem + j; - uint64_t L_id = GetVDof(dof_info(i, j), c).index(); - L_vector[int(L_id)] += E_vector[int(E_id)]; - } + for (uint64_t c = 0; c < components; c++) { + for (uint64_t j = 0; j < nodes_per_elem; j++) { + vdofs[c * nodes_per_elem + j] = GetVDof(dof_info(i, j), c); } } } @@ 
-521,18 +512,4 @@ mfem::Array BlockElementRestriction::bOffsets() const return offsets; }; -void BlockElementRestriction::Gather(const mfem::Vector& L_vector, mfem::BlockVector& E_block_vector) const -{ - for (auto [geom, restriction] : restrictions) { - restriction.Gather(L_vector, E_block_vector.GetBlock(geom)); - } -} - -void BlockElementRestriction::ScatterAdd(const mfem::BlockVector& E_block_vector, mfem::Vector& L_vector) const -{ - for (auto [geom, restriction] : restrictions) { - restriction.ScatterAdd(E_block_vector.GetBlock(geom), L_vector); - } -} - } // namespace serac diff --git a/src/serac/numerics/functional/element_restriction.hpp b/src/serac/numerics/functional/element_restriction.hpp index 7f12dca59..796311075 100644 --- a/src/serac/numerics/functional/element_restriction.hpp +++ b/src/serac/numerics/functional/element_restriction.hpp @@ -5,6 +5,7 @@ #include "mfem.hpp" #include "axom/core.hpp" #include "geometry.hpp" +#include "serac/infrastructure/accelerator.hpp" inline bool isH1(const mfem::FiniteElementSpace& fes) { @@ -55,28 +56,29 @@ struct DoF { uint64_t bits; /// default ctor - DoF() : bits{} {} + SERAC_HOST_DEVICE DoF() : bits{} {} /// copy ctor - DoF(const DoF& other) : bits{other.bits} {} + SERAC_HOST_DEVICE DoF(const DoF& other) : bits{other.bits} {} /// create a `DoF` from the given index, sign and orientation values - DoF(uint64_t index, uint64_t sign = 0, uint64_t orientation = 0) + SERAC_HOST_DEVICE DoF(uint64_t index, uint64_t sign = 0, uint64_t orientation = 0) : bits((sign & 0x1ULL << sign_shift) + ((orientation & 0x7ULL) << orientation_shift) + index) { } /// copy assignment operator - void operator=(const DoF& other) { bits = other.bits; } + SERAC_HOST_DEVICE void operator=(const DoF& other) { bits = other.bits; } /// get the sign field of this `DoF` - int sign() const { return (bits & sign_mask) ? -1 : 1; } + SERAC_HOST_DEVICE int sign() const { return (bits & sign_mask) ? -1 : 1; } /// get the orientation field of this `DoF` - uint64_t orientation() const { return ((bits & orientation_mask) >> orientation_shift); } + SERAC_HOST_DEVICE uint64_t orientation() const { return ((bits & orientation_mask) >> orientation_shift); } /// get the index field of this `DoF` - uint64_t index() const { return (bits & index_mask); } + + SERAC_HOST_DEVICE uint64_t index() const { return (bits & index_mask); } }; /// a small struct used to enable range-based for loops in `Array2D` @@ -167,14 +169,95 @@ struct ElementRestriction { */ void GetElementVDofs(int i, std::vector& dofs) const; + /** + * @brief Overload for device code. + * + * @param i the index of the element + * @param vdofs (output) the DoFs associated with element `i` + */ + void GetElementVDofs(int i, DoF* vdofs) const; + /// get the dof information for a given node / component + DoF GetVDof(DoF node, uint64_t component) const; + /** + * @brief Static version of GetVDof for device code. + * + * @param node the node to get vdofs for + * @param ordering_ element ordering type to be provided from ElementRestriction struct + * @param component component index to be provided from ElementRestriction struct + * @param num_nodes_ number of nodes in ElementRestriction + * @param components_ number of components to be provided from ElementRestriction struct + */ + SERAC_HOST_DEVICE static DoF GetVDofDevice(DoF node, mfem::Ordering::Type ordering_, uint64_t component, + uint64_t num_nodes_, uint64_t components_) + { + if (ordering_ == mfem::Ordering::Type::byNODES) { + return DoF{component * num_nodes_ + node.index(), (node.sign() == 1) ?
0ull : 1ull, node.orientation()}; + } else { + return DoF{node.index() * components_ + component, (node.sign() == 1) ? 0ull : 1ull, node.orientation()}; + } + } + /// "L->E" in mfem parlance, each element gathers the values that belong to it, and stores them in the "E-vector" - void Gather(const mfem::Vector& L_vector, mfem::Vector& E_vector) const; + template + void Gather(const mfem::Vector& L_vector, mfem::Vector& E_vector) const + { + auto l_ptr = L_vector.Read(); + auto e_ptr = E_vector.ReadWrite(); + + uint64_t d_components = components; + uint64_t d_nodes_per_elem = nodes_per_elem; + uint64_t d_num_nodes = num_nodes; + auto d_ordering = ordering; + constexpr axom::MemorySpace mem_space = detail::execution_to_memory::value; + axom::Array tmp = dof_info; + axom::ArrayView d_dof_info = tmp; + + RAJA::forall::forall_t>( + RAJA::TypedRangeSegment(0, num_elements), [d_components, d_nodes_per_elem, d_num_nodes, d_ordering, + e_ptr, l_ptr, d_dof_info] SERAC_HOST_DEVICE(uint64_t i) { + for (uint64_t c = 0; c < d_components; c++) { + for (uint64_t j = 0; j < d_nodes_per_elem; j++) { + uint64_t E_id = (i * d_components + c) * d_nodes_per_elem + j; + uint64_t L_id = GetVDofDevice(d_dof_info(i, j), d_ordering, c, d_num_nodes, d_components).index(); + e_ptr[int(E_id)] = l_ptr[int(L_id)]; + } + } + }); + } /// "E->L" in mfem parlance, each element scatter-adds its local vector into the appropriate place in the "L-vector" - void ScatterAdd(const mfem::Vector& E_vector, mfem::Vector& L_vector) const; + template + void ScatterAdd(const mfem::Vector& E_vector, mfem::Vector& L_vector) const + { + auto l_ptr = L_vector.ReadWrite(true); + auto e_ptr = E_vector.Read(true); + + uint64_t d_components = components; + uint64_t d_nodes_per_elem = nodes_per_elem; + uint64_t d_num_nodes = num_nodes; + uint64_t d_num_elements = num_elements; + auto d_ordering = ordering; + constexpr axom::MemorySpace mem_space = detail::execution_to_memory::value; + axom::Array tmp = dof_info; + axom::ArrayView d_dof_info = tmp; + + RAJA::forall::forall_t>( + RAJA::TypedRangeSegment(0, 1), [d_components, d_nodes_per_elem, d_num_elements, d_num_nodes, + d_ordering, e_ptr, l_ptr, d_dof_info] SERAC_HOST_DEVICE(uint64_t) { + for (uint64_t i = 0; i < d_num_elements; ++i) { + for (uint64_t c = 0; c < d_components; c++) { + for (uint64_t j = 0; j < d_nodes_per_elem; j++) { + uint64_t E_id = (i * d_components + c) * d_nodes_per_elem + j; + uint64_t L_id = GetVDofDevice(d_dof_info(i, j), d_ordering, c, d_num_nodes, d_components).index(); + l_ptr[int(L_id)] += e_ptr[int(E_id)]; + } + } + } + }); + } /// the size of the "E-vector" uint64_t esize; @@ -206,6 +289,7 @@ struct ElementRestriction { * Instead of doing the "E->L" (gather) and "L->E" (scatter) operations for only one element geometry, this * class does them with block "E-vectors", where each element geometry is a separate block. 
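The templated Gather and ScatterAdd above replace the host-only loops deleted from element_restriction.cpp, so the execution space is now chosen at the call site. A minimal usage sketch, not part of the patch; it assumes the template parameter is a serac::ExecutionSpace and that `fes` is a nodal H1 space on a hex mesh:

    #include "mfem.hpp"
    #include "serac/numerics/functional/element_restriction.hpp"

    void gather_example(mfem::FiniteElementSpace* fes)
    {
      serac::ElementRestriction restriction(fes, mfem::Geometry::CUBE);

      mfem::Vector L(fes->GetVSize());           // "L-vector": one entry per local vdof
      mfem::Vector E(int(restriction.ESize()));  // "E-vector": per-element copies of those vdofs
      L = 0.0;

      restriction.Gather<serac::ExecutionSpace::CPU>(L, E);      // sequential RAJA::forall
      restriction.ScatterAdd<serac::ExecutionSpace::CPU>(E, L);  // transpose of the gather

    #if defined(__CUDACC__)
      restriction.Gather<serac::ExecutionSpace::GPU>(L, E);      // same call, RAJA::cuda_exec policy
    #endif
    }
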
*/ +// template struct BlockElementRestriction { /// default ctor leaves this object uninitialized BlockElementRestriction() {} @@ -226,10 +310,22 @@ struct BlockElementRestriction { mfem::Array bOffsets() const; /// "L->E" in mfem parlance, each element gathers the values that belong to it, and stores them in the "E-vector" - void Gather(const mfem::Vector& L_vector, mfem::BlockVector& E_block_vector) const; + template + void Gather(const mfem::Vector& L_vector, mfem::BlockVector& E_block_vector) const + { + for (auto [geom, restriction] : restrictions) { + restriction.template Gather(L_vector, E_block_vector.GetBlock(geom)); + } + } /// "E->L" in mfem parlance, each element scatter-adds its local vector into the appropriate place in the "L-vector" - void ScatterAdd(const mfem::BlockVector& E_block_vector, mfem::Vector& L_vector) const; + template + void ScatterAdd(const mfem::BlockVector& E_block_vector, mfem::Vector& L_vector) const + { + for (auto [geom, restriction] : restrictions) { + restriction.template ScatterAdd(E_block_vector.GetBlock(geom), L_vector); + } + } /// the individual ElementRestriction operators for each element geometry std::map restrictions; diff --git a/src/serac/numerics/functional/finite_element.hpp b/src/serac/numerics/functional/finite_element.hpp index 12b87f684..d0d4e88fd 100644 --- a/src/serac/numerics/functional/finite_element.hpp +++ b/src/serac/numerics/functional/finite_element.hpp @@ -12,10 +12,13 @@ */ #pragma once +#include "serac/infrastructure/accelerator.hpp" +#include "serac/serac_config.hpp" #include "tuple.hpp" #include "tensor.hpp" #include "geometry.hpp" #include "polynomials.hpp" +#include "RAJA/RAJA.hpp" namespace serac { @@ -40,10 +43,6 @@ struct TensorProductQuadratureRule { } }; -template -struct CompileTimeValue { -}; - /** * @brief this struct is used to look up mfem's memory layout of * the quadrature point jacobian matrices @@ -241,19 +240,28 @@ struct QOI { * @tparam q how many values need to be transformed * @tparam dim the spatial dimension * @param qf_input the values to be transformed from parent to physical space - * @param jacobians the jacobians of the isoparametric map from parent to physical space of each quadrature point + * @param jacobians array of the jacobians of the isoparametric map from parent to physical space of each quadrature + * point + * @param block_idx index into the array of jacobians + * @param ctx the RAJA launch context used to synchronize threads and required by the RAJA API. 
*/ -template -SERAC_HOST_DEVICE void parent_to_physical(tensor& qf_input, const tensor& jacobians) +template +SERAC_HOST_DEVICE void parent_to_physical(tensor& qf_input, const tensor* jacobians, + uint32_t block_idx, RAJA::LaunchContext ctx) { [[maybe_unused]] constexpr int VALUE = 0; [[maybe_unused]] constexpr int DERIVATIVE = 1; - for (int k = 0; k < q; k++) { + RAJA::RangeSegment k_range(0, BLOCK_SZ); + + RAJA::loop::threads_t>(ctx, k_range, [&](int k) { + if (k >= q) { + return; + } tensor J; for (int row = 0; row < dim; row++) { for (int col = 0; col < dim; col++) { - J[row][col] = jacobians(col, row, k); + J[row][col] = jacobians[block_idx](col, row, k); } } @@ -269,7 +277,7 @@ SERAC_HOST_DEVICE void parent_to_physical(tensor& qf_input, const tensor(qf_input[k]) = dot(get(qf_input[k]), transpose(J)); } } - } + }); } /** @@ -282,19 +290,27 @@ SERAC_HOST_DEVICE void parent_to_physical(tensor& qf_input, const tensor -SERAC_HOST_DEVICE void physical_to_parent(tensor& qf_output, const tensor& jacobians) +template +SERAC_HOST_DEVICE void physical_to_parent(tensor& qf_output, const tensor* jacobians, + uint32_t block_idx, RAJA::LaunchContext ctx) { [[maybe_unused]] constexpr int SOURCE = 0; [[maybe_unused]] constexpr int FLUX = 1; - for (int k = 0; k < q; k++) { + RAJA::RangeSegment k_range(0, BLOCK_SZ); + RAJA::loop::threads_t>(ctx, k_range, [&](int k) { + if (k >= q) { + return; + } tensor J_T; for (int row = 0; row < dim; row++) { for (int col = 0; col < dim; col++) { - J_T[row][col] = jacobians(row, col, k); + J_T[row][col] = jacobians[block_idx](row, col, k); } } @@ -318,7 +334,7 @@ SERAC_HOST_DEVICE void physical_to_parent(tensor& qf_output, const tensor< if constexpr (f == Family::QOI) { qf_output[k] = qf_output[k] * dv; } - } + }); } /** @@ -346,7 +362,7 @@ SERAC_HOST_DEVICE void physical_to_parent(tensor& qf_output, const tensor< * }; * */ -template +template struct finite_element; #include "detail/segment_H1.inl" diff --git a/src/serac/numerics/functional/function_signature.hpp b/src/serac/numerics/functional/function_signature.hpp index e5ce5d9bb..628090f07 100644 --- a/src/serac/numerics/functional/function_signature.hpp +++ b/src/serac/numerics/functional/function_signature.hpp @@ -1,5 +1,14 @@ +// Copyright (c) 2019-2023, Lawrence Livermore National Security, LLC and +// other Serac Project Developers. See the top-level LICENSE file for +// details. +// +// SPDX-License-Identifier: (BSD-3-Clause) + #pragma once -#include +#include "camp/camp.hpp" + +#include "serac/infrastructure/accelerator.hpp" +#include "serac/serac_config.hpp" #include "serac/numerics/functional/finite_element.hpp" template @@ -32,10 +41,10 @@ struct FunctionSignature { * template parameters in evaluation_kernel_impl. See section 14.7.3 of the * CUDA programming guide, item 9. */ -template +template auto trial_elements_tuple(FunctionSignature) { - return serac::tuple...>{}; + return serac::tuple...>{}; } /** @@ -45,8 +54,8 @@ auto trial_elements_tuple(FunctionSignature) * See the comments for trial_elements_tuple to understand why this is needed in * domain_integral_kernels::evaluation_kernel and boundary_integral_kernels::evaluation_kernel. 
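The parent_to_physical and physical_to_parent rewrites above use the same launch structure as the kernels: one RAJA team per element, BLOCK_SZ threads per team, and an early return once the thread index passes the number of quadrature points. A stripped-down sketch of that pattern (the element and quadrature counts are made up; the policy aliases and BLOCK_SZ come from this change):

    #include "RAJA/RAJA.hpp"
    #include "serac/infrastructure/accelerator.hpp"
    #include "serac/numerics/functional/tensor.hpp"  // for BLOCK_SZ

    void launch_pattern_sketch()
    {
      constexpr int num_elements = 64;
      constexpr int nquad        = 27;  // e.g. a 3x3x3 rule on hexes

      using policy = serac::EvaluationSpacePolicy<serac::ExecutionSpace::CPU>;

      RAJA::launch<policy::launch_t>(
          RAJA::LaunchParams(RAJA::Teams(num_elements), RAJA::Threads(BLOCK_SZ)),
          [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) {
            RAJA::loop<policy::teams_t>(ctx, RAJA::RangeSegment(0, num_elements), [&](int /*e*/) {
              RAJA::loop<policy::threads_t>(ctx, RAJA::RangeSegment(0, BLOCK_SZ), [&](int q) {
                if (q >= nquad) {
                  return;  // only the first nquad of the BLOCK_SZ threads do work
                }
                // per-quadrature-point work for this element goes here
              });
              ctx.teamSync();
            });
          });
    }
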
*/ -template +template auto get_test_element(FunctionSignature) { - return serac::finite_element{}; + return serac::finite_element{}; } diff --git a/src/serac/numerics/functional/functional.hpp b/src/serac/numerics/functional/functional.hpp index 402cab9b3..cb713fa45 100644 --- a/src/serac/numerics/functional/functional.hpp +++ b/src/serac/numerics/functional/functional.hpp @@ -14,6 +14,7 @@ #include "mfem.hpp" +#include "serac/infrastructure/accelerator.hpp" #include "serac/serac_config.hpp" #include "serac/infrastructure/logger.hpp" #include "serac/numerics/functional/tensor.hpp" @@ -27,7 +28,10 @@ #include "serac/numerics/functional/domain.hpp" +#include +#include #include +#include #include namespace serac { @@ -142,7 +146,7 @@ generateParFiniteElementSpace(mfem::ParMesh* mesh) } /// @cond -template +template class Functional; /// @endcond @@ -295,8 +299,8 @@ class Functional { check_for_missing_nodal_gridfunc(domain); using signature = test(decltype(serac::type(trial_spaces))...); - integrals_.push_back( - MakeDomainIntegral(EntireDomain(domain), integrand, qdata, std::vector{args...})); + integrals_.push_back(MakeDomainIntegral(EntireDomain(domain), integrand, qdata, + std::vector{args...})); } /// @overload @@ -312,8 +316,8 @@ class Functional { check_for_missing_nodal_gridfunc(domain.mesh_); using signature = test(decltype(serac::type(trial_spaces))...); - integrals_.push_back( - MakeDomainIntegral(domain, integrand, qdata, std::vector{args...})); + integrals_.push_back(MakeDomainIntegral(std::move(domain), integrand, qdata, + std::vector{args...})); } /** @@ -334,8 +338,8 @@ class Functional { check_for_missing_nodal_gridfunc(domain); using signature = test(decltype(serac::type(trial_spaces))...); - integrals_.push_back( - MakeBoundaryIntegral(EntireBoundary(domain), integrand, std::vector{args...})); + integrals_.push_back(MakeBoundaryIntegral(EntireBoundary(domain), integrand, + std::vector{args...})); } /// @overload @@ -350,7 +354,8 @@ class Functional { check_for_missing_nodal_gridfunc(domain.mesh_); using signature = test(decltype(serac::type(trial_spaces))...); - integrals_.push_back(MakeBoundaryIntegral(domain, integrand, std::vector{args...})); + integrals_.push_back( + MakeBoundaryIntegral(std::move(domain), integrand, std::vector{args...})); } /** @@ -417,14 +422,14 @@ class Functional { auto type = integral.domain_.type_; if (!already_computed[type]) { - G_trial_[type][which].Gather(input_L_[which], input_E_[type][which]); + (G_trial_[type][which]).template Gather(input_L_[which], input_E_[type][which]); already_computed[type] = true; } integral.GradientMult(input_E_[type][which], output_E_[type], which); // scatter-add to compute residuals on the local processor - G_test_[type].ScatterAdd(output_E_[type], output_L_); + G_test_[type].template ScatterAdd(output_E_[type], output_L_); } // scatter-add to compute global residuals @@ -464,7 +469,7 @@ class Functional { for (auto i : integral.active_trial_spaces_) { if (!already_computed[type][i]) { - G_trial_[type][i].Gather(input_L_[i], input_E_[type][i]); + G_trial_[type][i].template Gather(input_L_[i], input_E_[type][i]); already_computed[type][i] = true; } } @@ -472,7 +477,7 @@ class Functional { integral.Mult(t, input_E_[type], output_E_[type], wrt, update_qdata_); // scatter-add to compute residuals on the local processor - G_test_[type].ScatterAdd(output_E_[type], output_L_); + G_test_[type].template ScatterAdd(output_E_[type], output_L_); } // scatter-add to compute global residuals @@ -616,7 +621,7 @@ class 
Functional { for (auto [geom, elem_matrices] : K_elem) { std::vector test_vdofs(test_restrictions[geom].nodes_per_elem * test_restrictions[geom].components); std::vector trial_vdofs(trial_restrictions[geom].nodes_per_elem * trial_restrictions[geom].components); - + axom::Array elem_matrices_host = elem_matrices; for (axom::IndexType e = 0; e < elem_matrices.shape()[0]; e++) { test_restrictions[geom].GetElementVDofs(e, test_vdofs); trial_restrictions[geom].GetElementVDofs(e, trial_vdofs); @@ -634,8 +639,7 @@ class Functional { // // This is kind of confusing, and will be fixed in a future refactor // of the element gradient kernel implementation - [[maybe_unused]] auto nz = lookup_tables(row, col); - values[lookup_tables(row, col)] += sign * elem_matrices(e, i, j); + values[lookup_tables(row, col)] += sign * elem_matrices_host(e, i, j); } } } @@ -726,7 +730,7 @@ class Functional { mutable std::vector input_E_[Domain::num_types]; - std::vector integrals_; + std::vector> integrals_; mutable mfem::BlockVector output_E_[Domain::num_types]; diff --git a/src/serac/numerics/functional/functional_qoi.inl b/src/serac/numerics/functional/functional_qoi.inl index 2c0cea577..79a46f1dd 100644 --- a/src/serac/numerics/functional/functional_qoi.inl +++ b/src/serac/numerics/functional/functional_qoi.inl @@ -168,8 +168,8 @@ public: check_for_missing_nodal_gridfunc(mesh); using signature = test(decltype(serac::type(trial_spaces))...); - integrals_.push_back( - MakeDomainIntegral(EntireDomain(mesh), integrand, qdata, std::vector{args...})); + integrals_.push_back(MakeDomainIntegral(EntireDomain(mesh), integrand, qdata, + std::vector{args...})); } /// @overload @@ -186,7 +186,7 @@ public: using signature = test(decltype(serac::type(trial_spaces))...); integrals_.push_back( - MakeDomainIntegral(domain, integrand, qdata, std::vector{args...})); + MakeDomainIntegral(domain, integrand, qdata, std::vector{args...})); } /** @@ -210,7 +210,7 @@ public: using signature = test(decltype(serac::type(trial_spaces))...); integrals_.push_back( - MakeBoundaryIntegral(EntireBoundary(mesh), integrand, std::vector{args...})); + MakeBoundaryIntegral(EntireBoundary(mesh), integrand, std::vector{args...})); } /// @overload @@ -225,7 +225,8 @@ public: check_for_missing_nodal_gridfunc(domain.mesh_); using signature = test(decltype(serac::type(trial_spaces))...); - integrals_.push_back(MakeBoundaryIntegral(domain, integrand, std::vector{args...})); + integrals_.push_back( + MakeBoundaryIntegral(domain, integrand, std::vector{args...})); } /** @@ -293,7 +294,7 @@ public: auto type = integral.domain_.type_; if (!already_computed[type]) { - G_trial_[type][which].Gather(input_L_[which], input_E_[type][which]); + G_trial_[type][which].template Gather(input_L_[which], input_E_[type][which]); already_computed[type] = true; } @@ -340,7 +341,7 @@ public: for (auto i : integral.active_trial_spaces_) { if (!already_computed[type][i]) { - G_trial_[type][i].Gather(input_L_[i], input_E_[type][i]); + G_trial_[type][i].template Gather(input_L_[i], input_E_[type][i]); already_computed[type][i] = true; } } @@ -507,7 +508,7 @@ private: mutable std::vector input_E_[Domain::num_types]; - std::vector integrals_; + std::vector> integrals_; mutable mfem::BlockVector output_E_[Domain::num_types]; diff --git a/src/serac/numerics/functional/geometric_factors.cpp b/src/serac/numerics/functional/geometric_factors.cpp deleted file mode 100644 index 7e8897fa5..000000000 --- a/src/serac/numerics/functional/geometric_factors.cpp +++ /dev/null @@ -1,222 +0,0 @@ 
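The geometric_factors.cpp deletion that follows does not remove functionality: because GeometricFactors and compute_geometric_factors now carry an execution-space template parameter, their definitions must be visible wherever they are instantiated, so the same code reappears in geometric_factors.hpp later in this diff. The underlying C++ constraint, in a tiny illustration (ExampleFactors is hypothetical):

    template <serac::ExecutionSpace exec>
    struct ExampleFactors {
      // member definitions of a class template live in the header so that both
      // ExampleFactors<ExecutionSpace::CPU> and ExampleFactors<ExecutionSpace::GPU>
      // can be instantiated from any translation unit that needs them
      void compute() { /* ... */ }
    };
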
-#include "serac/numerics/functional/geometric_factors.hpp" -#include "serac/numerics/functional/finite_element.hpp" - -namespace serac { - -/** - * @brief a kernel to compute the positions and jacobians at each quadrature point (mfem calls this "geometric factors") - * @tparam Q a parameter controlling the number of quadrature points an element - * @tparam function_space the polynomial order and kind of function space used to interpolate - * @tparam geom the element geometry - * @param positions_q (output) the positions for each quadrature point - * @param jacobians_q (output) the jacobians for each quadrature point - * @param positions_e (input) the "e-vector" of position data - * @param elements (input) the list of element indices that are part of this domain - */ -template -void compute_geometric_factors(mfem::Vector& positions_q, mfem::Vector& jacobians_q, const mfem::Vector& positions_e, - const std::vector& elements) -{ - static constexpr TensorProductQuadratureRule rule{}; - - constexpr int spatial_dim = function_space::components; - constexpr int geometry_dim = dimension_of(geom); - constexpr int qpts_per_elem = num_quadrature_points(geom, Q); - - using element_type = finite_element; - using position_type = tensor; - using jacobian_type = tensor; - - auto X_q = reinterpret_cast(positions_q.ReadWrite()); - auto J_q = reinterpret_cast(jacobians_q.ReadWrite()); - auto X = reinterpret_cast(positions_e.Read()); - - std::size_t num_elements = elements.size(); - - // for each element in the domain - for (uint32_t e = 0; e < num_elements; e++) { - // load the positions for the nodes in this element - auto X_e = X[elements[e]]; - - // calculate the values and derivatives (w.r.t. xi) of X at each quadrature point - auto quadrature_values = element_type::interpolate(X_e, rule); - - // mfem wants to store this data in a different layout, so we have to transpose it - for (int q = 0; q < qpts_per_elem; q++) { - auto [value, gradient] = quadrature_values[q]; - for (int i = 0; i < spatial_dim; i++) { - X_q[e](i, q) = value[i]; - if constexpr (std::is_same_v) { - J_q[e](0, i, q) = gradient[i]; - } - if constexpr (!std::is_same_v) { - for (int j = 0; j < geometry_dim; j++) { - J_q[e](j, i, q) = gradient(i, j); - } - } - } - } - } -} - -GeometricFactors::GeometricFactors(const Domain& d, int q, mfem::Geometry::Type g) -{ - auto* nodes = d.mesh_.GetNodes(); - auto* fes = nodes->FESpace(); - - auto restriction = serac::ElementRestriction(fes, g); - mfem::Vector X_e(int(restriction.ESize())); - restriction.Gather(*nodes, X_e); - - // assumes all elements are the same order - int p = fes->GetElementOrder(0); - - int spatial_dim = d.mesh_.SpaceDimension(); - int geometry_dim = dimension_of(g); - int qpts_per_elem = num_quadrature_points(g, q); - - if (g == mfem::Geometry::TRIANGLE) elements = d.tri_ids_; - if (g == mfem::Geometry::SQUARE) elements = d.quad_ids_; - if (g == mfem::Geometry::TETRAHEDRON) elements = d.tet_ids_; - if (g == mfem::Geometry::CUBE) elements = d.hex_ids_; - - num_elements = elements.size(); - - X = mfem::Vector(int(num_elements) * qpts_per_elem * spatial_dim); - J = mfem::Vector(int(num_elements) * qpts_per_elem * spatial_dim * geometry_dim); - -#define DISPATCH_KERNEL(GEOM, P, Q) \ - if (g == mfem::Geometry::GEOM && p == P && q == Q) { \ - compute_geometric_factors >(X, J, X_e, \ - elements); \ - return; \ - } - - DISPATCH_KERNEL(TRIANGLE, 1, 1); - DISPATCH_KERNEL(TRIANGLE, 1, 2); - DISPATCH_KERNEL(TRIANGLE, 1, 3); - DISPATCH_KERNEL(TRIANGLE, 1, 4); - - DISPATCH_KERNEL(SQUARE, 1, 
1); - DISPATCH_KERNEL(SQUARE, 1, 2); - DISPATCH_KERNEL(SQUARE, 1, 3); - DISPATCH_KERNEL(SQUARE, 1, 4); - - DISPATCH_KERNEL(SQUARE, 2, 1); - DISPATCH_KERNEL(SQUARE, 2, 2); - DISPATCH_KERNEL(SQUARE, 2, 3); - DISPATCH_KERNEL(SQUARE, 2, 4); - - DISPATCH_KERNEL(SQUARE, 3, 1); - DISPATCH_KERNEL(SQUARE, 3, 2); - DISPATCH_KERNEL(SQUARE, 3, 3); - DISPATCH_KERNEL(SQUARE, 3, 4); - - DISPATCH_KERNEL(TETRAHEDRON, 1, 1); - DISPATCH_KERNEL(TETRAHEDRON, 1, 2); - DISPATCH_KERNEL(TETRAHEDRON, 1, 3); - DISPATCH_KERNEL(TETRAHEDRON, 1, 4); - - DISPATCH_KERNEL(CUBE, 1, 1); - DISPATCH_KERNEL(CUBE, 1, 2); - DISPATCH_KERNEL(CUBE, 1, 3); - DISPATCH_KERNEL(CUBE, 1, 4); - - DISPATCH_KERNEL(CUBE, 2, 1); - DISPATCH_KERNEL(CUBE, 2, 2); - DISPATCH_KERNEL(CUBE, 2, 3); - DISPATCH_KERNEL(CUBE, 2, 4); - - DISPATCH_KERNEL(CUBE, 3, 1); - DISPATCH_KERNEL(CUBE, 3, 2); - DISPATCH_KERNEL(CUBE, 3, 3); - DISPATCH_KERNEL(CUBE, 3, 4); - -#undef DISPATCH_KERNEL - - std::cout << "should never be reached " << std::endl; -} - -GeometricFactors::GeometricFactors(const Domain& d, int q, mfem::Geometry::Type g, FaceType type) -{ - auto* nodes = d.mesh_.GetNodes(); - auto* fes = nodes->FESpace(); - - auto restriction = serac::ElementRestriction(fes, g, type); - mfem::Vector X_e(int(restriction.ESize())); - restriction.Gather(*nodes, X_e); - - // assumes all elements are the same order - int p = fes->GetElementOrder(0); - - int spatial_dim = d.mesh_.SpaceDimension(); - int geometry_dim = dimension_of(g); - int qpts_per_elem = num_quadrature_points(g, q); - - // NB: we only want the number of elements with the specified - // geometry, which is not the same as mesh->GetNE() in general - elements = d.get(g); - - num_elements = std::size_t(elements.size()); - - X = mfem::Vector(int(num_elements) * qpts_per_elem * spatial_dim); - J = mfem::Vector(int(num_elements) * qpts_per_elem * spatial_dim * geometry_dim); - -#define DISPATCH_KERNEL(GEOM, P, Q) \ - if (g == mfem::Geometry::GEOM && p == P && q == Q) { \ - compute_geometric_factors >(X, J, X_e, \ - elements); \ - return; \ - } - - DISPATCH_KERNEL(SEGMENT, 1, 1); - DISPATCH_KERNEL(SEGMENT, 1, 2); - DISPATCH_KERNEL(SEGMENT, 1, 3); - DISPATCH_KERNEL(SEGMENT, 1, 4); - - DISPATCH_KERNEL(SEGMENT, 2, 1); - DISPATCH_KERNEL(SEGMENT, 2, 2); - DISPATCH_KERNEL(SEGMENT, 2, 3); - DISPATCH_KERNEL(SEGMENT, 2, 4); - - DISPATCH_KERNEL(SEGMENT, 3, 1); - DISPATCH_KERNEL(SEGMENT, 3, 2); - DISPATCH_KERNEL(SEGMENT, 3, 3); - DISPATCH_KERNEL(SEGMENT, 3, 4); - - DISPATCH_KERNEL(TRIANGLE, 1, 1); - DISPATCH_KERNEL(TRIANGLE, 1, 2); - DISPATCH_KERNEL(TRIANGLE, 1, 3); - DISPATCH_KERNEL(TRIANGLE, 1, 4); - - DISPATCH_KERNEL(TRIANGLE, 2, 1); - DISPATCH_KERNEL(TRIANGLE, 2, 2); - DISPATCH_KERNEL(TRIANGLE, 2, 3); - DISPATCH_KERNEL(TRIANGLE, 2, 4); - - DISPATCH_KERNEL(TRIANGLE, 3, 1); - DISPATCH_KERNEL(TRIANGLE, 3, 2); - DISPATCH_KERNEL(TRIANGLE, 3, 3); - DISPATCH_KERNEL(TRIANGLE, 3, 4); - - DISPATCH_KERNEL(SQUARE, 1, 1); - DISPATCH_KERNEL(SQUARE, 1, 2); - DISPATCH_KERNEL(SQUARE, 1, 3); - DISPATCH_KERNEL(SQUARE, 1, 4); - - DISPATCH_KERNEL(SQUARE, 2, 1); - DISPATCH_KERNEL(SQUARE, 2, 2); - DISPATCH_KERNEL(SQUARE, 2, 3); - DISPATCH_KERNEL(SQUARE, 2, 4); - - DISPATCH_KERNEL(SQUARE, 3, 1); - DISPATCH_KERNEL(SQUARE, 3, 2); - DISPATCH_KERNEL(SQUARE, 3, 3); - DISPATCH_KERNEL(SQUARE, 3, 4); - -#undef DISPATCH_KERNEL - - std::cout << "should never be reached" << std::endl; -} - -} // namespace serac diff --git a/src/serac/numerics/functional/geometric_factors.hpp b/src/serac/numerics/functional/geometric_factors.hpp index 5d33ab256..18c0704e7 
100644 --- a/src/serac/numerics/functional/geometric_factors.hpp +++ b/src/serac/numerics/functional/geometric_factors.hpp @@ -1,5 +1,6 @@ #pragma once +#include "serac/infrastructure/accelerator.hpp" #include "serac/numerics/functional/element_restriction.hpp" // for FaceType #include "serac/numerics/functional/finite_element.hpp" // for Geometry #include "serac/numerics/functional/domain.hpp" @@ -7,12 +8,73 @@ #include "mfem.hpp" namespace serac { +/** + * @brief a kernel to compute the positions and jacobians at each quadrature point (mfem calls this "geometric factors") + * @tparam Q a parameter controlling the number of quadrature points an element + * @tparam function_space the polynomial order and kind of function space used to interpolate + * @tparam geom the element geometry + * @param positions_q (output) the positions for each quadrature point + * @param jacobians_q (output) the jacobians for each quadrature point + * @param positions_e (input) the "e-vector" of position data + * @param elements (input) the list of element indices that are part of this domain + */ +template +void compute_geometric_factors(mfem::Vector& positions_q, mfem::Vector& jacobians_q, const mfem::Vector& positions_e, + const std::vector& elements) +{ + constexpr TensorProductQuadratureRule rule{}; + + constexpr int spatial_dim = function_space::components; + constexpr int geometry_dim = dimension_of(geom); + constexpr int qpts_per_elem = num_quadrature_points(geom, Q); + + using element_type = finite_element; + using position_type = tensor; + using jacobian_type = tensor; + + auto X_q = reinterpret_cast(positions_q.HostReadWrite()); + auto J_q = reinterpret_cast(jacobians_q.HostReadWrite()); + auto X = reinterpret_cast(positions_e.HostRead()); + + std::size_t num_elements = elements.size(); + + // for each element in the domain + using qf_inputs_type = decltype(element_type{}.template interpolate_output_helper()); + auto& rm = umpire::ResourceManager::getInstance(); + auto host_allocator = rm.getAllocator("HOST"); + qf_inputs_type* quadrature_values = static_cast(host_allocator.allocate(sizeof(qf_inputs_type))); + + for (uint32_t e = 0; e < num_elements; ++e) { + // load the positions for the nodes in this element + auto X_e = X[elements[e]]; + // calculate the values and derivatives (w.r.t. xi) of X at each quadrature point + element_type::interpolate(X_e, rule, quadrature_values, RAJA::LaunchContext{}); + + // mfem wants to store this data in a different layout, so we have to transpose it + for (int q = 0; q < qpts_per_elem; q++) { + auto [value, gradient] = (*quadrature_values)[q]; + for (int i = 0; i < spatial_dim; i++) { + X_q[e](i, q) = value[i]; + if constexpr (std::is_same_v) { + J_q[e](0, i, q) = gradient[i]; + } + if constexpr (!std::is_same_v) { + for (int j = 0; j < geometry_dim; j++) { + J_q[e](j, i, q) = gradient(i, j); + } + } + } + } + } + host_allocator.deallocate(quadrature_values); +} /** * @brief a class that computes and stores positions and jacobians at each quadrature point * @note analogous to mfem::GeometricFactors, except that it implements the position/jacobian * calculations on boundary elements and on simplex elements */ +template struct GeometricFactors { /// @brief default ctor, leaving this object uninitialized GeometricFactors(){}; @@ -21,22 +83,178 @@ struct GeometricFactors { * @brief calculate positions and jacobians for quadrature points belonging to * elements with the specified geometry, belonging to the provided mesh. 
* - * @param domain the domain of integration + * @param d the domain of integration * @param q a parameter controlling the number of quadrature points per element - * @param elem_geom which kind of element geometry to select + * @param g which kind of element geometry to select */ - GeometricFactors(const Domain& domain, int q, mfem::Geometry::Type elem_geom); + GeometricFactors(const Domain& d, int q, mfem::Geometry::Type g) + { + auto* nodes = d.mesh_.GetNodes(); + auto* fes = nodes->FESpace(); + + auto restriction = serac::ElementRestriction(fes, g); + mfem::Vector X_e(int(restriction.ESize())); + restriction.template Gather(*nodes, X_e); + + // assumes all elements are the same order + int p = fes->GetElementOrder(0); + + int spatial_dim = d.mesh_.SpaceDimension(); + int geometry_dim = dimension_of(g); + int qpts_per_elem = num_quadrature_points(g, q); + + if (g == mfem::Geometry::TRIANGLE) elements = d.tri_ids_; + if (g == mfem::Geometry::SQUARE) elements = d.quad_ids_; + if (g == mfem::Geometry::TETRAHEDRON) elements = d.tet_ids_; + if (g == mfem::Geometry::CUBE) elements = d.hex_ids_; + + num_elements = elements.size(); + + X = mfem::Vector(int(num_elements) * qpts_per_elem * spatial_dim); + J = mfem::Vector(int(num_elements) * qpts_per_elem * spatial_dim * geometry_dim); + +#define DISPATCH_KERNEL(GEOM, P, Q) \ + if (g == mfem::Geometry::GEOM && p == P && q == Q) { \ + compute_geometric_factors>(X, J, X_e, elements); \ + return; \ + } + + DISPATCH_KERNEL(TRIANGLE, 1, 1); + DISPATCH_KERNEL(TRIANGLE, 1, 2); + DISPATCH_KERNEL(TRIANGLE, 1, 3); + DISPATCH_KERNEL(TRIANGLE, 1, 4); + + DISPATCH_KERNEL(SQUARE, 1, 1); + DISPATCH_KERNEL(SQUARE, 1, 2); + DISPATCH_KERNEL(SQUARE, 1, 3); + DISPATCH_KERNEL(SQUARE, 1, 4); + + DISPATCH_KERNEL(SQUARE, 2, 1); + DISPATCH_KERNEL(SQUARE, 2, 2); + DISPATCH_KERNEL(SQUARE, 2, 3); + DISPATCH_KERNEL(SQUARE, 2, 4); + + DISPATCH_KERNEL(SQUARE, 3, 1); + DISPATCH_KERNEL(SQUARE, 3, 2); + DISPATCH_KERNEL(SQUARE, 3, 3); + DISPATCH_KERNEL(SQUARE, 3, 4); + + DISPATCH_KERNEL(TETRAHEDRON, 1, 1); + DISPATCH_KERNEL(TETRAHEDRON, 1, 2); + DISPATCH_KERNEL(TETRAHEDRON, 1, 3); + DISPATCH_KERNEL(TETRAHEDRON, 1, 4); + + DISPATCH_KERNEL(CUBE, 1, 1); + DISPATCH_KERNEL(CUBE, 1, 2); + DISPATCH_KERNEL(CUBE, 1, 3); + DISPATCH_KERNEL(CUBE, 1, 4); + + DISPATCH_KERNEL(CUBE, 2, 1); + DISPATCH_KERNEL(CUBE, 2, 2); + DISPATCH_KERNEL(CUBE, 2, 3); + DISPATCH_KERNEL(CUBE, 2, 4); + + DISPATCH_KERNEL(CUBE, 3, 1); + DISPATCH_KERNEL(CUBE, 3, 2); + DISPATCH_KERNEL(CUBE, 3, 3); + DISPATCH_KERNEL(CUBE, 3, 4); + +#undef DISPATCH_KERNEL + + std::cout << "should never be reached " << std::endl; + } /** * @brief calculate positions and jacobians for quadrature points belonging to * boundary elements with the specified geometry, belonging to the provided mesh. 
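The domain constructor above can be exercised directly; a minimal sketch, assuming CPU execution and a nodal hex mesh (all names below are illustrative, not part of the patch):

    #include <iostream>
    #include "mfem.hpp"
    #include "serac/numerics/functional/geometric_factors.hpp"

    void geometric_factors_example(mfem::ParMesh& pmesh)
    {
      pmesh.EnsureNodes();  // the constructor reads the mesh's nodal grid function

      serac::Domain whole = serac::EntireDomain(pmesh);

      // positions X and jacobians J for a q = 2 rule on every hex in the domain
      serac::GeometricFactors<serac::ExecutionSpace::CPU> gf(whole, 2, mfem::Geometry::CUBE);

      // X holds num_elements * qpts_per_elem * spatial_dim entries,
      // J holds num_elements * qpts_per_elem * spatial_dim * geometry_dim entries
      std::cout << "hex elements in domain: " << gf.num_elements << std::endl;
    }
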
* - * @param domain the domain of integration + * @param d the domain of integration * @param q a parameter controlling the number of quadrature points per element - * @param elem_geom which kind of element geometry to select + * @param g which kind of element geometry to select * @param type whether or not the faces are on the boundary (supported) or interior (unsupported) */ - GeometricFactors(const Domain& domain, int q, mfem::Geometry::Type elem_geom, FaceType type); + GeometricFactors(const Domain& d, int q, mfem::Geometry::Type g, FaceType type) + { + auto* nodes = d.mesh_.GetNodes(); + auto* fes = nodes->FESpace(); + + auto restriction = serac::ElementRestriction(fes, g, type); + mfem::Vector X_e(int(restriction.ESize())); + restriction.template Gather(*nodes, X_e); + + // assumes all elements are the same order + int p = fes->GetElementOrder(0); + + int spatial_dim = d.mesh_.SpaceDimension(); + int geometry_dim = dimension_of(g); + int qpts_per_elem = num_quadrature_points(g, q); + + // NB: we only want the number of elements with the specified + // geometry, which is not the same as mesh->GetNE() in general + elements = d.get(g); + + num_elements = std::size_t(elements.size()); + + X = mfem::Vector(int(num_elements) * qpts_per_elem * spatial_dim); + J = mfem::Vector(int(num_elements) * qpts_per_elem * spatial_dim * geometry_dim); + +#define DISPATCH_KERNEL(GEOM, P, Q) \ + if (g == mfem::Geometry::GEOM && p == P && q == Q) { \ + compute_geometric_factors>(X, J, X_e, \ + elements); \ + return; \ + } + + DISPATCH_KERNEL(SEGMENT, 1, 1); + DISPATCH_KERNEL(SEGMENT, 1, 2); + DISPATCH_KERNEL(SEGMENT, 1, 3); + DISPATCH_KERNEL(SEGMENT, 1, 4); + + DISPATCH_KERNEL(SEGMENT, 2, 1); + DISPATCH_KERNEL(SEGMENT, 2, 2); + DISPATCH_KERNEL(SEGMENT, 2, 3); + DISPATCH_KERNEL(SEGMENT, 2, 4); + + DISPATCH_KERNEL(SEGMENT, 3, 1); + DISPATCH_KERNEL(SEGMENT, 3, 2); + DISPATCH_KERNEL(SEGMENT, 3, 3); + DISPATCH_KERNEL(SEGMENT, 3, 4); + + DISPATCH_KERNEL(TRIANGLE, 1, 1); + DISPATCH_KERNEL(TRIANGLE, 1, 2); + DISPATCH_KERNEL(TRIANGLE, 1, 3); + DISPATCH_KERNEL(TRIANGLE, 1, 4); + + DISPATCH_KERNEL(TRIANGLE, 2, 1); + DISPATCH_KERNEL(TRIANGLE, 2, 2); + DISPATCH_KERNEL(TRIANGLE, 2, 3); + DISPATCH_KERNEL(TRIANGLE, 2, 4); + + DISPATCH_KERNEL(TRIANGLE, 3, 1); + DISPATCH_KERNEL(TRIANGLE, 3, 2); + DISPATCH_KERNEL(TRIANGLE, 3, 3); + DISPATCH_KERNEL(TRIANGLE, 3, 4); + + DISPATCH_KERNEL(SQUARE, 1, 1); + DISPATCH_KERNEL(SQUARE, 1, 2); + DISPATCH_KERNEL(SQUARE, 1, 3); + DISPATCH_KERNEL(SQUARE, 1, 4); + + DISPATCH_KERNEL(SQUARE, 2, 1); + DISPATCH_KERNEL(SQUARE, 2, 2); + DISPATCH_KERNEL(SQUARE, 2, 3); + DISPATCH_KERNEL(SQUARE, 2, 4); + + DISPATCH_KERNEL(SQUARE, 3, 1); + DISPATCH_KERNEL(SQUARE, 3, 2); + DISPATCH_KERNEL(SQUARE, 3, 3); + DISPATCH_KERNEL(SQUARE, 3, 4); + +#undef DISPATCH_KERNEL + + std::cout << "should never be reached" << std::endl; + } // descriptions copied from mfem diff --git a/src/serac/numerics/functional/integral.hpp b/src/serac/numerics/functional/integral.hpp index 337118a7d..88d6f95ba 100644 --- a/src/serac/numerics/functional/integral.hpp +++ b/src/serac/numerics/functional/integral.hpp @@ -11,6 +11,7 @@ #include "mfem.hpp" +#include "serac/serac_config.hpp" #include "serac/infrastructure/accelerator.hpp" #include "serac/numerics/functional/geometric_factors.hpp" #include "serac/numerics/functional/function_signature.hpp" @@ -21,6 +22,7 @@ namespace serac { /// @brief a class for representing a Integral calculations and their derivatives +template struct Integral { /// @brief the number of different kinds 
of integration domains static constexpr std::size_t num_types = 2; @@ -76,6 +78,7 @@ struct Integral { for (std::size_t i = 0; i < active_trial_spaces_.size(); i++) { inputs[i] = input_E[uint32_t(active_trial_spaces_[i])].GetBlock(geometry).Read(); } + output_E.GetBlock(geometry) = 0.0; func(t, inputs, output_E.GetBlock(geometry).ReadWrite(), update_state); } } @@ -97,6 +100,7 @@ struct Integral { // if this integral actually depends on the specified variable if (functional_to_integral_index_.count(differentiation_index) > 0) { for (auto& [geometry, func] : jvp_[functional_to_integral_index_.at(differentiation_index)]) { + output_E.GetBlock(geometry) = 0.0; func(input_E.GetBlock(geometry).Read(), output_E.GetBlock(geometry).ReadWrite()); } } @@ -109,8 +113,8 @@ struct Integral { * test_dofs_per_elem) * @param differentiation_index the index of the trial space being differentiated */ - void ComputeElementGradients(std::map >& K_e, - uint32_t differentiation_index) const + void ComputeElementGradients(std::map>& K_e, + uint32_t differentiation_index) const { // if this integral actually depends on the specified variable if (functional_to_integral_index_.count(differentiation_index) > 0) { @@ -130,19 +134,19 @@ struct Integral { std::map evaluation_; /// @brief kernels for integral evaluation + derivative w.r.t. specified argument over each type of element - std::vector > evaluation_with_AD_; + std::vector> evaluation_with_AD_; /// @brief signature of element jvp kernel using jacobian_vector_product_func = std::function; /// @brief kernels for jacobian-vector product of integral calculation - std::vector > jvp_; + std::vector> jvp_; /// @brief signature of element gradient kernel - using grad_func = std::function)>; + using grad_func = std::function)>; /// @brief kernels for calculation of element jacobians - std::vector > element_gradient_; + std::vector> element_gradient_; /// @brief a list of the trial spaces that take part in this integrand std::vector active_trial_spaces_; @@ -164,7 +168,7 @@ struct Integral { std::map functional_to_integral_index_; /// @brief the spatial positions and jacobians (dx_dxi) for each element type and quadrature point - std::map geometric_factors_; + std::map> geometric_factors_; }; /** @@ -181,23 +185,32 @@ struct Integral { * @param qf the quadrature function * @param qdata the values of any quadrature point data for the material */ -template -void generate_kernels(FunctionSignature s, Integral& integral, const lambda_type& qf, - std::shared_ptr > qdata) +template +void generate_kernels(FunctionSignature s, Integral& integral, const lambda_type& qf, + std::shared_ptr> qdata) { - integral.geometric_factors_[geom] = GeometricFactors(integral.domain_, Q, geom); - GeometricFactors& gf = integral.geometric_factors_[geom]; + integral.geometric_factors_[geom] = GeometricFactors(integral.domain_, Q, geom); + GeometricFactors& gf = integral.geometric_factors_[geom]; if (gf.num_elements == 0) return; - const double* positions = gf.X.Read(); - const double* jacobians = gf.J.Read(); - const int* elements = &integral.domain_.get(geom)[0]; + gf.X.UseDevice(true); + gf.J.UseDevice(true); + const double* positions = gf.X.Read(true); + const double* jacobians = gf.J.Read(true); const uint32_t num_elements = uint32_t(gf.num_elements); const uint32_t qpts_per_element = num_quadrature_points(geom, Q); std::shared_ptr dummy_derivatives; - integral.evaluation_[geom] = domain_integral::evaluation_kernel( + // Copy elements to device if using CUDA + const int* elements = nullptr; 
+ if constexpr (exec == ExecutionSpace::GPU) { + elements = copy_data(const_cast(integral.domain_.get(geom).data()), + sizeof(int) * integral.domain_.get(geom).size(), "DEVICE"); + } else { + elements = integral.domain_.get(geom).data(); + } + integral.evaluation_[geom] = domain_integral::evaluation_kernel( s, qf, positions, jacobians, qdata, dummy_derivatives, elements, num_elements); constexpr std::size_t num_args = s.num_args; @@ -209,15 +222,16 @@ void generate_kernels(FunctionSignature s, Integral& integral, // action_of_gradient functor below to augment the reference count, and extend its lifetime to match // that of the DomainIntegral that allocated it. using derivative_type = decltype(domain_integral::get_derivative_type(qf, qpt_data_type{})); - auto ptr = accelerator::make_shared_array(num_elements * qpts_per_element); - integral.evaluation_with_AD_[index][geom] = domain_integral::evaluation_kernel( + auto ptr = accelerator::make_shared_array(num_elements * qpts_per_element); + + integral.evaluation_with_AD_[index][geom] = domain_integral::evaluation_kernel( s, qf, positions, jacobians, qdata, ptr, elements, num_elements); integral.jvp_[index][geom] = - domain_integral::jacobian_vector_product_kernel(s, ptr, elements, num_elements); + domain_integral::jacobian_vector_product_kernel(s, ptr, elements, num_elements); integral.element_gradient_[index][geom] = - domain_integral::element_gradient_kernel(s, ptr, elements, num_elements); + domain_integral::element_gradient_kernel(s, ptr, elements, num_elements); }); } @@ -235,25 +249,25 @@ void generate_kernels(FunctionSignature s, Integral& integral, * @param argument_indices the indices of trial space arguments used in the Integral * @return Integral the initialized `Integral` object */ -template -Integral MakeDomainIntegral(const Domain& domain, const lambda_type& qf, - std::shared_ptr > qdata, - std::vector argument_indices) +template +Integral MakeDomainIntegral(const Domain& domain, const lambda_type& qf, + std::shared_ptr> qdata, + std::vector argument_indices) { FunctionSignature signature; SLIC_ERROR_IF(domain.type_ != Domain::Type::Elements, "Error: trying to evaluate a domain integral over a boundary"); - Integral integral(domain, argument_indices); + Integral integral(domain, argument_indices); if constexpr (dim == 2) { - generate_kernels(signature, integral, qf, qdata); - generate_kernels(signature, integral, qf, qdata); + generate_kernels(signature, integral, qf, qdata); + generate_kernels(signature, integral, qf, qdata); } if constexpr (dim == 3) { - generate_kernels(signature, integral, qf, qdata); - generate_kernels(signature, integral, qf, qdata); + generate_kernels(signature, integral, qf, qdata); + generate_kernels(signature, integral, qf, qdata); } return integral; @@ -272,21 +286,28 @@ Integral MakeDomainIntegral(const Domain& domain, const lambda_type& qf, * @param integral the Integral object to initialize * @param qf the quadrature function */ -template -void generate_bdr_kernels(FunctionSignature s, Integral& integral, const lambda_type& qf) +template +void generate_bdr_kernels(FunctionSignature s, Integral& integral, const lambda_type& qf) { - integral.geometric_factors_[geom] = GeometricFactors(integral.domain_, Q, geom, FaceType::BOUNDARY); - GeometricFactors& gf = integral.geometric_factors_[geom]; + integral.geometric_factors_[geom] = GeometricFactors(integral.domain_, Q, geom, FaceType::BOUNDARY); + GeometricFactors& gf = integral.geometric_factors_[geom]; if (gf.num_elements == 0) return; - const double* 
positions = gf.X.Read(); - const double* jacobians = gf.J.Read(); + const double* positions = gf.X.Read(true); + const double* jacobians = gf.J.Read(true); const uint32_t num_elements = uint32_t(gf.num_elements); const uint32_t qpts_per_element = num_quadrature_points(geom, Q); - const int* elements = &gf.elements[0]; + + int* elements = nullptr; + if constexpr (exec == ExecutionSpace::GPU) { + elements = copy_data(const_cast(&gf.elements[0]), sizeof(int) * integral.domain_.get(geom).size(), "DEVICE"); + } else { + elements = &gf.elements[0]; + } std::shared_ptr dummy_derivatives; - integral.evaluation_[geom] = boundary_integral::evaluation_kernel( + integral.evaluation_[geom] = boundary_integral::evaluation_kernel( s, qf, positions, jacobians, dummy_derivatives, elements, num_elements); constexpr std::size_t num_args = s.num_args; @@ -298,15 +319,15 @@ void generate_bdr_kernels(FunctionSignature s, Integral& integr // action_of_gradient functor below to augment the reference count, and extend its lifetime to match // that of the boundaryIntegral that allocated it. using derivative_type = decltype(boundary_integral::get_derivative_type(qf)); - auto ptr = accelerator::make_shared_array(num_elements * qpts_per_element); + auto ptr = accelerator::make_shared_array(num_elements * qpts_per_element); - integral.evaluation_with_AD_[index][geom] = - boundary_integral::evaluation_kernel(s, qf, positions, jacobians, ptr, elements, num_elements); + integral.evaluation_with_AD_[index][geom] = boundary_integral::evaluation_kernel( + s, qf, positions, jacobians, ptr, elements, num_elements); integral.jvp_[index][geom] = - boundary_integral::jacobian_vector_product_kernel(s, ptr, elements, num_elements); + boundary_integral::jacobian_vector_product_kernel(s, ptr, elements, num_elements); integral.element_gradient_[index][geom] = - boundary_integral::element_gradient_kernel(s, ptr, elements, num_elements); + boundary_integral::element_gradient_kernel(s, ptr, elements, num_elements); }); } @@ -324,23 +345,23 @@ void generate_bdr_kernels(FunctionSignature s, Integral& integr * * @note this function is not meant to be called by users */ -template -Integral MakeBoundaryIntegral(const Domain& domain, const lambda_type& qf, std::vector argument_indices) +template +Integral MakeBoundaryIntegral(const Domain& domain, const lambda_type& qf, std::vector argument_indices) { FunctionSignature signature; SLIC_ERROR_IF(domain.type_ != Domain::Type::BoundaryElements, "Error: trying to evaluate a boundary integral over a non-boundary domain of integration"); - Integral integral(domain, argument_indices); + Integral integral(domain, argument_indices); if constexpr (dim == 1) { - generate_bdr_kernels(signature, integral, qf); + generate_bdr_kernels(signature, integral, qf); } if constexpr (dim == 2) { - generate_bdr_kernels(signature, integral, qf); - generate_bdr_kernels(signature, integral, qf); + generate_bdr_kernels(signature, integral, qf); + generate_bdr_kernels(signature, integral, qf); } return integral; diff --git a/src/serac/numerics/functional/quadrature_data.hpp b/src/serac/numerics/functional/quadrature_data.hpp index b3ace3c4e..570874b86 100644 --- a/src/serac/numerics/functional/quadrature_data.hpp +++ b/src/serac/numerics/functional/quadrature_data.hpp @@ -40,7 +40,17 @@ struct Nothing {}; /** * @brief see `Nothing` for a complete description of this class and when to use it */ -struct Empty {}; +struct Empty { + /** + * @brief operator= overload + * @tparam T anything can be set to Empty + */ + template + 
SERAC_HOST_DEVICE auto operator=(T) const + { + return Empty{}; + } +}; template struct QuadratureData; diff --git a/src/serac/numerics/functional/tensor.hpp b/src/serac/numerics/functional/tensor.hpp index e215e486e..1b769bc46 100644 --- a/src/serac/numerics/functional/tensor.hpp +++ b/src/serac/numerics/functional/tensor.hpp @@ -12,6 +12,24 @@ #pragma once +/** + * @brief Macro defining block size for CUDA/HIP targets + */ +#define BLOCK_SZ 128 +/** + * @brief Macro defining block dimension for CUDA/HIP targets + */ +#define BLOCK_X 8 +/** + * @brief Macro defining block dimension for CUDA/HIP targets + */ +#define BLOCK_Y 4 +/** + * @brief Macro defining block dimension for CUDA/HIP targets + */ +#define BLOCK_Z 4 + +#include "serac/serac_config.hpp" #include "serac/infrastructure/accelerator.hpp" #include "detail/metaprogramming.hpp" @@ -708,6 +726,7 @@ template SERAC_HOST_DEVICE constexpr auto dot(const tensor& A, const tensor& B) { tensor AB{}; + for (int i = 0; i < m; i++) { for (int j = 0; j < p; j++) { for (int k = 0; k < n; k++) { @@ -903,7 +922,7 @@ SERAC_HOST_DEVICE constexpr auto dot(const tensor& A, const tenso /// compute the cross product of the columns of A: A(:,1) x A(:,2) template -auto cross(const tensor& A) +auto SERAC_HOST_DEVICE cross(const tensor& A) { return tensor{A(1, 0) * A(2, 1) - A(2, 0) * A(1, 1), A(2, 0) * A(0, 1) - A(0, 0) * A(2, 1), A(0, 0) * A(1, 1) - A(1, 0) * A(0, 1)}; @@ -911,21 +930,21 @@ auto cross(const tensor& A) /// return the in-plane components of the cross product of {v[0], v[1], 0} x {0, 0, 1} template -auto cross(const tensor& v) +auto SERAC_HOST_DEVICE cross(const tensor& v) { return tensor{v(1, 0), -v(0, 0)}; } /// return the in-plane components of the cross product of {v[0], v[1], 0} x {0, 0, 1} template -auto cross(const tensor& v) +auto SERAC_HOST_DEVICE cross(const tensor& v) { return tensor{v[1], -v[0]}; } /// compute the (right handed) cross product of two 3-vectors template -auto cross(const tensor& u, const tensor& v) +auto SERAC_HOST_DEVICE cross(const tensor& u, const tensor& v) { return tensor{u(1) * v(2) - u(2) * v(1), u(2) * v(0) - u(0) * v(2), u(0) * v(1) - u(1) * v(0)}; @@ -1282,7 +1301,7 @@ SERAC_HOST_DEVICE constexpr auto detApIm1(const tensor& A) // clang-format off // equivalent to tr(A) + I2(A) + det(A) - return A(0, 0) + A(1, 1) + A(2, 2) + return A(0, 0) + A(1, 1) + A(2, 2) - A(0, 1) * A(1, 0) * (1 + A(2, 2)) + A(0, 0) * A(1, 1) * (1 + A(2, 2)) - A(0, 2) * A(2, 0) * (1 + A(1, 1)) @@ -1358,12 +1377,8 @@ SERAC_HOST_DEVICE auto contract(const tensor& A, const tensor{}; - if constexpr (d3 != 0) return tensor{}; - }(); - if constexpr (d3 == 0) { + auto C = tensor{}; for (int i = 0; i < d1; i++) { for (int j = 0; j < d2; j++) { U sum{}; @@ -1376,7 +1391,11 @@ SERAC_HOST_DEVICE auto contract(const tensor& A, const tensor{}; for (int i = 0; i < d1; i++) { for (int j = 0; j < d2; j++) { for (int k = 0; k < d3; k++) { @@ -1393,9 +1412,211 @@ SERAC_HOST_DEVICE auto contract(const tensor& A, const tensor +SERAC_HOST_DEVICE constexpr auto deduce_contract_return_type(const tensor&, const tensor&) +{ + constexpr int Adims[] = {m, n...}; + constexpr int Bdims[] = {p, q}; + static_assert(sizeof...(n) < 3); + static_assert(Adims[i1] == Bdims[i2], "error: incompatible tensor dimensions"); + + // first, we have to figure out the dimensions of the output tensor + constexpr int new_dim = (i2 == 0) ? q : p; + constexpr int d1 = (i1 == 0) ? new_dim : Adims[0]; + constexpr int d2 = (i1 == 1) ? 
new_dim : Adims[1]; + constexpr int d3 = sizeof...(n) == 1 ? 0 : ((i1 == 2) ? new_dim : Adims[2]); + using U = decltype(S{} * T{}); + if constexpr (d3 == 0) { + return serac::tensor{}; + } + return tensor{}; +} + +/// @overload +template +SERAC_HOST_DEVICE constexpr auto deduce_contract_return_type(const zero&, const tensor&) +{ + return zero{}; +} + +// TODO(bowen): Figure out using decltype vs helper structs for shared memory buffers +/* template + class contract_type { + +}; + + template + class contract_type, tensor> { + SERAC_HOST_DEVICE contract_type(const tensor&, const tensor&) {} + + static constexpr int Adims[] = {m, n...}; + static constexpr int Bdims[] = {p, q}; + static_assert(sizeof...(n) < 3); + static_assert(Adims[i1] == Bdims[i2], "error: incompatible tensor dimensions"); + + // first, we have to figure out the dimensions of the output tensor + static constexpr int new_dim = (i2 == 0) ? q : p; + static constexpr int d1 = (i1 == 0) ? new_dim : Adims[0]; + static constexpr int d2 = (i1 == 1) ? new_dim : Adims[1]; + static constexpr int d3 = sizeof...(n) == 1 ? 0 : ((i1 == 2) ? new_dim : Adims[2]); + using U = decltype(S{} * T{}); + // if constexpr (d3 == 0) { + // return serac::tensor{}; + // } + public: + using type = typename std::conditional, tensor>::type; + }; + + template + contract_type(tensor, tensor) -> contract_type<1, 1, tensor, tensor>; + + template + struct contract_type { + SERAC_HOST_DEVICE contract_type(const zero&, const S&) {} + using type = zero; + }; + + template + struct contract_type { + SERAC_HOST_DEVICE contract_type(const S&, const zero&) {} + using type = zero; + }; + */ + +/** + * @brief When deducing the result of a sum of contractions, e.g. the type + * using type = decltype(contract(A, B) + contract(C, D)) + * it is necessary to determine if both contractions result in a zero tensor. + * This simple struct has + * type = zero if S and T are identically zero + * type = S if T is zero + * type = T if S is zero + * type = S if neither is zero (S == T always in this case) + */ +template +struct is_zero_and { +private: + static constexpr bool is_S_zero = std::is_same_v; + static constexpr bool is_T_zero = std::is_same_v; + using helper_t = typename std::conditional::type; + +public: + /// @brief type corresponding to the rules described in brief above. + using type = typename std::conditional::type; +}; + +/** + * @brief a convenience function that computes a dot product between + * two tensors, and allows the user to specify which indices to sum + * over. This overload is void, and accepts a pointer to the resultant + * tensor. Useful for GPU execution, where buffers must be filled + * manually. 
+ * + * @tparam i1 the index of contraction for the left operand + * @tparam i2 the index of contraction for the right operand + * @tparam S the datatype stored in the left operand + * @tparam m leading dimension of the left operand + * @tparam n the trailing dimensions of the left operand + * @tparam T the datatype stored in the right operand + * @tparam p the number of rows in the right operand + * @tparam q the number of columns in the right operand + * @tparam n0 the first dimension of the result + * @tparam n1 the second dimension of the result + * @tparam n2 the third dimension of the result + * @param A the left operand + * @param B the right operand + * @param C the result + * @param qx thread index for GPU backends + * @param qy thread index for GPU backends + * @param qz thread index for GPU backends + * @param accumulate control whether contract adds the result to C, or sets C. + */ +template +SERAC_HOST_DEVICE void contract(const tensor& A, const tensor& B, tensor* C, int qx, + int qy, int qz, bool accumulate = false) +{ + constexpr int Adims[] = {m, n...}; + constexpr int Bdims[] = {p, q}; + static_assert(sizeof...(n) < 3); + static_assert(Adims[l1] == Bdims[l2], "error: incompatible tensor dimensions"); + static_assert(sizeof...(n) != 1); + using U = decltype(S{} * T{}); + for (int i = qz; i < n0; i += BLOCK_Z) { + for (int j = qy; j < n1; j += BLOCK_Y) { + for (int k = qx; k < n2; k += BLOCK_X) { + U sum = 0.0; + + for (int l = 0; l < Adims[l1]; l++) { + if constexpr (l1 == 0 && l2 == 0) { + sum += A(l, j, k) * B(l, i); + } + if constexpr (l1 == 1 && l2 == 0) { + sum += A(i, l, k) * B(l, j); + } + if constexpr (l1 == 2 && l2 == 0) { + sum += A(i, j, l) * B(l, k); + } + if constexpr (l1 == 0 && l2 == 1) { + sum += A(l, j, k) * B(i, l); + } + if constexpr (l1 == 1 && l2 == 1) { + sum += A(i, l, k) * B(j, l); + } + if constexpr (l1 == 2 && l2 == 1) { + sum += A(i, j, l) * B(k, l); + } + } + if (accumulate) { + (*C)(i, j, k) += sum; + } else { + (*C)(i, j, k) = sum; + } + } + } + } +} + +/// @overload +template +SERAC_HOST_DEVICE void contract(const zero&, const tensor&, zero*, int, int, int, + [[maybe_unused]] bool accumulate = false) +{ + return; +} + +/// @overload +template +SERAC_HOST_DEVICE void contract(const zero&, const zero&, zero*, int, int, int, + [[maybe_unused]] bool accumulate = false) +{ + return; +} + +/// @overload +template +SERAC_HOST_DEVICE void contract(const tensor&, const zero&, zero*, int, int, int, + [[maybe_unused]] bool accumulate = false) +{ + return; +} + +/// @overload +template +SERAC_HOST_DEVICE void contract(const zero&, const tensor&, tensor*, int, int, int, + [[maybe_unused]] bool accumulate = false) +{ + return; } /// @overload @@ -1625,6 +1846,7 @@ SERAC_HOST_DEVICE constexpr tensor inv(const tensor& return invA; } + /** * @overload * @note For N-by-N matrices with N > 3, requires Gaussian elimination @@ -1678,8 +1900,8 @@ inline SERAC_HOST_DEVICE void print(double value) { printf("%f", value); } * @brief print a tensor using `printf`, so that it is suitable for use inside cuda kernels. 
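The pointer-based contract overloads above partition the output entries across a BLOCK_X x BLOCK_Y x BLOCK_Z tile of threads, each thread striding over its own share and writing into a caller-provided buffer. On the host, the same coverage can be reproduced by enumerating every (qx, qy, qz) combination; a small sketch with made-up dimensions:

    #include "serac/numerics/functional/tensor.hpp"

    void contract_sketch()
    {
      serac::tensor<double, 4, 4, 8> A{};
      serac::tensor<double, 4, 8>    B{};
      serac::tensor<double, 4, 4, 4> C{};  // result of contracting index 2 of A with index 1 of B

      // on the GPU, (qx, qy, qz) are thread indices; here we simply enumerate them,
      // and each call fills a disjoint, strided subset of C's entries
      for (int qz = 0; qz < BLOCK_Z; qz++) {
        for (int qy = 0; qy < BLOCK_Y; qy++) {
          for (int qx = 0; qx < BLOCK_X; qx++) {
            serac::contract<2, 1>(A, B, &C, qx, qy, qz, /* accumulate = */ false);
          }
        }
      }
    }
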
* @param[in] A The tensor to write out */ -template -SERAC_HOST_DEVICE void print(const tensor& A) +template +SERAC_HOST_DEVICE void print(const tensor& A) { printf("{"); print(A[0]); @@ -1687,7 +1909,58 @@ SERAC_HOST_DEVICE void print(const tensor& A) printf(","); print(A[i]); } - printf("}"); + printf("}\n"); +} + +/** + * @brief print a tensor's shape using printf for use inside CUDA kernels + */ +template +SERAC_HOST_DEVICE void print_shape(const tensor&) +{ + printf("shape [ %d", m); + (printf(", %d", n), ...); + printf(" ]\n"); +} + +/** + * @brief helper function for setting a tensor to a value on GPU + * @param[in] A the tensor + * @param value the value to fill the tensor with + * @param qx thread index + * @param qy thread index + * @param qz thread index + */ +template +SERAC_HOST_DEVICE void memset_tensor(tensor& A, T value, int qx, int qy, int qz) +{ + for (int i = 0; i < m; ++i) { + memset_tensor(A[i], value, qx, qy, qz); + } +} + +/// @overload +template +SERAC_HOST_DEVICE void memset_tensor(serac::zero&, T, int, int, int) +{ +} + +/// @overload +template +SERAC_HOST_DEVICE void memset_tensor(tensor& A, T value, int qx, int qy, int) +{ + if (qx < m && qy < n) { + A(qx, qy) = value; + } +} + +/// @overload +template +SERAC_HOST_DEVICE void memset_tensor(tensor& A, T value, int qx, int qy, int qz) +{ + if (qx < m && qy < n && qz < l) { + A(qx, qy, qz) = value; + } } /** @@ -1881,6 +2154,28 @@ SERAC_HOST_DEVICE constexpr int size(const tensor&) return (n * ... * 1); } +/** + * @brief returns the total number of stored values in a tensor + * + * @tparam T the datatype stored in the tensor + * @tparam n the extents of each dimension + * @return the total number of values stored in the tensor + */ +template +SERAC_HOST_DEVICE constexpr int size(const tensor*) +{ + return (n * ... * 1); +} + +/** + * @brief returns the total number of stored values in a tensor + * + * @tparam T the datatype stored in the tensor + * @tparam n the extents of each dimension + * @return the total number of values stored in the tensor + */ +SERAC_HOST_DEVICE constexpr int size(const zero*) { return 1; } + /** * @overload * @brief overload of size() for `double`, we say a double "stores" 1 value @@ -1888,7 +2183,7 @@ SERAC_HOST_DEVICE constexpr int size(const tensor&) SERAC_HOST_DEVICE constexpr int size(const double&) { return 1; } /// @overload -SERAC_HOST_DEVICE constexpr int size(zero) { return 0; } +SERAC_HOST_DEVICE constexpr int size(zero) { return 1; } /** * @brief a function for querying the ith dimension of a tensor @@ -1931,6 +2226,37 @@ bool isnan(const tensor& A) /// @overload inline bool isnan(const zero&) { return false; } +/** + * @brief Helper function that uses umpire to move memory between HOST and DEVICE. + * destination must be either "HOST" or "DEVICE". + */ +template +DataType* copy_data(DataType* source_data, std::size_t size, const std::string& destination) +{ + auto& rm = umpire::ResourceManager::getInstance(); + auto dest_allocator = rm.getAllocator(destination); + + DataType* dest_data = static_cast(dest_allocator.allocate(size)); + + umpire::register_external_allocation( + source_data, umpire::util::AllocationRecord(source_data, size, rm.getAllocator("HOST").getAllocationStrategy(), + std::string("external array"))); + rm.copy(dest_data, source_data); + + return dest_data; +} + +/** + * @brief Helper function to deallocate memory allocated with umpire. 
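copy_data and the deallocate helper completed just below wrap umpire's ResourceManager so that element index arrays and similar buffers can be mirrored on the device. A usage sketch, assuming a device-enabled umpire build that provides the "DEVICE" allocator (the function and variable names in the sketch are illustrative):

    #include <vector>
    #include "serac/numerics/functional/tensor.hpp"

    void copy_data_example()
    {
      std::vector<int> element_ids = {0, 1, 2, 3};

      // registers the host pointer with umpire, allocates a DEVICE buffer of the
      // same size (in bytes), and copies the host data into it
      int* device_ids = serac::copy_data(element_ids.data(), sizeof(int) * element_ids.size(), "DEVICE");

      // ... hand device_ids to kernels running on the GPU ...

      serac::deallocate(device_ids, "DEVICE");
    }
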
+ */ +template +void deallocate(DataType* data, const std::string& destination) +{ + auto& rm = umpire::ResourceManager::getInstance(); + auto dest_allocator = rm.getAllocator(destination); + dest_allocator.deallocate(data); +} + } // namespace serac #if 0 @@ -2033,7 +2359,7 @@ inline mat < 2, 2 > look_at(const vec < 2 > & direction) { inline mat < 3, 3 > R3_basis(const vec3 & n) { float sign = (n[2] >= 0.0f) ? 1.0f : -1.0f; - float a = -1.0f / (sign + n[2]); + float a = -1.0f / (sign + n[2]); float b = n[0] * n[1] * a; return mat < 3, 3 >{ diff --git a/src/serac/numerics/functional/tests/CMakeLists.txt b/src/serac/numerics/functional/tests/CMakeLists.txt index 9118582f2..6fe996c24 100644 --- a/src/serac/numerics/functional/tests/CMakeLists.txt +++ b/src/serac/numerics/functional/tests/CMakeLists.txt @@ -5,6 +5,7 @@ # SPDX-License-Identifier: (BSD-3-Clause) set(functional_test_depends gtest serac_functional ${functional_depends}) +blt_list_append(TO functional_depends ELEMENTS blt::cuda IF ENABLE_CUDA) # This test is all constexpr-evaluated, so it doesn't # actually need to run, if it compiles without error, the tests have passed @@ -58,15 +59,21 @@ target_link_libraries(bug_boundary_qoi PUBLIC serac_physics) if(ENABLE_CUDA) - set(functional_cuda_test_sources - tensor_unit_tests_cuda.cu -# some of the GPU functionality is temporarily disabled to -# help incrementally roll-out the variadic implementation of Functional -# TODO: re-enable GPU kernels in a follow-up PR -# functional_comparisons_cuda.cu - ) + set(functional_tests_cuda + # functional_basic_hcurl.cpp + # functional_with_domain.cpp + functional_basic_h1_scalar.cpp + # functional_basic_h1_vector.cpp + # functional_multiphysics.cpp + # functional_qoi.cpp + # functional_nonlinear.cpp + # functional_boundary_test.cpp + functional_comparisons.cpp + # functional_comparison_L2.cpp + ) - serac_add_tests( SOURCES ${functional_cuda_test_sources} - DEPENDS_ON ${functional_test_depends}) + serac_add_tests( SOURCES ${functional_tests_cuda} + USE_CUDA ON + DEPENDS_ON gtest serac_functional serac_state ${functional_depends} blt::cuda) endif() diff --git a/src/serac/numerics/functional/tests/check_gradient.hpp b/src/serac/numerics/functional/tests/check_gradient.hpp index fb48070f3..c3cabe2a6 100644 --- a/src/serac/numerics/functional/tests/check_gradient.hpp +++ b/src/serac/numerics/functional/tests/check_gradient.hpp @@ -25,7 +25,8 @@ void check_gradient(serac::Functional& f, double t, mfem::Vector& U, do } dU.Randomize(seed); - auto [value, dfdU] = f(t, serac::differentiate_wrt(U)); + auto [value, dfdU] = f(t, serac::differentiate_wrt(U)); + std::unique_ptr dfdU_matrix = assemble(dfdU); // jacobian vector products @@ -54,6 +55,8 @@ void check_gradient(serac::Functional& f, double t, mfem::Vector& U, do // forward-difference approximations mfem::Vector df1_fd[2]; + df1_fd[0].UseDevice(true); + df1_fd[1].UseDevice(true); df1_fd[0] = f_values[4]; df1_fd[0] -= f_values[2]; df1_fd[0] /= (2.0 * epsilon); @@ -64,6 +67,8 @@ void check_gradient(serac::Functional& f, double t, mfem::Vector& U, do // center-difference approximations mfem::Vector df1_cd[2]; + df1_cd[0].UseDevice(true); + df1_cd[1].UseDevice(true); df1_cd[0] = f_values[4]; df1_cd[0] -= f_values[0]; df1_cd[0] /= (4.0 * epsilon); @@ -78,14 +83,14 @@ void check_gradient(serac::Functional& f, double t, mfem::Vector& U, do // halving epsilon should make the error decrease // by about a factor of two for the forward-difference stencil - double e1 = df1_fd[0].DistanceTo(df_jvp1.GetData()) / 
denominator; - double e2 = df1_fd[1].DistanceTo(df_jvp1.GetData()) / denominator; + double e1 = df_jvp1.DistanceTo(df1_fd[0].HostRead()) / denominator; + double e2 = df_jvp1.DistanceTo(df1_fd[1].HostRead()) / denominator; EXPECT_TRUE(fabs(e1 / e2 - 2.0) < 0.1 || fmin(e1, e2) < 1.0e-9); // halving epsilon should make the error decrease // by about a factor of four for the center-difference stencil - double e3 = df1_cd[0].DistanceTo(df_jvp1.GetData()) / denominator; - double e4 = df1_cd[1].DistanceTo(df_jvp1.GetData()) / denominator; + double e3 = df_jvp1.DistanceTo(df1_cd[0].HostRead()) / denominator; + double e4 = df_jvp1.DistanceTo(df1_cd[1].HostRead()) / denominator; EXPECT_TRUE((fabs(e3 / e4 - 4.0) < 0.1) || fmin(e3, e4) < 1.0e-9); } @@ -96,9 +101,11 @@ void check_gradient(serac::Functional& f, double t, const mfem::Vector& U, co int seed = 42; mfem::Vector dU(U.Size()); + dU.UseDevice(true); dU.Randomize(seed); mfem::Vector ddU_dt(U.Size()); + ddU_dt.UseDevice(true); ddU_dt.Randomize(seed + 1); { diff --git a/src/serac/numerics/functional/tests/functional_basic_h1_scalar.cpp b/src/serac/numerics/functional/tests/functional_basic_h1_scalar.cpp index 7a4fc6118..c3466add6 100644 --- a/src/serac/numerics/functional/tests/functional_basic_h1_scalar.cpp +++ b/src/serac/numerics/functional/tests/functional_basic_h1_scalar.cpp @@ -69,7 +69,7 @@ void thermal_test_impl(std::unique_ptr& mesh) auto [trial_fespace, trial_fec] = serac::generateParFiniteElementSpace(mesh.get()); mfem::Vector U(trial_fespace->TrueVSize()); - + U.UseDevice(true); mfem::ParGridFunction U_gf(trial_fespace.get()); mfem::FunctionCoefficient x_squared([](mfem::Vector x) { return x[0] * x[0]; }); U_gf.ProjectCoefficient(x_squared); @@ -115,7 +115,7 @@ TEST(mixed, thermal_tets_and_hexes) { thermal_test<2, 1>("/data/meshes/patch3D_t int main(int argc, char* argv[]) { - serac::accelerator::initializeDevice(); + serac::accelerator::initializeDevice(exec_space); ::testing::InitGoogleTest(&argc, argv); int num_procs, myid; @@ -128,7 +128,7 @@ int main(int argc, char* argv[]) int result = RUN_ALL_TESTS(); MPI_Finalize(); - serac::accelerator::terminateDevice(); + serac::accelerator::terminateDevice(exec_space); return result; } diff --git a/src/serac/numerics/functional/tests/functional_comparisons.cpp b/src/serac/numerics/functional/tests/functional_comparisons.cpp index 99f4ae9a2..952edf416 100644 --- a/src/serac/numerics/functional/tests/functional_comparisons.cpp +++ b/src/serac/numerics/functional/tests/functional_comparisons.cpp @@ -200,6 +200,9 @@ void functional_test(mfem::ParMesh& mesh, H1
<p, dim> test, H1<p, dim> trial, Dim EXPECT_NEAR(0., diff1.Norml2() / g1.Norml2(), 1.e-14); EXPECT_NEAR(0., diff2.Norml2() / g1.Norml2(), 1.e-14); +#ifdef SERAC_USE_CUDA_KERNEL_EVALUATION + printCUDAMemUsage(); +#endif } // this test sets up part of a toy "magnetic diffusion" problem where the residual includes contributions @@ -425,6 +431,9 @@ void functional_test(mfem::ParMesh& mesh, Hcurl<p> test, Hcurl<p>
trial, Dimensi } EXPECT_NEAR(0., diff1.Norml2() / g1.Norml2(), 1.e-13); EXPECT_NEAR(0., diff2.Norml2() / g1.Norml2(), 1.e-13); +#ifdef SERAC_USE_CUDA_KERNEL_EVALUATION + printCUDAMemUsage(); +#endif } #ifndef SERAC_USE_CUDA_KERNEL_EVALUATION @@ -455,7 +464,7 @@ TEST(Elasticity, 3DCubic) { functional_test(*mesh3D, H1<3, 3>{}, H1<3, 3>{}, Dim int main(int argc, char* argv[]) { - serac::accelerator::initializeDevice(); + serac::accelerator::initializeDevice(exec_space); ::testing::InitGoogleTest(&argc, argv); MPI_Init(&argc, &argv); MPI_Comm_size(MPI_COMM_WORLD, &num_procs); @@ -494,6 +503,6 @@ int main(int argc, char* argv[]) int result = RUN_ALL_TESTS(); MPI_Finalize(); - serac::accelerator::terminateDevice(); + serac::accelerator::terminateDevice(exec_space); return result; } diff --git a/src/serac/numerics/functional/tests/functional_shape_derivatives.cpp b/src/serac/numerics/functional/tests/functional_shape_derivatives.cpp index b79841bc9..e8ac9b70a 100644 --- a/src/serac/numerics/functional/tests/functional_shape_derivatives.cpp +++ b/src/serac/numerics/functional/tests/functional_shape_derivatives.cpp @@ -67,6 +67,7 @@ #include "mfem.hpp" #include "axom/slic/core/SimpleLogger.hpp" +#include "serac/infrastructure/accelerator.hpp" #include "serac/infrastructure/input.hpp" #include "serac/serac_config.hpp" #include "serac/mesh/mesh_utils_base.hpp" @@ -88,7 +89,7 @@ std::unique_ptr mesh2D; std::unique_ptr mesh3D; template -auto monomials(tensor X) +SERAC_HOST_DEVICE auto monomials(tensor X) { if constexpr (dim == 2) { tensor output; @@ -122,7 +123,7 @@ auto monomials(tensor X) } template -auto grad_monomials([[maybe_unused]] tensor X) +SERAC_HOST_DEVICE auto grad_monomials([[maybe_unused]] tensor X) { if constexpr (dim == 2) { tensor output; diff --git a/src/serac/numerics/functional/tests/functional_with_domain.cpp b/src/serac/numerics/functional/tests/functional_with_domain.cpp index cdefd63c5..4b0193ddf 100644 --- a/src/serac/numerics/functional/tests/functional_with_domain.cpp +++ b/src/serac/numerics/functional/tests/functional_with_domain.cpp @@ -159,6 +159,8 @@ void whole_mesh_comparison_test(std::string meshfile) } } +TEST(basic, whole_mesh_comparison_hexes) { whole_mesh_comparison_test<1, 1>("/data/meshes/patch3D_hexes.mesh"); } +#ifndef SERAC_USE_CUDA_KERNEL_EVALUATION TEST(basic, whole_mesh_comparison_tris) { whole_mesh_comparison_test<1, 1>("/data/meshes/patch2D_tris.mesh"); } TEST(basic, whole_mesh_comparison_quads) { whole_mesh_comparison_test<1, 1>("/data/meshes/patch2D_quads.mesh"); } TEST(basic, whole_mesh_comparison_tris_and_quads) @@ -167,7 +169,6 @@ TEST(basic, whole_mesh_comparison_tris_and_quads) } TEST(basic, whole_mesh_comparison_tets) { whole_mesh_comparison_test<1, 1>("/data/meshes/patch3D_tets.mesh"); } -TEST(basic, whole_mesh_comparison_hexes) { whole_mesh_comparison_test<1, 1>("/data/meshes/patch3D_hexes.mesh"); } TEST(basic, whole_mesh_comparison_tets_and_hexes) { whole_mesh_comparison_test<1, 1>("/data/meshes/patch3D_tets_and_hexes.mesh"); @@ -316,11 +317,14 @@ TEST(qoi, partial_domain) EXPECT_NEAR(volume, 4.0, 1.0e-14); } +#endif int main(int argc, char* argv[]) { ::testing::InitGoogleTest(&argc, argv); - +#ifdef SERAC_USE_CUDA_KERNEL_EVALUATION + serac::accelerator::initializeDevice(); +#endif int num_procs, myid; MPI_Init(&argc, &argv); MPI_Comm_size(MPI_COMM_WORLD, &num_procs); @@ -331,6 +335,8 @@ int main(int argc, char* argv[]) int result = RUN_ALL_TESTS(); MPI_Finalize(); - +#ifdef SERAC_USE_CUDA_KERNEL_EVALUATION + 
serac::accelerator::terminateDevice(); +#endif return result; } diff --git a/src/serac/numerics/functional/tuple.hpp b/src/serac/numerics/functional/tuple.hpp index bc2b571d0..c78a365cf 100644 --- a/src/serac/numerics/functional/tuple.hpp +++ b/src/serac/numerics/functional/tuple.hpp @@ -13,6 +13,7 @@ #include +#include "serac/serac_config.hpp" #include "serac/infrastructure/accelerator.hpp" namespace serac { @@ -186,10 +187,44 @@ template struct tuple_size { }; +template +struct tuple_size_ptr { +}; + template struct tuple_size> : std::integral_constant { }; +template +struct tuple_size_ptr> : std::integral_constant { +}; + +/** + * @brief returns the total number of stored values in a tensor + * + * @tparam T the datatype stored in the tensor + * @tparam n the extents of each dimension + * @return the total number of values stored in the tensor + */ +template +SERAC_HOST_DEVICE constexpr int size(const serac::tuple) +{ + return tuple_size>::value; +} + +/** + * @brief returns the total number of stored values in a tensor + * + * @tparam T the datatype stored in the tensor + * @tparam n the extents of each dimension + * @return the total number of values stored in the tensor + */ +template +SERAC_HOST_DEVICE constexpr int size(const serac::tuple) +{ + return tuple_size_ptr>::value; +} + /** * @tparam i the tuple index to access * @tparam T the types stored in the tuple @@ -566,8 +601,8 @@ SERAC_HOST_DEVICE constexpr auto operator*(const tuple& x, const tuple -SERAC_HOST_DEVICE constexpr auto mult_helper(const double a, const tuple& x, std::integer_sequence) +template +SERAC_HOST_DEVICE constexpr auto mult_helper(const double a, const TupleType& x, std::integer_sequence) { return tuple{a * get(x)...}; } @@ -581,8 +616,8 @@ SERAC_HOST_DEVICE constexpr auto mult_helper(const double a, const tuple& * @param a a constant multiplier * @return the returned tuple product */ -template -SERAC_HOST_DEVICE constexpr auto mult_helper(const tuple& x, const double a, std::integer_sequence) +template +SERAC_HOST_DEVICE constexpr auto mult_helper(const TupleType& x, const double a, std::integer_sequence) { return tuple{get(x) * a...}; } @@ -639,6 +674,33 @@ auto& operator<<(std::ostream& out, const serac::tuple& A) return print_helper(out, A, std::make_integer_sequence()); } +/** + * @brief Overload for the print function to print tuples or tensors of tuples. + Enables calling print(tensor, ...> a); + + * @tparam T The tuple types + * @tparam i The integer sequence to i + * @param A the tensor to print +*/ +template +SERAC_HOST_DEVICE void print(const serac::tuple& A, std::integer_sequence) +{ + (print(get(A)), ...); +} + +/** + * @brief Helper function for the above overload + + * @tparam T The tuple types + * @tparam i The integer sequence to i + * @param A the tensor to print +*/ +template +SERAC_HOST_DEVICE void print(const serac::tuple& A) +{ + print(A, std::make_integer_sequence()); +} + /** * @brief A helper to apply a lambda to a tuple * diff --git a/src/serac/numerics/functional/tuple_tensor_dual_functions.hpp b/src/serac/numerics/functional/tuple_tensor_dual_functions.hpp index e44a31597..3d335fb87 100644 --- a/src/serac/numerics/functional/tuple_tensor_dual_functions.hpp +++ b/src/serac/numerics/functional/tuple_tensor_dual_functions.hpp @@ -1,8 +1,17 @@ +// Copyright (c) 2019-2023, Lawrence Livermore National Security, LLC and +// other Serac Project Developers. See the top-level LICENSE file for +// details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) + #pragma once +#include "serac/infrastructure/accelerator.hpp" +#include "serac/serac_config.hpp" #include "serac/numerics/functional/tuple.hpp" #include "serac/numerics/functional/tensor.hpp" #include "serac/numerics/functional/dual.hpp" +#include "RAJA/RAJA.hpp" #include "mfem.hpp" @@ -212,26 +221,51 @@ SERAC_HOST_DEVICE auto promote_to_dual_when(const T& x) } /** - * @brief a function that optionally (decided at compile time) converts a list of values to their dual types + * @brief Deduce return type of promote_each_to_dual_when. Useful for type deduction while + * allocating memory * * @tparam dualify specify whether or not the input should be made into its dual type * @tparam T the type of the values passed in * @tparam n how many values were passed in - * @param x the values to be promoted */ template -SERAC_HOST_DEVICE auto promote_each_to_dual_when(const tensor& x) +auto promote_each_to_dual_when_output_helper(const tensor&) { if constexpr (dualify) { using return_type = decltype(make_dual(T{})); - tensor output; - for (int i = 0; i < n; i++) { - output[i] = make_dual(x[i]); - } - return output; + return tensor{}; } if constexpr (!dualify) { - return x; + return tensor{}; + } +} + +/** + * @brief a function that optionally (decided at compile time) converts a list of values to their dual types + * + * @tparam dualify specify whether or not the input should be made into its dual type + * @tparam T the type of the values passed in + * @tparam n how many values were passed in + * @param x the values to be promoted + * @param output_ptr pointer to output array + * @param ctx the RAJA launch context used to synchronize threads and required by the RAJA API. + */ +template +SERAC_HOST_DEVICE void promote_each_to_dual_when(const tensor& x, void* output_ptr, + RAJA::LaunchContext ctx = RAJA::LaunchContext{}) +{ + if constexpr (dualify) { + RAJA::RangeSegment x_range(0, n); + using return_type = decltype(make_dual(T{})); + auto casted_output_ptr = static_cast*>(output_ptr); + RAJA::loop::threads_t>( + ctx, x_range, [&](int i) { (*casted_output_ptr)[i] = make_dual(x[i]); }); + } + if constexpr (!dualify) { + RAJA::RangeSegment x_range(0, n); + auto casted_output_ptr = static_cast*>(output_ptr); + RAJA::loop::threads_t>(ctx, x_range, + [&](int i) { (*casted_output_ptr)[i] = x[i]; }); } } @@ -438,7 +472,8 @@ SERAC_HOST_DEVICE constexpr auto linear_solve(const tensor& A, const te if constexpr (is_zero{}) { return x; - } else { + } + if constexpr (!is_zero{}) { return make_dual(x, dx); } } @@ -799,7 +834,7 @@ SERAC_HOST_DEVICE tensor argsort(const tensor& v) * @note based on "A robust algorithm for finding the eigenvalues and * eigenvectors of 3x3 symmetric matrices", by Scherzinger & Dohrmann */ -inline SERAC_HOST_DEVICE tuple eig_symm(const mat3& A) +inline tuple eig_symm(const mat3& A) { // We know of optimizations for this routine. When this becomes the // bottleneck, we can revisit. See OptimiSM for details. 
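The linear_solve hunk above and the symmetric_mat3_function hunk that follows both replace an if constexpr / else pair with two mutually exclusive if constexpr blocks. The standalone sketch below shows the resulting control-flow shape with toy scalar types in place of serac's dual numbers; the motivation is not stated in the patch, but the pattern keeps each compile-time branch textually independent.

    // Illustrative only: mirrors the branch structure adopted above, not serac code.
    template <bool rhs_is_zero>
    constexpr auto solve_like(double x, double dx)
    {
      if constexpr (rhs_is_zero) {
        return x;  // analogous to returning the undifferentiated solution
      }
      if constexpr (!rhs_is_zero) {
        return x + dx;  // stand-in for make_dual(x, dx)
      }
    }

    static_assert(solve_like<true>(1.0, 2.0) == 1.0);   // first branch taken
    static_assert(solve_like<false>(1.0, 2.0) == 3.0);  // second branch taken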
@@ -971,7 +1006,8 @@ auto symmetric_mat3_function(tensor A, const Function& f, const EigvalS if constexpr (!is_dual_number::value) { return f_A; - } else { + } + if constexpr (is_dual_number::value) { return symmetric_mat3_function_with_derivative(A, f_A, lambda, Q, g); } } diff --git a/src/serac/physics/materials/liquid_crystal_elastomer.hpp b/src/serac/physics/materials/liquid_crystal_elastomer.hpp index 17c30f82a..b57686e9f 100644 --- a/src/serac/physics/materials/liquid_crystal_elastomer.hpp +++ b/src/serac/physics/materials/liquid_crystal_elastomer.hpp @@ -255,7 +255,7 @@ struct LiquidCrystalElastomerBertoldi { * @return The calculated material response (Cauchy stress) for the material */ template - SERAC_HOST_DEVICE auto operator()(State& /*state*/, const tensor& displacement_grad, + SERAC_HOST_DEVICE auto operator()(const State& /*state*/, const tensor& displacement_grad, OrderParamType inst_order_param_tuple, GammaAngleType gamma_tuple, EtaAngleType eta_tuple) const { diff --git a/src/serac/physics/materials/parameterized_solid_material.hpp b/src/serac/physics/materials/parameterized_solid_material.hpp index f812c4b3a..c5f963d57 100644 --- a/src/serac/physics/materials/parameterized_solid_material.hpp +++ b/src/serac/physics/materials/parameterized_solid_material.hpp @@ -38,7 +38,7 @@ struct ParameterizedLinearIsotropicSolid { * @return The calculated material response (Cauchy stress) for the material */ template - SERAC_HOST_DEVICE auto operator()(State& /*state*/, const serac::tensor& du_dX, + SERAC_HOST_DEVICE auto operator()(const State& /*state*/, const serac::tensor& du_dX, const BulkType& DeltaK, const ShearType& DeltaG) const { constexpr auto I = Identity(); @@ -81,7 +81,7 @@ struct ParameterizedNeoHookeanSolid { * @return The calculated material response (Cauchy stress) for the material */ template - SERAC_HOST_DEVICE auto operator()(State& /*state*/, const serac::tensor& du_dX, + SERAC_HOST_DEVICE auto operator()(const State& /*state*/, const serac::tensor& du_dX, const BulkType& DeltaK, const ShearType& DeltaG) const { using std::log1p; diff --git a/src/serac/physics/materials/solid_material.hpp b/src/serac/physics/materials/solid_material.hpp index 1b0b2d7b8..5fac54b87 100644 --- a/src/serac/physics/materials/solid_material.hpp +++ b/src/serac/physics/materials/solid_material.hpp @@ -36,7 +36,7 @@ struct LinearIsotropic { * @return The Cauchy stress */ template - SERAC_HOST_DEVICE auto operator()(State& /* state */, const tensor& du_dX) const + SERAC_HOST_DEVICE auto operator()(const State& /* state */, const tensor& du_dX) const { auto I = Identity(); auto lambda = K - (2.0 / 3.0) * G; @@ -72,7 +72,7 @@ struct StVenantKirchhoff { * @return The Cauchy stress */ template - auto operator()(State&, const tensor& grad_u) const + auto operator()(const State&, const tensor& grad_u) const { static constexpr auto I = Identity(); auto F = grad_u + I; @@ -110,7 +110,7 @@ struct NeoHookean { * @return The Cauchy stress */ template - SERAC_HOST_DEVICE auto operator()(State& /* state */, const tensor& du_dX) const + SERAC_HOST_DEVICE auto operator()(const State& /* state */, const tensor& du_dX) const { using std::log1p; constexpr auto I = Identity(); diff --git a/src/serac/physics/tests/quasistatic_solid_adjoint.cpp b/src/serac/physics/tests/quasistatic_solid_adjoint.cpp index bf13b2868..5962f5686 100644 --- a/src/serac/physics/tests/quasistatic_solid_adjoint.cpp +++ b/src/serac/physics/tests/quasistatic_solid_adjoint.cpp @@ -22,7 +22,7 @@ struct ParameterizedLinearIsotropicSolid { 
using State = ::serac::Empty; ///< this material has no internal variables template - SERAC_HOST_DEVICE auto operator()(State&, const ::serac::tensor& u_grad, const T2& E_tuple, + SERAC_HOST_DEVICE auto operator()(const State&, const ::serac::tensor& u_grad, const T2& E_tuple, const T3& v_tuple) const { auto E = ::serac::get<0>(E_tuple); // Young's modulus VALUE @@ -40,7 +40,7 @@ struct ParameterizedNeoHookeanSolid { using State = ::serac::Empty; // this material has no internal variables template - SERAC_HOST_DEVICE auto operator()(State&, const ::serac::tensor& du_dX, const T2& E_tuple, + SERAC_HOST_DEVICE auto operator()(const State&, const ::serac::tensor& du_dX, const T2& E_tuple, const T3& v_tuple) const { using std::log1p;
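The hunks in quasistatic_solid_adjoint.cpp, like the material headers before them, make the same mechanical change: the state argument of the material response operator becomes const State&. Below is a toy functor in that style, loosely modeled on the LinearIsotropic response earlier in the patch; the struct name, the include, and the exact stress expression are illustrative assumptions, not serac's material model.

    #include "serac/physics/materials/solid_material.hpp"  // assumed: brings in serac::Empty, tensor utilities, SERAC_HOST_DEVICE

    struct ToyLinearMaterial {
      using State = serac::Empty;  // no internal variables, mirroring the materials above
      double K;                    // bulk modulus
      double G;                    // shear modulus

      template <typename T, int dim>
      SERAC_HOST_DEVICE auto operator()(const State& /* state */, const serac::tensor<T, dim, dim>& du_dX) const
      {
        auto I      = serac::Identity<dim>();
        auto lambda = K - (2.0 / 3.0) * G;
        auto strain = 0.5 * (du_dX + serac::transpose(du_dX));
        return lambda * serac::tr(strain) * I + 2.0 * G * strain;
      }
    };

Taking the state by const reference simply documents that these particular models never write to their internal-variable argument.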