diff --git a/.gitignore b/.gitignore index 6db11dcac..13f267d9d 100644 --- a/.gitignore +++ b/.gitignore @@ -13,5 +13,6 @@ *.orig __pycache__/ view +*.cache* /_serac_build_and_test* build-linux-*-*-* diff --git a/.gitlab/build_blueos.yml b/.gitlab/build_blueos.yml index eff30dd57..acca63b60 100644 --- a/.gitlab/build_blueos.yml +++ b/.gitlab/build_blueos.yml @@ -45,7 +45,7 @@ blueos-clang_10_0_1-src: HOST_CONFIG: "lassen-blueos_3_ppc64le_ib_p9-${COMPILER}_cuda.cmake" EXTRA_CMAKE_OPTIONS: "-DENABLE_BENCHMARKS=ON -DENABLE_DOCS=OFF" ALLOC_NODES: "1" - ALLOC_TIME: "30" + ALLOC_TIME: "45" extends: [.src_build_on_blueos, .with_cuda] # Note: to reduce duplication SPEC is not defined here, if we move to more than one diff --git a/.gitlab/build_toss4.yml b/.gitlab/build_toss4.yml index c4f83eabc..0b46b6e22 100644 --- a/.gitlab/build_toss4.yml +++ b/.gitlab/build_toss4.yml @@ -2,7 +2,7 @@ # This is the shared configuration of jobs for toss4 .on_toss4: variables: - SCHEDULER_PARAMETERS: "--res=ci --exclusive=user --deadline=now+1hour -N ${ALLOC_NODES} -t ${ALLOC_TIME} -A ${ALLOC_BANK}" + SCHEDULER_PARAMETERS: "--res=ci --exclusive=user --deadline=now+90minutes -N ${ALLOC_NODES} -t ${ALLOC_TIME} -A ${ALLOC_BANK}" tags: - batch - ruby @@ -40,7 +40,7 @@ toss4-clang_14_0_6-src: EXTRA_CMAKE_OPTIONS: "-DENABLE_BENCHMARKS=ON" DO_INTEGRATION_TESTS: "yes" ALLOC_NODES: "2" - ALLOC_TIME: "30" + ALLOC_TIME: "45" extends: .src_build_on_toss4 toss4-gcc_10_3_1-src: @@ -49,7 +49,7 @@ toss4-gcc_10_3_1-src: HOST_CONFIG: "ruby-toss_4_x86_64_ib-${COMPILER}.cmake" EXTRA_CMAKE_OPTIONS: "-DENABLE_BENCHMARKS=ON" ALLOC_NODES: "1" - ALLOC_TIME: "30" + ALLOC_TIME: "45" extends: .src_build_on_toss4 toss4-gcc_10_3_1-src-no-tribol: @@ -58,7 +58,7 @@ toss4-gcc_10_3_1-src-no-tribol: HOST_CONFIG: "ruby-toss_4_x86_64_ib-${COMPILER}.cmake" EXTRA_CMAKE_OPTIONS: "-DENABLE_BENCHMARKS=ON -UTRIBOL_DIR" ALLOC_NODES: "1" - ALLOC_TIME: "30" + ALLOC_TIME: "45" extends: .src_build_on_toss4 toss4-gcc_10_3_1-src-no-optional-solvers: @@ -67,7 +67,7 @@ toss4-gcc_10_3_1-src-no-optional-solvers: HOST_CONFIG: "ruby-toss_4_x86_64_ib-${COMPILER}.cmake" EXTRA_CMAKE_OPTIONS: "-DENABLE_BENCHMARKS=ON -USUNDIALS_DIR -UPETSC_DIR" ALLOC_NODES: "1" - ALLOC_TIME: "20" + ALLOC_TIME: "45" extends: .src_build_on_toss4 toss4-clang_14_0_6-full: diff --git a/src/serac/infrastructure/accelerator.cpp b/src/serac/infrastructure/accelerator.cpp index cdce5eba4..cfa670dd3 100644 --- a/src/serac/infrastructure/accelerator.cpp +++ b/src/serac/infrastructure/accelerator.cpp @@ -8,8 +8,6 @@ #include -#include "mfem.hpp" - #include "serac/infrastructure/logger.hpp" namespace serac { @@ -21,19 +19,23 @@ namespace { std::unique_ptr device; } // namespace -void initializeDevice() +void initializeDevice(ExecutionSpace exec) { SLIC_ERROR_ROOT_IF(device, "serac::accelerator::initializeDevice cannot be called more than once"); device = std::make_unique(); -#if defined(MFEM_USE_CUDA) && defined(SERAC_USE_CUDA_KERNEL_EVALUATION) - device->Configure("cuda"); + if (exec == ExecutionSpace::GPU) { +#if defined(MFEM_USE_CUDA) + device->Configure("cuda"); #endif + } } -void terminateDevice() +void terminateDevice(ExecutionSpace exec) { // Idempotent, no adverse affects if called multiple times - device.reset(); + if (exec == ExecutionSpace::GPU) { + device.reset(); + } } } // namespace accelerator diff --git a/src/serac/infrastructure/accelerator.hpp b/src/serac/infrastructure/accelerator.hpp index 180416d0d..b5c007d61 100644 --- a/src/serac/infrastructure/accelerator.hpp +++ 
b/src/serac/infrastructure/accelerator.hpp
@@ -12,7 +12,6 @@
  */
 #pragma once
-
 #if defined(__CUDACC__)
 #define SERAC_HOST_DEVICE __host__ __device__
 #define SERAC_HOST __host__
@@ -49,7 +48,7 @@
  */
 #define SERAC_SUPPRESS_NVCC_HOSTDEVICE_WARNING
 #endif
-
+#include "RAJA/RAJA.hpp"
 #include
 #include "axom/core.hpp"
@@ -72,6 +71,40 @@ enum class ExecutionSpace
 {
   Dynamic  // Corresponds to execution that can "legally" happen on either the host or device
 };
+/**
+ * @brief Helper struct used for deducing RAJA execution policies from serac::ExecutionSpace
+ */
+template
+struct EvaluationSpacePolicy;
+
+/// @overload
+template <>
+struct EvaluationSpacePolicy {
+  /// @brief Alias for loop policy.
+  using threads_t = RAJA::LoopPolicy;
+  /// @brief Alias for number of teams for CPU kernel launches.
+  using teams_t = RAJA::LoopPolicy;
+  /// @brief Alias for CPU kernel launch policy.
+  using launch_t = RAJA::LaunchPolicy;
+  /// @brief Alias for forall policy.
+  using forall_t = RAJA::seq_exec;
+};
+
+#if defined(__CUDACC__)
+/// @overload
+template <>
+struct EvaluationSpacePolicy {
+  /// @brief Alias for loop policy.
+  using threads_t = RAJA::LoopPolicy;
+  /// @brief Alias for number of teams for GPU kernel launches.
+  using teams_t = RAJA::LoopPolicy;
+  /// @brief Alias for GPU kernel launch policy.
+  using launch_t = RAJA::LaunchPolicy>;
+  /// @brief Alias for forall policy.
+  using forall_t = RAJA::cuda_exec<128>;
+};
+#endif
+
 /**
  * @brief The default execution space for serac builds
  */
@@ -88,25 +121,33 @@ struct execution_to_memory {
   static constexpr axom::MemorySpace value = axom::MemorySpace::Dynamic;
 };
-#ifdef SERAC_USE_UMPIRE
+/// @brief This helper is needed to suppress -Werror compilation errors caused by the
+/// explicit captures in the main execution lambdas.
+template
+SERAC_HOST_DEVICE void suppress_capture_warnings(T...)
+{ +} + /// @overload template <> struct execution_to_memory { + /// @brief axom memory space corresponding to ExecutionSpace static constexpr axom::MemorySpace value = axom::MemorySpace::Host; }; /// @overload template <> struct execution_to_memory { + /// @brief axom memory space corresponding to ExecutionSpace static constexpr axom::MemorySpace value = axom::MemorySpace::Device; }; /// @overload template <> struct execution_to_memory { + /// @brief axom memory space corresponding to ExecutionSpace static constexpr axom::MemorySpace value = axom::MemorySpace::Unified; }; -#endif /// @brief Helper template for @p execution_to_memory trait template @@ -121,7 +162,7 @@ void zero_out(axom::Array& arr) #ifdef __CUDACC__ /// @overload template -void zero_out(axom::Array>& arr) +void zero_out(axom::Array& arr) { cudaMemset(arr.data(), 0, static_cast(arr.size()) * sizeof(T)); } @@ -191,12 +232,12 @@ namespace accelerator { * * @note This function should only be called once */ -void initializeDevice(); +void initializeDevice(ExecutionSpace exec = ExecutionSpace::CPU); /** * @brief Cleans up the device, if applicable */ -void terminateDevice(); +void terminateDevice(ExecutionSpace exec = ExecutionSpace::CPU); #if defined(__CUDACC__) diff --git a/src/serac/infrastructure/debug_print.hpp b/src/serac/infrastructure/debug_print.hpp index 485f058af..edc618517 100644 --- a/src/serac/infrastructure/debug_print.hpp +++ b/src/serac/infrastructure/debug_print.hpp @@ -131,16 +131,18 @@ void printCUDAMemUsage() { int deviceCount = 0; cudaGetDeviceCount(&deviceCount); - int i = 0; - cudaSetDevice(i); + for (int i = 0; i < deviceCount; ++i) { + cudaSetDevice(i); - size_t freeBytes, totalBytes; - cudaMemGetInfo(&freeBytes, &totalBytes); - size_t usedBytes = totalBytes - freeBytes; + size_t freeBytes, totalBytes; + cudaMemGetInfo(&freeBytes, &totalBytes); + size_t usedBytes = totalBytes - freeBytes; - std::cout << "Device Number: " << i << std::endl; - std::cout << " Total Memory (MB): " << (totalBytes / 1024.0 / 1024.0) << std::endl; - std::cout << " Free Memory (MB): " << (freeBytes / 1024.0 / 1024.0) << std::endl; - std::cout << " Used Memory (MB): " << (usedBytes / 1024.0 / 1024.0) << std::endl; + std::cout << "Device Number: " << i << std::endl; + std::cout << " Total Memory (MB): " << (totalBytes / 1024.0 / 1024.0) << std::endl; + std::cout << " Free Memory (MB): " << (freeBytes / 1024.0 / 1024.0) << std::endl; + std::cout << " Used Memory (MB): " << (usedBytes / 1024.0 / 1024.0) << std::endl; + } + cudaSetDevice(0); } #endif diff --git a/src/serac/numerics/functional/CMakeLists.txt b/src/serac/numerics/functional/CMakeLists.txt index bd2f124f4..9d7f03e70 100644 --- a/src/serac/numerics/functional/CMakeLists.txt +++ b/src/serac/numerics/functional/CMakeLists.txt @@ -33,10 +33,9 @@ set(functional_headers tuple_tensor_dual_functions.hpp ) -set(functional_sources - domain.cpp - element_restriction.cpp - geometric_factors.cpp +set(functional_sources + domain.cpp + element_restriction.cpp quadrature_data.cpp) set(functional_detail_headers @@ -66,7 +65,7 @@ blt_list_append(TO functional_headers ELEMENTS ${functional_cuda_headers} IF ENA blt_add_library( NAME serac_functional - HEADERS ${functional_headers} ${functional_detail_headers} + HEADERS ${functional_headers} ${functional_detail_headers} SOURCES ${functional_sources} DEPENDS_ON ${functional_depends} ) diff --git a/src/serac/numerics/functional/boundary_integral_kernels.hpp b/src/serac/numerics/functional/boundary_integral_kernels.hpp index 
9af85ba8d..e896f60c4 100644 --- a/src/serac/numerics/functional/boundary_integral_kernels.hpp +++ b/src/serac/numerics/functional/boundary_integral_kernels.hpp @@ -5,12 +5,16 @@ // SPDX-License-Identifier: (BSD-3-Clause) #pragma once +#include #include +#include "serac/infrastructure/accelerator.hpp" #include "serac/serac_config.hpp" #include "serac/numerics/functional/quadrature_data.hpp" #include "serac/numerics/functional/differentiate_wrt.hpp" +#include "RAJA/RAJA.hpp" + namespace serac { namespace boundary_integral { @@ -156,7 +160,7 @@ SERAC_HOST_DEVICE auto batch_apply_qf(lambda qf, double t, const tensor void evaluation_kernel_impl(trial_element_type trial_elements, test_element, double t, const std::vector& inputs, double* outputs, const double* positions, @@ -165,43 +169,89 @@ void evaluation_kernel_impl(trial_element_type trial_elements, test_element, dou { // mfem provides this information as opaque arrays of doubles, // so we reinterpret the pointer with - constexpr int dim = dimension_of(geom) + 1; - constexpr int nqp = num_quadrature_points(geom, Q); - auto J = reinterpret_cast*>(jacobians); - auto x = reinterpret_cast*>(positions); - auto r = reinterpret_cast(outputs); - static constexpr TensorProductQuadratureRule rule{}; + constexpr int dim = dimension_of(geom) + 1; + constexpr int nqp = num_quadrature_points(geom, Q); + using X_Type = const tensor; + using J_Type = const tensor; + auto J = reinterpret_cast(jacobians); + auto x = reinterpret_cast(positions); + auto r = reinterpret_cast(outputs); + constexpr TensorProductQuadratureRule rule{}; - static constexpr int qpts_per_elem = num_quadrature_points(geom, Q); + constexpr int qpts_per_elem = num_quadrature_points(geom, Q); [[maybe_unused]] tuple u = { reinterpret_cast(trial_elements))::dof_type*>(inputs[indices])...}; + using interpolate_out_type = decltype(tuple{get(trial_elements).template interpolate_output_helper()...}); + using qf_inputs_type = decltype(tuple{promote_each_to_dual_when_output_helper( + get(trial_elements).template interpolate_output_helper())...}); + // Determine allocation destination based on exec parameter + std::string device_name = exec == ExecutionSpace::GPU ? 
"DEVICE" : "HOST"; + auto& rm = umpire::ResourceManager::getInstance(); + auto allocator = rm.getAllocator(device_name); + // Perform allocations + qf_inputs_type* qf_inputs = static_cast(allocator.allocate(sizeof(qf_inputs_type) * num_elements)); + interpolate_out_type* interpolate_result = + static_cast(allocator.allocate(sizeof(interpolate_out_type) * num_elements)); + // Zero out memory + rm.memset(qf_inputs, 0); + rm.memset(interpolate_result, 0); + + auto e_range = RAJA::TypedRangeSegment(0, num_elements); // for each element in the domain - for (uint32_t e = 0; e < num_elements; e++) { - // load the jacobians and positions for each quadrature point in this element - auto J_e = J[e]; - auto x_e = x[e]; - - // batch-calculate values / derivatives of each trial space, at each quadrature point - [[maybe_unused]] tuple qf_inputs = {promote_each_to_dual_when( - get(trial_elements).interpolate(get(u)[elements[e]], rule))...}; - - // (batch) evalute the q-function at each quadrature point - auto qf_outputs = batch_apply_qf(qf, t, x_e, J_e, get(qf_inputs)...); - - // write out the q-function derivatives after applying the - // physical_to_parent transformation, so that those transformations - // won't need to be applied in the action_of_gradient and element_gradient kernels - if constexpr (differentiation_index != serac::NO_DIFFERENTIATION) { - for (int q = 0; q < leading_dimension(qf_outputs); q++) { - qf_derivatives[e * qpts_per_elem + uint32_t(q)] = get_gradient(qf_outputs[q]); - } - } - - // (batch) integrate the material response against the test-space basis functions - test_element::integrate(get_value(qf_outputs), rule, &r[elements[e]]); - } + RAJA::launch::launch_t>( + RAJA::LaunchParams(RAJA::Teams(static_cast(num_elements)), RAJA::Threads(BLOCK_SZ)), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + RAJA::loop::teams_t>( + ctx, e_range, + // The explicit capture list is needed here because the capture occurs in a function + // template with a variadic non-type parameter. + [&ctx, t, J, x, u, qf, trial_elements, qpts_per_elem, rule, r, elements, qf_derivatives, qf_inputs, + interpolate_result](uint32_t e) { + // These casts are needed to suppres -Werror compilation errors + // caused by the explicit capture above. 
+ (void)qpts_per_elem; + detail::suppress_capture_warnings(qf_derivatives, qpts_per_elem, trial_elements, qf_inputs, + interpolate_result, u); + + // batch-calculate values / derivatives of each trial space, at each quadrature point + (get(trial_elements) + .interpolate(get(u)[elements[e]], rule, &get(interpolate_result[e]), ctx), + ...); + + ctx.teamSync(); + + (promote_each_to_dual_when(get(interpolate_result[e]), + &get(qf_inputs[e]), ctx), + ...); + + ctx.teamSync(); + + // (batch) evalute the q-function at each quadrature point + RAJA_TEAM_SHARED decltype(batch_apply_qf(qf, t, x[e], J[e], get(qf_inputs[e])...)) qf_outputs; + qf_outputs = batch_apply_qf(qf, t, x[e], J[e], get(qf_inputs[e])...); + + ctx.teamSync(); + + // write out the q-function derivatives after applying the + // physical_to_parent transformation, so that those transformations + // won't need to be applied in the action_of_gradient and element_gradient kernels + if constexpr (differentiation_index != serac::NO_DIFFERENTIATION) { + RAJA::RangeSegment x_range(0, leading_dimension(qf_outputs)); + RAJA::loop::threads_t>(ctx, x_range, [&](int q) { + qf_derivatives[e * qpts_per_elem + uint32_t(q)] = get_gradient(qf_outputs[q]); + }); + } + + ctx.teamSync(); + + // (batch) integrate the material response against the test-space basis functions + test_element::integrate(get_value(qf_outputs), rule, &r[elements[e]], ctx); + }); + }); + allocator.deallocate(qf_inputs); + allocator.deallocate(interpolate_result); } //clang-format off @@ -213,14 +263,14 @@ SERAC_HOST_DEVICE auto chain_rule(const S& dfdx, const T& dx) } //clang-format on -template -SERAC_HOST_DEVICE auto batch_apply_chain_rule(derivative_type* qf_derivatives, const tensor& inputs) +template +SERAC_HOST_DEVICE auto batch_apply_chain_rule( + derivative_type* qf_derivatives, const tensor& inputs, + tensor, n>& outputs, const RAJA::LaunchContext& ctx) { - using return_type = decltype(chain_rule(derivative_type{}, T{})); - tensor, n> outputs{}; - for (int i = 0; i < n; i++) { - get<0>(outputs[i]) = chain_rule(qf_derivatives[i], inputs[i]); - } + RAJA::RangeSegment i_range(0, n); + RAJA::loop::threads_t>( + ctx, i_range, [&](int i) { get<0>(outputs[i]) = chain_rule(qf_derivatives[i], inputs[i]); }); return outputs; } @@ -249,31 +299,53 @@ SERAC_HOST_DEVICE auto batch_apply_chain_rule(derivative_type* qf_derivatives, c * @see mfem::GeometricFactors * @param[in] num_elements The number of elements in the mesh */ -template +template void action_of_gradient_kernel(const double* dU, double* dR, derivatives_type* qf_derivatives, const int* elements, std::size_t num_elements) { - using test_element = finite_element; - using trial_element = finite_element; + using test_element = finite_element; + using trial_element = finite_element; + using qf_inputs_type = decltype(trial_element::template interpolate_output_helper()); // mfem provides this information in 1D arrays, so we reshape it // into strided multidimensional arrays before using - constexpr int nqp = num_quadrature_points(geom, Q); - auto du = reinterpret_cast(dU); - auto dr = reinterpret_cast(dR); - static constexpr TensorProductQuadratureRule rule{}; + constexpr int nqp = num_quadrature_points(geom, Q); + std::string device_name = exec == ExecutionSpace::GPU ? 
"DEVICE" : "HOST"; + constexpr TensorProductQuadratureRule rule{}; + + auto du = reinterpret_cast(dU); + auto dr = reinterpret_cast(dR); + auto e_range = RAJA::TypedRangeSegment(0, num_elements); + + auto& rm = umpire::ResourceManager::getInstance(); + auto allocator = rm.getAllocator(device_name); + qf_inputs_type* qf_inputs = static_cast(allocator.allocate(sizeof(qf_inputs_type) * num_elements)); + rm.memset(qf_inputs, 0); + // This typedef is needed to declare qf_outputs in shared memory. + using qf_outputs_type = + tensor, nqp>; // for each element in the domain - for (uint32_t e = 0; e < num_elements; e++) { - // (batch) interpolate each quadrature point's value - auto qf_inputs = trial_element::interpolate(du[elements[e]], rule); - - // (batch) evalute the q-function at each quadrature point - auto qf_outputs = batch_apply_chain_rule(qf_derivatives + e * nqp, qf_inputs); - - // (batch) integrate the material response against the test-space basis functions - test_element::integrate(qf_outputs, rule, &dr[elements[e]]); - } + RAJA::launch::launch_t>( + RAJA::LaunchParams(RAJA::Teams(static_cast(num_elements)), RAJA::Threads(BLOCK_SZ)), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + RAJA::loop::teams_t>(ctx, e_range, [&](int e) { + // (batch) interpolate each quadrature point's value + trial_element::interpolate(du[elements[e]], rule, &(qf_inputs[e]), ctx); + ctx.teamSync(); + + // (batch) evalute the q-function at each quadrature point + RAJA_TEAM_SHARED qf_outputs_type qf_outputs; + batch_apply_chain_rule(qf_derivatives + e * nqp, qf_inputs[e], qf_outputs, ctx); + ctx.teamSync(); + + // (batch) integrate the material response against the test-space basis functions + test_element::integrate(qf_outputs, rule, &dr[elements[e]], ctx); + ctx.teamSync(); + }); + }); + rm.deallocate(qf_inputs); } /** @@ -297,65 +369,86 @@ void action_of_gradient_kernel(const double* dU, double* dR, derivatives_type* q * @see mfem::GeometricFactors * @param[in] num_elements The number of elements in the mesh */ -template -void element_gradient_kernel(ExecArrayView dK, derivatives_type* qf_derivatives, - const int* elements, std::size_t num_elements) +template +void element_gradient_kernel(ExecArrayView dK, derivatives_type* qf_derivatives, const int* elements, + std::size_t num_elements) { - using test_element = finite_element; - using trial_element = finite_element; + using test_element = finite_element; + using trial_element = finite_element; - constexpr int nquad = num_quadrature_points(g, Q); + std::string device_name = exec == ExecutionSpace::GPU ? 
"DEVICE" : "HOST"; + constexpr bool is_QOI = test::family == Family::QOI; + using padded_derivative_type [[maybe_unused]] = + std::conditional_t, derivatives_type>; - static constexpr TensorProductQuadratureRule rule{}; + RAJA::TypedRangeSegment elements_range(0, num_elements); + constexpr int nquad = num_quadrature_points(g, Q); + constexpr TensorProductQuadratureRule rule{}; // for each element in the domain - for (uint32_t e = 0; e < num_elements; e++) { - auto* output_ptr = reinterpret_cast(&dK(elements[e], 0, 0)); - - tensor derivatives{}; - for (int q = 0; q < nquad; q++) { - derivatives(q) = qf_derivatives[e * nquad + uint32_t(q)]; - } - - for (int J = 0; J < trial_element::ndof; J++) { - auto source_and_flux = trial_element::batch_apply_shape_fn(J, derivatives, rule); - test_element::integrate(source_and_flux, rule, output_ptr + J, trial_element::ndof); - } - } + RAJA::launch::launch_t>( + RAJA::LaunchParams(RAJA::Teams(static_cast(num_elements)), RAJA::Threads(BLOCK_SZ)), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + RAJA::loop::teams_t>( + ctx, elements_range, [&ctx, dK, elements, qf_derivatives, nquad, rule](uint32_t e) { + (void)nquad; + auto* output_ptr = reinterpret_cast(&dK(elements[e], 0, 0)); + + RAJA_TEAM_SHARED tensor derivatives; + RAJA::RangeSegment x_range(0, nquad); + RAJA::loop::threads_t>( + ctx, x_range, [&](int q) { derivatives(q) = qf_derivatives[e * nquad + uint32_t(q)]; }); + + ctx.teamSync(); + + RAJA_TEAM_SHARED + typename trial_element::template batch_apply_shape_fn_output::type source_and_flux; + for (int J = 0; J < trial_element::ndof; J++) { + trial_element::batch_apply_shape_fn(J, derivatives, &source_and_flux, rule, ctx); + ctx.teamSync(); + + test_element::integrate(source_and_flux, rule, output_ptr + J, ctx, trial_element::ndof); + ctx.teamSync(); + } + }); + }); } -template auto evaluation_kernel(signature s, lambda_type qf, const double* positions, const double* jacobians, std::shared_ptr qf_derivatives, const int* elements, uint32_t num_elements) { - auto trial_elements = trial_elements_tuple(s); - auto test_element = get_test_element(s); + auto trial_elements = trial_elements_tuple(s); + auto test_element = get_test_element(s); return [=](double time, const std::vector& inputs, double* outputs, bool /* update state */) { - evaluation_kernel_impl(trial_elements, test_element, time, inputs, outputs, positions, jacobians, qf, - qf_derivatives.get(), elements, num_elements, s.index_seq); + evaluation_kernel_impl(trial_elements, test_element, time, inputs, outputs, positions, + jacobians, qf, qf_derivatives.get(), elements, num_elements, + s.index_seq); }; } -template +template std::function jacobian_vector_product_kernel( signature, std::shared_ptr qf_derivatives, const int* elements, uint32_t num_elements) { return [=](const double* du, double* dr) { using test_space = typename signature::return_type; using trial_space = typename std::tuple_element::type; - action_of_gradient_kernel(du, dr, qf_derivatives.get(), elements, num_elements); + action_of_gradient_kernel(du, dr, qf_derivatives.get(), elements, + num_elements); }; } -template -std::function)> element_gradient_kernel( +template +std::function)> element_gradient_kernel( signature, std::shared_ptr qf_derivatives, const int* elements, uint32_t num_elements) { - return [=](ExecArrayView K_elem) { + return [=](ExecArrayView K_elem) { using test_space = typename signature::return_type; using trial_space = typename std::tuple_element::type; - element_gradient_kernel(K_elem, 
qf_derivatives.get(), elements, num_elements); + element_gradient_kernel(K_elem, qf_derivatives.get(), elements, + num_elements); }; } diff --git a/src/serac/numerics/functional/detail/hexahedron_H1.inl b/src/serac/numerics/functional/detail/hexahedron_H1.inl index c28b15fa0..10a538368 100644 --- a/src/serac/numerics/functional/detail/hexahedron_H1.inl +++ b/src/serac/numerics/functional/detail/hexahedron_H1.inl @@ -10,14 +10,18 @@ * @brief Specialization of finite_element for H1 on hexahedron geometry */ +#include "RAJA/RAJA.hpp" +#include "serac/infrastructure/accelerator.hpp" +#include "serac/numerics/functional/tensor.hpp" + // this specialization defines shape functions (and their gradients) that // interpolate at Gauss-Lobatto nodes for the appropriate polynomial order // // note: mfem assumes the parent element domain is [0,1]x[0,1]x[0,1] // for additional information on the finite_element concept requirements, see finite_element.hpp /// @cond -template -struct finite_element > { +template +struct finite_element, exec> { static constexpr auto geometry = mfem::Geometry::CUBE; static constexpr auto family = Family::H1; static constexpr int components = c; @@ -31,11 +35,18 @@ struct finite_element > { using dof_type = tensor; - using value_type = typename std::conditional >::type; + using value_type = typename std::conditional>::type; using derivative_type = - typename std::conditional, tensor >::type; + typename std::conditional, tensor>::type; using qf_input_type = tuple; + template + struct batch_apply_shape_fn_output { + using source_t = decltype(get<0>(get<0>(in_t{})) + dot(get<1>(get<0>(in_t{})), tensor{})); + using flux_t = decltype(get<0>(get<1>(in_t{})) + dot(get<1>(get<1>(in_t{})), tensor{})); + using type = tensor, q * q * q>; + }; + SERAC_HOST_DEVICE static constexpr tensor shape_functions(tensor xi) { auto N_xi = GaussLobattoInterpolation

(xi[0]); @@ -72,7 +83,7 @@ struct finite_element > { for (int j = 0; j < p + 1; j++) { for (int i = 0; i < p + 1; i++) { dN[count++] = { - dN_xi[i] * N_eta[j] * N_zeta[k], + dN_xi[i] * N_eta[j] * N_zeta[k], N_xi[i] * dN_eta[j] * N_zeta[k], N_xi[i] * N_eta[j] * dN_zeta[k] }; @@ -130,7 +141,9 @@ struct finite_element > { } template - static auto batch_apply_shape_fn(int j, tensor input, const TensorProductQuadratureRule&) + static void RAJA_HOST_DEVICE batch_apply_shape_fn(int j, tensor input, + typename batch_apply_shape_fn_output::type* output, + const TensorProductQuadratureRule&, RAJA::LaunchContext ctx) { static constexpr bool apply_weights = false; static constexpr auto B = calculate_B(); @@ -140,34 +153,34 @@ struct finite_element > { int jy = (j % (n * n)) / n; int jz = j / (n * n); - using source_t = decltype(get<0>(get<0>(in_t{})) + dot(get<1>(get<0>(in_t{})), tensor{})); - using flux_t = decltype(get<0>(get<1>(in_t{})) + dot(get<1>(get<1>(in_t{})), tensor{})); - - tensor, q * q * q> output; + auto x_range = RAJA::RangeSegment(0, q * q * q); + RAJA::loop::threads_t>(ctx, x_range, [&](int Q) { + int qx = Q % q; + int qy = (Q / q) % q; + int qz = (Q / (q * q)); - for (int qz = 0; qz < q; qz++) { - for (int qy = 0; qy < q; qy++) { - for (int qx = 0; qx < q; qx++) { - double phi_j = B(qx, jx) * B(qy, jy) * B(qz, jz); - tensor dphi_j_dxi = {G(qx, jx) * B(qy, jy) * B(qz, jz), B(qx, jx) * G(qy, jy) * B(qz, jz), - B(qx, jx) * B(qy, jy) * G(qz, jz)}; + double phi_j = B(qx, jx) * B(qy, jy) * B(qz, jz); + tensor dphi_j_dxi = {G(qx, jx) * B(qy, jy) * B(qz, jz), B(qx, jx) * G(qy, jy) * B(qz, jz), + B(qx, jx) * B(qy, jy) * G(qz, jz)}; - int Q = (qz * q + qy) * q + qx; - auto& d00 = get<0>(get<0>(input(Q))); - auto& d01 = get<1>(get<0>(input(Q))); - auto& d10 = get<0>(get<1>(input(Q))); - auto& d11 = get<1>(get<1>(input(Q))); + auto& d00 = get<0>(get<0>(input(Q))); + auto& d01 = get<1>(get<0>(input(Q))); + auto& d10 = get<0>(get<1>(input(Q))); + auto& d11 = get<1>(get<1>(input(Q))); - output[Q] = {d00 * phi_j + dot(d01, dphi_j_dxi), d10 * phi_j + dot(d11, dphi_j_dxi)}; - } - } - } + (*output)[Q] = {d00 * phi_j + dot(d01, dphi_j_dxi), d10 * phi_j + dot(d11, dphi_j_dxi)}; + }); + } - return output; + template + static auto interpolate_output_helper() + { + return tensor{}; } template - SERAC_HOST_DEVICE static auto interpolate(const dof_type& X, const TensorProductQuadratureRule&) + SERAC_HOST_DEVICE static void interpolate(const dof_type& X, const TensorProductQuadratureRule&, + tensor* output_ptr, RAJA::LaunchContext ctx) { // we want to compute the following: // @@ -176,7 +189,7 @@ struct finite_element > { // where // X_q(u, v, w) are the quadrature-point values at position {u, v, w}, // B(u, i) is the i^{th} 1D interpolation/differentiation (shape) function, - // evaluated at the u^{th} 1D quadrature point, and + // evaluated at the u^{th} 1D quadrature point, andintegrate // X_e(i, j, k) are the values at node {i, j, k} to be interpolated // // this algorithm carries out the above calculation in 3 steps: @@ -184,97 +197,224 @@ struct finite_element > { // A1(dz, dy, qx) := B(qx, dx) * X_e(dz, dy, dx) // A2(dz, qy, qx) := B(qy, dy) * A1(dz, dy, qx) // X_q(qz, qy, qx) := B(qz, dz) * A2(dz, qy, qx) + using threads_t = typename EvaluationSpacePolicy::threads_t; static constexpr bool apply_weights = false; - static constexpr auto B = calculate_B(); - static constexpr auto G = calculate_G(); - - tensor value{}; - tensor gradient{}; + RAJA::RangeSegment x_range(0, BLOCK_SZ); + RAJA_TEAM_SHARED 
tensor value; + RAJA_TEAM_SHARED tensor gradient; + constexpr auto B = calculate_B(); + constexpr auto G = calculate_G(); + RAJA::loop(ctx, x_range, [&](int tid) { + int qx = tid % BLOCK_X; + int qy = (tid / BLOCK_X) % BLOCK_Y; + int qz = tid / (BLOCK_X * BLOCK_Y); + + memset_tensor(value, 0.0, qx, qy, qz); + memset_tensor(gradient, 0.0, qx, qy, qz); + }); + ctx.teamSync(); for (int i = 0; i < c; i++) { - auto A10 = contract<2, 1>(X[i], B); - auto A11 = contract<2, 1>(X[i], G); - - auto A20 = contract<1, 1>(A10, B); - auto A21 = contract<1, 1>(A11, B); - auto A22 = contract<1, 1>(A10, G); - - value(i) = contract<0, 1>(A20, B); - gradient(i, 0) = contract<0, 1>(A21, B); - gradient(i, 1) = contract<0, 1>(A22, B); - gradient(i, 2) = contract<0, 1>(A20, G); + RAJA_TEAM_SHARED decltype(deduce_contract_return_type<2, 1>(X[i], B)) A10; + RAJA_TEAM_SHARED decltype(deduce_contract_return_type<2, 1>(X[i], G)) A11; + RAJA_TEAM_SHARED decltype(deduce_contract_return_type<1, 1>(A10, B)) A20; + RAJA_TEAM_SHARED decltype(deduce_contract_return_type<1, 1>(A11, B)) A21; + RAJA_TEAM_SHARED decltype(deduce_contract_return_type<1, 1>(A10, G)) A22; + + RAJA::loop(ctx, x_range, [&](int tid) { + int qx = tid % BLOCK_X; + int qy = (tid / BLOCK_X) % BLOCK_Y; + int qz = tid / (BLOCK_X * BLOCK_Y); + memset_tensor(A20, 0.0, qx, qy, qz); + memset_tensor(A21, 0.0, qx, qy, qz); + memset_tensor(A22, 0.0, qx, qy, qz); + memset_tensor(A10, 0.0, qx, qy, qz); + memset_tensor(A11, 0.0, qx, qy, qz); + ctx.teamSync(); + }); + RAJA::loop(ctx, x_range, [&](int tid) { + int qx = tid % BLOCK_X; + int qy = (tid / BLOCK_X) % BLOCK_Y; + int qz = tid / (BLOCK_X * BLOCK_Y); + // Perform actual contractions + contract<2, 1>(X[i], B, &A10, qx, qy, qz); + contract<2, 1>(X[i], G, &A11, qx, qy, qz); + ctx.teamSync(); + }); + + RAJA::loop(ctx, x_range, [&](int tid) { + int qx = tid % BLOCK_X; + int qy = (tid / BLOCK_X) % BLOCK_Y; + int qz = tid / (BLOCK_X * BLOCK_Y); + + contract<1, 1>(A10, B, &A20, qx, qy, qz); + contract<1, 1>(A11, B, &A21, qx, qy, qz); + contract<1, 1>(A10, G, &A22, qx, qy, qz); + ctx.teamSync(); + }); + + RAJA::loop(ctx, x_range, [&](int tid) { + int qx = tid % BLOCK_X; + int qy = (tid / BLOCK_X) % BLOCK_Y; + int qz = tid / (BLOCK_X * BLOCK_Y); + + contract<0, 1>(A20, B, &value(i), qx, qy, qz); + contract<0, 1>(A21, B, &gradient(i, 0), qx, qy, qz); + contract<0, 1>(A22, B, &gradient(i, 1), qx, qy, qz); + contract<0, 1>(A20, G, &gradient(i, 2), qx, qy, qz); + ctx.teamSync(); + }); } // transpose the quadrature data into a flat tensor of tuples + + RAJA_TEAM_SHARED union { - tensor one_dimensional; - tensor, tensor >, q, q, q> three_dimensional; + tensor one_dimensional; + tensor, tensor>, q, q, q> three_dimensional; } output; - for (int qz = 0; qz < q; qz++) { - for (int qy = 0; qy < q; qy++) { - for (int qx = 0; qx < q; qx++) { - for (int i = 0; i < c; i++) { - get(output.three_dimensional(qz, qy, qx))[i] = value(i, qz, qy, qx); - for (int j = 0; j < dim; j++) { - get(output.three_dimensional(qz, qy, qx))[i][j] = gradient(i, j, qz, qy, qx); - } - } + RAJA::loop(ctx, x_range, [&](int tid) { + if (tid >= q * q * q) { + return; + } + int qx = tid % q; + int qy = (tid / q) % q; + int qz = tid / (q * q); + + for (int i = 0; i < c; i++) { + get(output.three_dimensional(qz, qy, qx))[i] = value(i, qz, qy, qx); + for (int j = 0; j < dim; j++) { + get(output.three_dimensional(qz, qy, qx))[i][j] = gradient(i, j, qz, qy, qx); } } + }); + ctx.teamSync(); + if (output_ptr) { + RAJA::loop(ctx, x_range, [&](int tid) { + if (tid < 
serac::size(output.one_dimensional)) { + ((*output_ptr))[tid] = output.one_dimensional[tid]; + } + }); } - - return output.one_dimensional; } template SERAC_HOST_DEVICE static void integrate(const tensor, q * q * q>& qf_output, const TensorProductQuadratureRule&, dof_type* element_residual, - int step = 1) + RAJA::LaunchContext ctx, int step = 1) { if constexpr (is_zero{} && is_zero{}) { return; } constexpr int ntrial = std::max(size(source_type{}), size(flux_type{}) / dim) / c; + using threads_t = typename EvaluationSpacePolicy::threads_t; + using s_buffer_type = std::conditional_t{}, zero, tensor>; + using f_buffer_type = std::conditional_t{}, zero, tensor>; - using s_buffer_type = std::conditional_t{}, zero, tensor >; - using f_buffer_type = std::conditional_t{}, zero, tensor >; + /*static*/ constexpr bool apply_weights = true; - static constexpr bool apply_weights = true; - static constexpr auto B = calculate_B(); - static constexpr auto G = calculate_G(); + RAJA::RangeSegment x_range(0, BLOCK_SZ); + + constexpr auto B = calculate_B(); + constexpr auto G = calculate_G(); + RAJA_TEAM_SHARED s_buffer_type source; + RAJA_TEAM_SHARED f_buffer_type flux; + + // The binary is_zero_and type will select which of its binary operands is nonzero, and return it + using t1 = decltype(deduce_contract_return_type<2, 0>(source, B)); + using t2 = decltype(deduce_contract_return_type<2, 0>(flux(0), G)); + + RAJA_TEAM_SHARED typename is_zero_and::type A20; + RAJA_TEAM_SHARED decltype(deduce_contract_return_type<2, 0>(flux(1), B)) A21; + RAJA_TEAM_SHARED decltype(deduce_contract_return_type<2, 0>(flux(2), B)) A22; + + using t3 = decltype(deduce_contract_return_type<1, 0>(A20, B)); + using t4 = decltype(deduce_contract_return_type<1, 0>(A21, G)); + RAJA_TEAM_SHARED typename is_zero_and::type A10; + RAJA_TEAM_SHARED decltype(deduce_contract_return_type<1, 0>(A22, B)) A11; for (int j = 0; j < ntrial; j++) { for (int i = 0; i < c; i++) { - s_buffer_type source; - f_buffer_type flux; - - for (int qz = 0; qz < q; qz++) { - for (int qy = 0; qy < q; qy++) { - for (int qx = 0; qx < q; qx++) { - int Q = (qz * q + qy) * q + qx; - if constexpr (!is_zero{}) { - source(qz, qy, qx) = reinterpret_cast(&get(qf_output[Q]))[i * ntrial + j]; - } - if constexpr (!is_zero{}) { - for (int k = 0; k < dim; k++) { - flux(k, qz, qy, qx) = - reinterpret_cast(&get(qf_output[Q]))[(i * dim + k) * ntrial + j]; - } - } - } + ctx.teamSync(); + RAJA::loop(ctx, x_range, [&](int tid) { + if (tid >= q * q * q) { + return; + } + int qx = tid % q; + int qy = (tid / q) % q; + int qz = tid / (q * q); + if constexpr (!is_zero{}) { + source(qz, qy, qx) = reinterpret_cast(&get(qf_output[tid]))[i * ntrial + j]; } + }); + for (int k = 0; k < dim; k++) { + ctx.teamSync(); + RAJA::loop(ctx, x_range, [&](int tid) { + if (tid >= q * q * q) { + return; + } + int qx = tid % q; + int qy = (tid / q) % q; + int qz = tid / (q * q); + int Q = (qz * q + qy) * q + qx; + if constexpr (!is_zero{}) { + flux(k, qz, qy, qx) = + reinterpret_cast(&get(qf_output[Q]))[(i * dim + k) * ntrial + j]; + } + }); } - - auto A20 = contract<2, 0>(source, B) + contract<2, 0>(flux(0), G); - auto A21 = contract<2, 0>(flux(1), B); - auto A22 = contract<2, 0>(flux(2), B); - - auto A10 = contract<1, 0>(A20, B) + contract<1, 0>(A21, G); - auto A11 = contract<1, 0>(A22, B); - - element_residual[j * step](i) += contract<0, 0>(A10, B) + contract<0, 0>(A11, G); + ctx.teamSync(); + + RAJA::loop(ctx, x_range, [&](int tid) { + int qx = tid % BLOCK_X; + int qy = (tid / BLOCK_X) % BLOCK_Y; + int 
qz = tid / (BLOCK_X * BLOCK_Y); + + memset_tensor(A20, 0.0, qx, qy, qz); + memset_tensor(A21, 0.0, qx, qy, qz); + memset_tensor(A22, 0.0, qx, qy, qz); + memset_tensor(A10, 0.0, qx, qy, qz); + memset_tensor(A11, 0.0, qx, qy, qz); + ctx.teamSync(); + }); + + RAJA::loop(ctx, x_range, [&](int tid) { + int qx = tid % BLOCK_X; + int qy = (tid / BLOCK_X) % BLOCK_Y; + int qz = tid / (BLOCK_X * BLOCK_Y); + contract<2, 0>(source, B, &A20, qx, qy, qz); + ctx.teamSync(); + contract<2, 0>(flux(0), G, &A20, qx, qy, qz, true); + ctx.teamSync(); + + contract<2, 0>(flux(1), B, &A21, qx, qy, qz); + contract<2, 0>(flux(2), B, &A22, qx, qy, qz); + ctx.teamSync(); + }); + + RAJA::loop(ctx, x_range, [&](int tid) { + int qx = tid % BLOCK_X; + int qy = (tid / BLOCK_X) % BLOCK_Y; + int qz = tid / (BLOCK_X * BLOCK_Y); + contract<1, 0>(A21, G, &A10, qx, qy, qz); + ctx.teamSync(); + contract<1, 0>(A20, B, &A10, qx, qy, qz, true); + ctx.teamSync(); + contract<1, 0>(A22, B, &A11, qx, qy, qz); + ctx.teamSync(); + }); + + RAJA::loop(ctx, x_range, [&](int tid) { + int qx = tid % BLOCK_X; + int qy = (tid / BLOCK_X) % BLOCK_Y; + int qz = tid / (BLOCK_X * BLOCK_Y); + contract<0, 0>(A10, B, &(element_residual[j * step](i)), qx, qy, qz, true); + ctx.teamSync(); + contract<0, 0>(A11, G, &(element_residual[j * step](i)), qx, qy, qz, true); + ctx.teamSync(); + }); } } } @@ -282,7 +422,7 @@ struct finite_element > { #if 0 template - static SERAC_DEVICE auto interpolate(const dof_type& X, const tensor& J, + static SERAC_DEVICE void interpolate(const dof_type& X, const tensor& J, const TensorProductQuadratureRule& rule, cache_type& cache) { // we want to compute the following: diff --git a/src/serac/numerics/functional/detail/hexahedron_Hcurl.inl b/src/serac/numerics/functional/detail/hexahedron_Hcurl.inl index 36fdfd3d2..a4c3a8859 100644 --- a/src/serac/numerics/functional/detail/hexahedron_Hcurl.inl +++ b/src/serac/numerics/functional/detail/hexahedron_Hcurl.inl @@ -10,6 +10,8 @@ * @brief Specialization of finite_element for Hcurl on hexahedron geometry */ +#include "RAJA/RAJA.hpp" + // this specialization defines shape functions (and their curls) that // interpolate at Gauss-Lobatto nodes for closed intervals, and Gauss-Legendre // nodes for open intervals. 
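The element specializations and integral kernels in this patch all follow the same team-per-element RAJA pattern: one team per element, BLOCK_SZ threads per team, team-shared scratch storage, and ctx.teamSync() between stages. A minimal host-only sketch of that pattern is shown below; example_kernel, BLOCK, and the sequential policy aliases are illustrative placeholders (not the policies EvaluationSpacePolicy actually selects), shown only to make the launch/loop/teamSync structure concrete.

    #include "RAJA/RAJA.hpp"

    // Sequential stand-ins for the policies an EvaluationSpacePolicy-like trait would provide.
    using launch_policy  = RAJA::LaunchPolicy<RAJA::seq_launch_t>;
    using teams_policy   = RAJA::LoopPolicy<RAJA::seq_exec>;
    using threads_policy = RAJA::LoopPolicy<RAJA::seq_exec>;

    constexpr int BLOCK = 64;  // stand-in for BLOCK_SZ

    void example_kernel(double* per_element_sums, const double* qp_values, int num_elements)
    {
      RAJA::launch<launch_policy>(
          RAJA::LaunchParams(RAJA::Teams(num_elements), RAJA::Threads(BLOCK)),
          [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) {
            // one team per element
            RAJA::loop<teams_policy>(ctx, RAJA::RangeSegment(0, num_elements), [&](int e) {
              RAJA_TEAM_SHARED double scratch[BLOCK];  // visible to every thread in the team

              // stage 1: each thread stages one quadrature-point value
              RAJA::loop<threads_policy>(ctx, RAJA::RangeSegment(0, BLOCK), [&](int tid) {
                scratch[tid] = qp_values[e * BLOCK + tid];
              });

              ctx.teamSync();  // all of scratch must be written before it is read

              // stage 2: one thread reduces the shared buffer and writes the result
              RAJA::loop<threads_policy>(ctx, RAJA::RangeSegment(0, 1), [&](int) {
                double sum = 0.0;
                for (int t = 0; t < BLOCK; ++t) sum += scratch[t];
                per_element_sums[e] = sum;
              });

              ctx.teamSync();
            });
          });
    }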
@@ -19,8 +21,8 @@ // see quadrilateral_hcurl.inl for more information // for additional information on the finite_element concept requirements, see finite_element.hpp /// @cond -template -struct finite_element> { +template +struct finite_element, exec> { static constexpr auto geometry = mfem::Geometry::CUBE; static constexpr auto family = Family::HCURL; static constexpr int dim = 3; @@ -35,6 +37,14 @@ struct finite_element> { using residual_type = typename std::conditional, tensor>::type; + template + struct batch_apply_shape_fn_output { + using source_t = decltype(dot(get<0>(get<0>(in_t{})), vec3{}) + dot(get<1>(get<0>(in_t{})), vec3{})); + using flux_t = decltype(dot(get<0>(get<1>(in_t{})), vec3{}) + dot(get<1>(get<1>(in_t{})), vec3{})); + + using type = tensor, q * q * q>; + }; + // this is how mfem provides the data to us for these elements struct dof_type { tensor x; @@ -255,7 +265,9 @@ struct finite_element> { } template - static auto batch_apply_shape_fn(int j, tensor input, const TensorProductQuadratureRule&) + static auto RAJA_HOST_DEVICE batch_apply_shape_fn(int j, tensor input, + typename batch_apply_shape_fn_output::type* output, + const TensorProductQuadratureRule&, RAJA::LaunchContext) { constexpr bool apply_weights = false; constexpr tensor B1 = calculate_B1(); @@ -287,11 +299,6 @@ struct finite_element> { break; } - using source_t = decltype(dot(get<0>(get<0>(in_t{})), vec3{}) + dot(get<1>(get<0>(in_t{})), vec3{})); - using flux_t = decltype(dot(get<0>(get<1>(in_t{})), vec3{}) + dot(get<1>(get<1>(in_t{})), vec3{})); - - tensor, q * q * q> output; - for (int qz = 0; qz < q; qz++) { for (int qy = 0; qy < q; qy++) { for (int qx = 0; qx < q; qx++) { @@ -333,7 +340,15 @@ struct finite_element> { } template - SERAC_HOST_DEVICE static auto interpolate(const dof_type& element_values, const TensorProductQuadratureRule&) + static auto interpolate_output_helper() + { + return tensor, tensor>, q * q * q>{}; + } + + template + SERAC_HOST_DEVICE static void interpolate(const dof_type& element_values, const TensorProductQuadratureRule&, + tensor, tensor>, q * q * q>* output_ptr, + RAJA::LaunchContext) { constexpr bool apply_weights = false; constexpr tensor B1 = calculate_B1(); @@ -375,28 +390,24 @@ struct finite_element> { } // clang-format on - tensor, tensor>, q * q * q> qf_inputs; - int count = 0; for (int qz = 0; qz < q; qz++) { for (int qy = 0; qy < q; qy++) { for (int qx = 0; qx < q; qx++) { for (int i = 0; i < 3; i++) { - get(qf_inputs(count))[i] = value[i](qz, qy, qx); - get(qf_inputs(count))[i] = curl[i](qz, qy, qx); + get((*output_ptr)(count))[i] = value[i](qz, qy, qx); + get((*output_ptr)(count))[i] = curl[i](qz, qy, qx); } count++; } } } - - return qf_inputs; } template SERAC_HOST_DEVICE static void integrate(const tensor, q * q * q>& qf_output, const TensorProductQuadratureRule&, dof_type* element_residual, - [[maybe_unused]] int step = 1) + RAJA::LaunchContext, [[maybe_unused]] int step = 1) { constexpr bool apply_weights = true; constexpr tensor B1 = calculate_B1(); @@ -444,8 +455,8 @@ struct finite_element> { element_residual[0].y += contract< y, 0 >(A1, B1); } - // r(2, dz, dy, dx) = s(2, qz, qy, qx) * B1(qz, dz) * B2(qy, dy) * B2(qx, dx) - // + f(0, qz, qy, qx) * B1(qz, dz) * G2(qy, dy) * B2(qx, dx) + // r(2, dz, dy, dx) = s(2, qz, qy, qx) * B1(qz, dz) * B2(qy, dy) * B2(qx, dx) + // + f(0, qz, qy, qx) * B1(qz, dz) * G2(qy, dy) * B2(qx, dx) // - f(1, qz, qy, qx) * B1(qz, dz) * B2(qy, dy) * G2(qx, dx); { auto A20 = contract< y, 0 >(source[2], B2) + contract< y, 0 
>(flux[0], G2); @@ -461,7 +472,7 @@ struct finite_element> { #if 0 template - static SERAC_DEVICE auto interpolate(const dof_type& element_values, const tensor& J, + static SERAC_DEVICE void interpolate(const dof_type& element_values, const tensor& J, const TensorProductQuadratureRule& rule, cache_type& cache) { int tidx = threadIdx.x % q; diff --git a/src/serac/numerics/functional/detail/hexahedron_L2.inl b/src/serac/numerics/functional/detail/hexahedron_L2.inl index 4a75d77b3..7a51f6f7c 100644 --- a/src/serac/numerics/functional/detail/hexahedron_L2.inl +++ b/src/serac/numerics/functional/detail/hexahedron_L2.inl @@ -10,14 +10,16 @@ * @brief Specialization of finite_element for L2 on hexahedron geometry */ +#include "RAJA/RAJA.hpp" + // this specialization defines shape functions (and their gradients) that // interpolate at Gauss-Lobatto nodes for the appropriate polynomial order // // note: mfem assumes the parent element domain is [0,1]x[0,1]x[0,1] // for additional information on the finite_element concept requirements, see finite_element.hpp /// @cond -template -struct finite_element > { +template +struct finite_element, exec> { static constexpr auto geometry = mfem::Geometry::CUBE; static constexpr auto family = Family::L2; static constexpr int components = c; @@ -31,15 +33,23 @@ struct finite_element > { // TODO: remove this using residual_type = - typename std::conditional, tensor >::type; + typename std::conditional, tensor>::type; using dof_type = tensor; - using value_type = typename std::conditional >::type; + using value_type = typename std::conditional>::type; using derivative_type = - typename std::conditional, tensor >::type; + typename std::conditional, tensor>::type; using qf_input_type = tuple; + template + struct batch_apply_shape_fn_output { + using source_t = decltype(get<0>(get<0>(in_t{})) + dot(get<1>(get<0>(in_t{})), tensor{})); + using flux_t = decltype(get<0>(get<1>(in_t{})) + dot(get<1>(get<1>(in_t{})), tensor{})); + + using type = tensor, q * q * q>; + }; + SERAC_HOST_DEVICE static constexpr tensor shape_functions(tensor xi) { auto N_xi = GaussLobattoInterpolation

(xi[0]); @@ -76,7 +86,7 @@ struct finite_element > { for (int j = 0; j < p + 1; j++) { for (int i = 0; i < p + 1; i++) { dN[count++] = { - dN_xi[i] * N_eta[j] * N_zeta[k], + dN_xi[i] * N_eta[j] * N_zeta[k], N_xi[i] * dN_eta[j] * N_zeta[k], N_xi[i] * N_eta[j] * dN_zeta[k] }; @@ -134,7 +144,9 @@ struct finite_element > { } template - static auto batch_apply_shape_fn(int j, tensor input, const TensorProductQuadratureRule&) + static void RAJA_HOST_DEVICE batch_apply_shape_fn(int j, tensor input, + typename batch_apply_shape_fn_output::type* output, + const TensorProductQuadratureRule&, RAJA::LaunchContext) { static constexpr bool apply_weights = false; static constexpr auto B = calculate_B(); @@ -144,11 +156,6 @@ struct finite_element > { int jy = (j % (n * n)) / n; int jz = j / (n * n); - using source_t = decltype(get<0>(get<0>(in_t{})) + dot(get<1>(get<0>(in_t{})), tensor{})); - using flux_t = decltype(get<0>(get<1>(in_t{})) + dot(get<1>(get<1>(in_t{})), tensor{})); - - tensor, q * q * q> output; - for (int qz = 0; qz < q; qz++) { for (int qy = 0; qy < q; qy++) { for (int qx = 0; qx < q; qx++) { @@ -162,16 +169,21 @@ struct finite_element > { auto& d10 = get<0>(get<1>(input(Q))); auto& d11 = get<1>(get<1>(input(Q))); - output[Q] = {d00 * phi_j + dot(d01, dphi_j_dxi), d10 * phi_j + dot(d11, dphi_j_dxi)}; + (*output)[Q] = {d00 * phi_j + dot(d01, dphi_j_dxi), d10 * phi_j + dot(d11, dphi_j_dxi)}; } } } + } - return output; + template + static auto interpolate_output_helper() + { + return tensor{}; } template - SERAC_HOST_DEVICE static auto interpolate(const dof_type& X, const TensorProductQuadratureRule&) + SERAC_HOST_DEVICE static void interpolate(const dof_type& X, const TensorProductQuadratureRule&, + tensor* output_ptr, RAJA::LaunchContext ctx) { // we want to compute the following: // @@ -211,8 +223,8 @@ struct finite_element > { // transpose the quadrature data into a flat tensor of tuples union { - tensor one_dimensional; - tensor, tensor >, q, q, q> three_dimensional; + tensor one_dimensional; + tensor, tensor>, q, q, q> three_dimensional; } output; for (int qz = 0; qz < q; qz++) { @@ -228,13 +240,20 @@ struct finite_element > { } } - return output.one_dimensional; + RAJA::TypedRangeSegment x_range(0, BLOCK_SZ); + if (output_ptr) { + RAJA::loop>(ctx, x_range, [&](int tid) { + if (tid < serac::size(output.one_dimensional)) { + ((*output_ptr))[tid] = output.one_dimensional[tid]; + } + }); + } } template SERAC_HOST_DEVICE static void integrate(const tensor, q * q * q>& qf_output, const TensorProductQuadratureRule&, dof_type* element_residual, - int step = 1) + RAJA::LaunchContext, int step = 1) { if constexpr (is_zero{} && is_zero{}) { return; @@ -242,8 +261,8 @@ struct finite_element > { constexpr int ntrial = std::max(size(source_type{}), size(flux_type{}) / dim) / c; - using s_buffer_type = std::conditional_t{}, zero, tensor >; - using f_buffer_type = std::conditional_t{}, zero, tensor >; + using s_buffer_type = std::conditional_t{}, zero, tensor>; + using f_buffer_type = std::conditional_t{}, zero, tensor>; static constexpr bool apply_weights = true; static constexpr auto B = calculate_B(); @@ -286,7 +305,7 @@ struct finite_element > { #if 0 template - static SERAC_DEVICE auto interpolate(const dof_type& X, const tensor& J, + static SERAC_DEVICE void interpolate(const dof_type& X, const tensor& J, const TensorProductQuadratureRule& rule, cache_type& cache) { // we want to compute the following: diff --git a/src/serac/numerics/functional/detail/qoi.inl 
b/src/serac/numerics/functional/detail/qoi.inl index 33e747ea4..3b0fb9510 100644 --- a/src/serac/numerics/functional/detail/qoi.inl +++ b/src/serac/numerics/functional/detail/qoi.inl @@ -9,9 +9,13 @@ * * @brief Specialization of finite_element for expressing quantities of interest on any geometry */ + +#include "RAJA/RAJA.hpp" +#include "serac/infrastructure/accelerator.hpp" + /// @cond -template -struct finite_element { +template +struct finite_element { static constexpr auto geometry = g; static constexpr auto family = Family::QOI; static constexpr int components = 1; @@ -25,14 +29,14 @@ struct finite_element { template SERAC_HOST_DEVICE static void integrate(const tensor&, const TensorProductQuadratureRule&, dof_type*, - [[maybe_unused]] int step = 1) + RAJA::LaunchContext, [[maybe_unused]] int step = 1) { return; // integrating zeros is a no-op } template SERAC_HOST_DEVICE static void integrate(const tensor& qf_output, const TensorProductQuadratureRule&, - dof_type* element_total, [[maybe_unused]] int step = 1) + dof_type* element_total, RAJA::LaunchContext, [[maybe_unused]] int step = 1) { if constexpr (geometry == mfem::Geometry::SEGMENT) { static_assert(Q == q); @@ -79,7 +83,7 @@ struct finite_element { template SERAC_HOST_DEVICE static void integrate(const tensor, Q>& qf_output, const TensorProductQuadratureRule&, dof_type* element_total, - [[maybe_unused]] int step = 1) + RAJA::LaunchContext, [[maybe_unused]] int step = 1) { if constexpr (is_zero{}) { return; diff --git a/src/serac/numerics/functional/detail/quadrilateral_H1.inl b/src/serac/numerics/functional/detail/quadrilateral_H1.inl index 312f8e9c7..c9ed1390f 100644 --- a/src/serac/numerics/functional/detail/quadrilateral_H1.inl +++ b/src/serac/numerics/functional/detail/quadrilateral_H1.inl @@ -10,14 +10,17 @@ * @brief Specialization of finite_element for H1 on quadrilateral geometry */ +#include "RAJA/RAJA.hpp" + // this specialization defines shape functions (and their gradients) that // interpolate at Gauss-Lobatto nodes for the appropriate polynomial order // // note: mfem assumes the parent element domain is [0,1]x[0,1] // for additional information on the finite_element concept requirements, see finite_element.hpp /// @cond -template -struct finite_element > { + +template +struct finite_element, exec> { static constexpr auto geometry = mfem::Geometry::SQUARE; static constexpr auto family = Family::H1; static constexpr int components = c; @@ -30,15 +33,23 @@ struct finite_element > { static constexpr int SOURCE = 0, FLUX = 1; using residual_type = - typename std::conditional, tensor >::type; + typename std::conditional, tensor>::type; using dof_type = tensor; - using value_type = typename std::conditional >::type; + using value_type = typename std::conditional>::type; using derivative_type = - typename std::conditional, tensor >::type; + typename std::conditional, tensor>::type; using qf_input_type = tuple; + template + struct batch_apply_shape_fn_output { + using source_t = decltype(get<0>(get<0>(in_t{})) + dot(get<1>(get<0>(in_t{})), tensor{})); + using flux_t = decltype(get<0>(get<1>(in_t{})) + dot(get<1>(get<1>(in_t{})), tensor{})); + + using type = tensor, q * q>; + }; + /* interpolation nodes and their associated numbering: @@ -155,7 +166,9 @@ struct finite_element > { } template - static auto batch_apply_shape_fn(int j, tensor input, const TensorProductQuadratureRule&) + static void RAJA_HOST_DEVICE batch_apply_shape_fn(int j, tensor input, + typename batch_apply_shape_fn_output::type* output, + const 
TensorProductQuadratureRule&, RAJA::LaunchContext) { static constexpr bool apply_weights = false; static constexpr auto B = calculate_B(); @@ -164,11 +177,6 @@ struct finite_element > { int jx = j % n; int jy = j / n; - using source_t = decltype(get<0>(get<0>(in_t{})) + dot(get<1>(get<0>(in_t{})), tensor{})); - using flux_t = decltype(get<0>(get<1>(in_t{})) + dot(get<1>(get<1>(in_t{})), tensor{})); - - tensor, q * q> output; - for (int qy = 0; qy < q; qy++) { for (int qx = 0; qx < q; qx++) { double phi_j = B(qx, jx) * B(qy, jy); @@ -180,11 +188,15 @@ struct finite_element > { auto& d10 = get<0>(get<1>(input(Q))); auto& d11 = get<1>(get<1>(input(Q))); - output[Q] = {d00 * phi_j + dot(d01, dphi_j_dxi), d10 * phi_j + dot(d11, dphi_j_dxi)}; + (*output)[Q] = {d00 * phi_j + dot(d01, dphi_j_dxi), d10 * phi_j + dot(d11, dphi_j_dxi)}; } } + } - return output; + template + static auto interpolate_output_helper() + { + return tensor{}; } // we want to compute the following: @@ -202,7 +214,8 @@ struct finite_element > { // A(dy, qx) := B(qx, dx) * X_e(dy, dx) // X_q(qy, qx) := B(qy, dy) * A(dy, qx) template - SERAC_HOST_DEVICE static auto interpolate(const dof_type& X, const TensorProductQuadratureRule&) + SERAC_HOST_DEVICE static void interpolate(const dof_type& X, const TensorProductQuadratureRule&, + tensor* output_ptr, RAJA::LaunchContext ctx) { static constexpr bool apply_weights = false; static constexpr auto B = calculate_B(); @@ -223,8 +236,8 @@ struct finite_element > { // transpose the quadrature data into a flat tensor of tuples union { - tensor one_dimensional; - tensor, tensor >, q, q> two_dimensional; + tensor one_dimensional; + tensor, tensor>, q, q> two_dimensional; } output; for (int qy = 0; qy < q; qy++) { @@ -237,8 +250,14 @@ struct finite_element > { } } } - - return output.one_dimensional; + RAJA::TypedRangeSegment x_range(0, BLOCK_SZ); + if (output_ptr) { + RAJA::loop>(ctx, x_range, [&](int tid) { + if (tid < serac::size(output.one_dimensional)) { + ((*output_ptr))[tid] = output.one_dimensional[tid]; + } + }); + } } // source can be one of: {zero, double, tensor, tensor} @@ -247,7 +266,7 @@ struct finite_element > { template SERAC_HOST_DEVICE static void integrate(const tensor, q * q>& qf_output, const TensorProductQuadratureRule&, dof_type* element_residual, - int step = 1) + RAJA::LaunchContext, int step = 1) { if constexpr (is_zero{} && is_zero{}) { return; @@ -255,8 +274,8 @@ struct finite_element > { constexpr int ntrial = std::max(size(source_type{}), size(flux_type{}) / dim) / c; - using s_buffer_type = std::conditional_t{}, zero, tensor >; - using f_buffer_type = std::conditional_t{}, zero, tensor >; + using s_buffer_type = std::conditional_t{}, zero, tensor>; + using f_buffer_type = std::conditional_t{}, zero, tensor>; static constexpr bool apply_weights = true; static constexpr auto B = calculate_B(); @@ -293,7 +312,7 @@ struct finite_element > { #if 0 template - static SERAC_DEVICE auto interpolate(const dof_type& X, const tensor& J, + static SERAC_DEVICE void interpolate(const dof_type& X, const tensor& J, const TensorProductQuadratureRule& rule, cache_type& A) { int tidx = threadIdx.x % q; diff --git a/src/serac/numerics/functional/detail/quadrilateral_Hcurl.inl b/src/serac/numerics/functional/detail/quadrilateral_Hcurl.inl index 815b5b72e..4ff075cb6 100644 --- a/src/serac/numerics/functional/detail/quadrilateral_Hcurl.inl +++ b/src/serac/numerics/functional/detail/quadrilateral_Hcurl.inl @@ -10,6 +10,8 @@ * @brief Specialization of finite_element for Hcurl on 
quadrilateral geometry */ +#include "RAJA/RAJA.hpp" + // this specialization defines shape functions (and their curls) that // interpolate at Gauss-Lobatto nodes for closed intervals, and Gauss-Legendre // nodes for open intervals. @@ -21,8 +23,8 @@ // rather than 3D vector along the z-direction. // for additional information on the finite_element concept requirements, see finite_element.hpp /// @cond -template -struct finite_element > { +template +struct finite_element, exec> { static constexpr auto geometry = mfem::Geometry::SQUARE; static constexpr auto family = Family::HCURL; static constexpr int dim = 2; @@ -36,6 +38,14 @@ struct finite_element > { using residual_type = typename std::conditional, tensor >::type; + template + struct batch_apply_shape_fn_output { + using source_t = decltype(dot(get<0>(get<0>(in_t{})), tensor{}) + get<1>(get<0>(in_t{})) * double{}); + using flux_t = decltype(dot(get<0>(get<1>(in_t{})), tensor{}) + get<1>(get<1>(in_t{})) * double{}); + + using type = tensor, q * q>; + }; + // this is how mfem provides the data to us for these elements // if, instead, it was stored as simply tensor< double, 2, p + 1, p >, // the interpolation/integrate implementation would be considerably shorter @@ -243,7 +253,9 @@ struct finite_element > { } template - static auto batch_apply_shape_fn(int j, tensor input, const TensorProductQuadratureRule&) + static void RAJA_HOST_DEVICE batch_apply_shape_fn(int j, tensor input, + typename batch_apply_shape_fn_output::type* output, + const TensorProductQuadratureRule&, RAJA::LaunchContext) { constexpr bool apply_weights = false; constexpr tensor B1 = calculate_B1(); @@ -260,11 +272,6 @@ struct finite_element > { jy = (j % ((p + 1) * p)) / n; } - using source_t = decltype(dot(get<0>(get<0>(in_t{})), tensor{}) + get<1>(get<0>(in_t{})) * double{}); - using flux_t = decltype(dot(get<0>(get<1>(in_t{})), tensor{}) + get<1>(get<1>(in_t{})) * double{}); - - tensor, q * q> output; - for (int qy = 0; qy < q; qy++) { for (int qx = 0; qx < q; qx++) { tensor phi_j{(dir == 0) * B1(qx, jx) * B2(qy, jy), (dir == 1) * B1(qy, jy) * B2(qx, jx)}; @@ -277,15 +284,21 @@ struct finite_element > { auto& d10 = get<0>(get<1>(input(Q))); auto& d11 = get<1>(get<1>(input(Q))); - output[Q] = {dot(d00, phi_j) + d01 * curl_phi_j, dot(d10, phi_j) + d11 * curl_phi_j}; + (*output)[Q] = {dot(d00, phi_j) + d01 * curl_phi_j, dot(d10, phi_j) + d11 * curl_phi_j}; } } - - return output; } template - SERAC_HOST_DEVICE static auto interpolate(const dof_type& element_values, const TensorProductQuadratureRule&) + static auto interpolate_output_helper() + { + return tensor, double>, q * q>{}; + }; + + template + SERAC_HOST_DEVICE static void interpolate(const dof_type& element_values, const TensorProductQuadratureRule&, + tensor, double>, q * q>* output_ptr, + RAJA::LaunchContext) { constexpr bool apply_weights = false; constexpr tensor B1 = calculate_B1(); @@ -312,20 +325,18 @@ struct finite_element > { for (int qy = 0; qy < q; qy++) { for (int qx = 0; qx < q; qx++) { for (int i = 0; i < dim; i++) { - get(qf_inputs(count))[i] = value[i](qy, qx); + get((*output_ptr)(count))[i] = value[i](qy, qx); } - get(qf_inputs(count)) = curl(qy, qx); + get((*output_ptr)(count)) = curl(qy, qx); count++; } } - - return qf_inputs; } template SERAC_HOST_DEVICE static void integrate(const tensor, q * q>& qf_output, const TensorProductQuadratureRule&, dof_type* element_residual, - [[maybe_unused]] int step = 1) + RAJA::LaunchContext, [[maybe_unused]] int step = 1) { constexpr bool apply_weights = 
true; constexpr tensor B1 = calculate_B1(); @@ -359,7 +370,7 @@ struct finite_element > { #if 0 template - static SERAC_DEVICE auto interpolate(const dof_type& element_values, const tensor& J, + static SERAC_DEVICE void interpolate(const dof_type& element_values, const tensor& J, const TensorProductQuadratureRule& rule, cache_type& A) { int tidx = threadIdx.x % q; diff --git a/src/serac/numerics/functional/detail/quadrilateral_L2.inl b/src/serac/numerics/functional/detail/quadrilateral_L2.inl index 48ba045f0..68eeb15ae 100644 --- a/src/serac/numerics/functional/detail/quadrilateral_L2.inl +++ b/src/serac/numerics/functional/detail/quadrilateral_L2.inl @@ -10,14 +10,16 @@ * @brief Specialization of finite_element for L2 on quadrilateral geometry */ +#include "RAJA/RAJA.hpp" + // this specialization defines shape functions (and their gradients) that // interpolate at Gauss-Lobatto nodes for the appropriate polynomial order // // note: mfem assumes the parent element domain is [0,1]x[0,1] // for additional information on the finite_element concept requirements, see finite_element.hpp /// @cond -template -struct finite_element > { +template +struct finite_element, exec> { static constexpr auto geometry = mfem::Geometry::SQUARE; static constexpr auto family = Family::L2; static constexpr int components = c; @@ -29,15 +31,23 @@ struct finite_element > { static constexpr int SOURCE = 0, FLUX = 1; using residual_type = - typename std::conditional, tensor >::type; + typename std::conditional, tensor>::type; using dof_type = tensor; - using value_type = typename std::conditional >::type; + using value_type = typename std::conditional>::type; using derivative_type = - typename std::conditional, tensor >::type; + typename std::conditional, tensor>::type; using qf_input_type = tuple; + template + struct batch_apply_shape_fn_output { + using source_t = decltype(get<0>(get<0>(in_t{})) + dot(get<1>(get<0>(in_t{})), tensor{})); + using flux_t = decltype(get<0>(get<1>(in_t{})) + dot(get<1>(get<1>(in_t{})), tensor{})); + + using type = tensor, q * q>; + }; + /* interpolation nodes and their associated numbering: @@ -154,7 +164,9 @@ struct finite_element > { } template - static auto batch_apply_shape_fn(int j, tensor input, const TensorProductQuadratureRule&) + static void RAJA_HOST_DEVICE batch_apply_shape_fn(int j, tensor input, + typename batch_apply_shape_fn_output::type* output, + const TensorProductQuadratureRule&, RAJA::LaunchContext) { static constexpr bool apply_weights = false; static constexpr auto B = calculate_B(); @@ -163,11 +175,6 @@ struct finite_element > { int jx = j % n; int jy = j / n; - using source_t = decltype(get<0>(get<0>(in_t{})) + dot(get<1>(get<0>(in_t{})), tensor{})); - using flux_t = decltype(get<0>(get<1>(in_t{})) + dot(get<1>(get<1>(in_t{})), tensor{})); - - tensor, q * q> output; - for (int qy = 0; qy < q; qy++) { for (int qx = 0; qx < q; qx++) { double phi_j = B(qx, jx) * B(qy, jy); @@ -179,11 +186,15 @@ struct finite_element > { auto& d10 = get<0>(get<1>(input(Q))); auto& d11 = get<1>(get<1>(input(Q))); - output[Q] = {d00 * phi_j + dot(d01, dphi_j_dxi), d10 * phi_j + dot(d11, dphi_j_dxi)}; + (*output)[Q] = {d00 * phi_j + dot(d01, dphi_j_dxi), d10 * phi_j + dot(d11, dphi_j_dxi)}; } } + } - return output; + template + static auto interpolate_output_helper() + { + return tensor{}; } // we want to compute the following: @@ -201,7 +212,8 @@ struct finite_element > { // A(dy, qx) := B(qx, dx) * X_e(dy, dx) // X_q(qy, qx) := B(qy, dy) * A(dy, qx) template - SERAC_HOST_DEVICE static 
auto interpolate(const dof_type& X, const TensorProductQuadratureRule&) + SERAC_HOST_DEVICE static void interpolate(const dof_type& X, const TensorProductQuadratureRule&, + tensor* output_ptr, RAJA::LaunchContext ctx) { static constexpr bool apply_weights = false; static constexpr auto B = calculate_B(); @@ -222,8 +234,8 @@ struct finite_element > { // transpose the quadrature data into a flat tensor of tuples union { - tensor one_dimensional; - tensor, tensor >, q, q> two_dimensional; + tensor one_dimensional; + tensor, tensor>, q, q> two_dimensional; } output; for (int qy = 0; qy < q; qy++) { @@ -237,7 +249,14 @@ struct finite_element > { } } - return output.one_dimensional; + RAJA::TypedRangeSegment x_range(0, BLOCK_SZ); + if (output_ptr) { + RAJA::loop>(ctx, x_range, [&](int tid) { + if (tid < serac::size(output.one_dimensional)) { + ((*output_ptr))[tid] = output.one_dimensional[tid]; + } + }); + } } // source can be one of: {zero, double, tensor, tensor} @@ -246,7 +265,7 @@ struct finite_element > { template SERAC_HOST_DEVICE static void integrate(const tensor, q * q>& qf_output, const TensorProductQuadratureRule&, dof_type* element_residual, - int step = 1) + RAJA::LaunchContext, int step = 1) { if constexpr (is_zero{} && is_zero{}) { return; @@ -254,8 +273,8 @@ struct finite_element > { constexpr int ntrial = std::max(size(source_type{}), size(flux_type{}) / dim) / c; - using s_buffer_type = std::conditional_t{}, zero, tensor >; - using f_buffer_type = std::conditional_t{}, zero, tensor >; + using s_buffer_type = std::conditional_t{}, zero, tensor>; + using f_buffer_type = std::conditional_t{}, zero, tensor>; static constexpr bool apply_weights = true; static constexpr auto B = calculate_B(); @@ -291,7 +310,7 @@ struct finite_element > { #if 0 template - static SERAC_DEVICE auto interpolate(const dof_type& X, const tensor& J, + static SERAC_DEVICE void interpolate(const dof_type& X, const tensor& J, const TensorProductQuadratureRule& rule, cache_type& A) { int tidx = threadIdx.x % q; @@ -352,8 +371,6 @@ struct finite_element > { } get<1>(qf_input) = dot(get<1>(qf_input), inv(J)); - - return qf_input; } template diff --git a/src/serac/numerics/functional/detail/segment_H1.inl b/src/serac/numerics/functional/detail/segment_H1.inl index 11fe30bc2..3a9430516 100644 --- a/src/serac/numerics/functional/detail/segment_H1.inl +++ b/src/serac/numerics/functional/detail/segment_H1.inl @@ -10,14 +10,16 @@ * @brief Specialization of finite_element for H1 on segment geometry */ +#include "RAJA/RAJA.hpp" + // this specialization defines shape functions (and their gradients) that // interpolate at Gauss-Lobatto nodes for the appropriate polynomial order // // note: mfem assumes the parent element domain is [0,1] // for additional information on the finite_element concept requirements, see finite_element.hpp /// @cond -template -struct finite_element > { +template +struct finite_element, exec> { static constexpr auto geometry = mfem::Geometry::SEGMENT; static constexpr auto family = Family::H1; static constexpr int components = c; @@ -37,6 +39,14 @@ struct finite_element > { using residual_type = typename std::conditional, tensor >::type; + template + struct batch_apply_shape_fn_output { + using source_t = decltype(get<0>(get<0>(T{})) + get<1>(get<0>(T{}))); + using flux_t = decltype(get<0>(get<1>(T{})) + get<1>(get<1>(T{}))); + + using type = tensor, q>; + }; + SERAC_HOST_DEVICE static constexpr tensor shape_functions(double xi) { return GaussLobattoInterpolation(xi); @@ -94,17 +104,14 @@ struct 
finite_element > { } template - SERAC_HOST_DEVICE static auto batch_apply_shape_fn(int jx, tensor input, const TensorProductQuadratureRule&) + RAJA_HOST_DEVICE static auto RAJA_HOST_DEVICE + batch_apply_shape_fn(int jx, tensor input, typename batch_apply_shape_fn_output::type* output, + const TensorProductQuadratureRule&, RAJA::LaunchContext) { static constexpr bool apply_weights = false; static constexpr auto B = calculate_B(); static constexpr auto G = calculate_G(); - using source_t = decltype(get<0>(get<0>(T{})) + get<1>(get<0>(T{}))); - using flux_t = decltype(get<0>(get<1>(T{})) + get<1>(get<1>(T{}))); - - tensor, q> output; - for (int qx = 0; qx < q; qx++) { double phi_j = B(qx, jx); double dphi_j_dxi = G(qx, jx); @@ -114,14 +121,19 @@ struct finite_element > { auto& d10 = get<0>(get<1>(input(qx))); auto& d11 = get<1>(get<1>(input(qx))); - output[qx] = {d00 * phi_j + d01 * dphi_j_dxi, d10 * phi_j + d11 * dphi_j_dxi}; + (*output)[qx] = {d00 * phi_j + d01 * dphi_j_dxi, d10 * phi_j + d11 * dphi_j_dxi}; } + } - return output; + template + static auto interpolate_output_helper() + { + return tensor{}; } template - SERAC_HOST_DEVICE static auto interpolate(const dof_type& X, const TensorProductQuadratureRule&) + SERAC_HOST_DEVICE static void interpolate(const dof_type& X, const TensorProductQuadratureRule&, + tensor* output_ptr, RAJA::LaunchContext) { static constexpr bool apply_weights = false; static constexpr auto B = calculate_B(); @@ -137,27 +149,23 @@ struct finite_element > { } // transpose the quadrature data into a tensor of tuples - tensor output; - for (int qx = 0; qx < q; qx++) { if constexpr (c == 1) { - get(output(qx)) = value(0, qx); - get(output(qx)) = gradient(0, qx); + get((*output_ptr)(qx)) = value(0, qx); + get((*output_ptr)(qx)) = gradient(0, qx); } else { for (int i = 0; i < c; i++) { - get(output(qx))[i] = value(i, qx); - get(output(qx))[i] = gradient(i, qx); + get((*output_ptr)(qx))[i] = value(i, qx); + get((*output_ptr)(qx))[i] = gradient(i, qx); } } } - - return output; } template SERAC_HOST_DEVICE static void integrate(const tensor, q>& qf_output, const TensorProductQuadratureRule&, dof_type* element_residual, - [[maybe_unused]] int step = 1) + RAJA::LaunchContext, [[maybe_unused]] int step = 1) { if constexpr (is_zero{} && is_zero{}) { return; diff --git a/src/serac/numerics/functional/detail/segment_Hcurl.inl b/src/serac/numerics/functional/detail/segment_Hcurl.inl index 98677f684..9e1986aa7 100644 --- a/src/serac/numerics/functional/detail/segment_Hcurl.inl +++ b/src/serac/numerics/functional/detail/segment_Hcurl.inl @@ -18,8 +18,8 @@ // note: mfem assumes the parent element domain is [0,1] // for additional information on the finite_element concept requirements, see finite_element.hpp /// @cond -template -struct finite_element > { +template +struct finite_element, exec> { static constexpr auto geometry = mfem::Geometry::SEGMENT; static constexpr auto family = Family::HCURL; static constexpr int components = c; diff --git a/src/serac/numerics/functional/detail/segment_L2.inl b/src/serac/numerics/functional/detail/segment_L2.inl index d14603793..fffd55724 100644 --- a/src/serac/numerics/functional/detail/segment_L2.inl +++ b/src/serac/numerics/functional/detail/segment_L2.inl @@ -10,14 +10,16 @@ * @brief Specialization of finite_element for L2 on segment geometry */ +#include "RAJA/RAJA.hpp" + // this specialization defines shape functions (and their gradients) that // interpolate at Gauss-Lobatto nodes for the appropriate polynomial order // // note: mfem 
assumes the parent element domain is [0,1] // for additional information on the finite_element concept requirements, see finite_element.hpp /// @cond -template -struct finite_element > { +template +struct finite_element, exec> { static constexpr auto geometry = mfem::Geometry::SEGMENT; static constexpr auto family = Family::L2; static constexpr int components = c; @@ -37,6 +39,14 @@ struct finite_element > { using residual_type = typename std::conditional, tensor >::type; + template + struct batch_apply_shape_fn_output { + using source_t = decltype(get<0>(get<0>(T{})) + get<1>(get<0>(T{}))); + using flux_t = decltype(get<0>(get<1>(T{})) + get<1>(get<1>(T{}))); + + using type = tensor, q>; + }; + SERAC_HOST_DEVICE static constexpr tensor shape_functions(double xi) { return GaussLobattoInterpolation(xi); @@ -94,17 +104,14 @@ struct finite_element > { } template - static auto batch_apply_shape_fn(int jx, tensor input, const TensorProductQuadratureRule&) + static void RAJA_HOST_DEVICE batch_apply_shape_fn(int jx, tensor input, + typename batch_apply_shape_fn_output::type* output, + const TensorProductQuadratureRule&, RAJA::LaunchContext) { static constexpr bool apply_weights = false; static constexpr auto B = calculate_B(); static constexpr auto G = calculate_G(); - using source_t = decltype(get<0>(get<0>(T{})) + get<1>(get<0>(T{}))); - using flux_t = decltype(get<0>(get<1>(T{})) + get<1>(get<1>(T{}))); - - tensor, q> output; - for (int qx = 0; qx < q; qx++) { double phi_j = B(qx, jx); double dphi_j_dxi = G(qx, jx); @@ -114,14 +121,19 @@ struct finite_element > { auto& d10 = get<0>(get<1>(input(qx))); auto& d11 = get<1>(get<1>(input(qx))); - output[qx] = {d00 * phi_j + d01 * dphi_j_dxi, d10 * phi_j + d11 * dphi_j_dxi}; + (*output)[qx] = {d00 * phi_j + d01 * dphi_j_dxi, d10 * phi_j + d11 * dphi_j_dxi}; } + } - return output; + template + static auto interpolate_output_helper() + { + return tensor{}; } template - SERAC_HOST_DEVICE static auto interpolate(const dof_type& X, const TensorProductQuadratureRule&) + SERAC_HOST_DEVICE static void interpolate(const dof_type& X, const TensorProductQuadratureRule&, + tensor* output_ptr, RAJA::LaunchContext) { static constexpr bool apply_weights = false; static constexpr auto B = calculate_B(); @@ -136,28 +148,23 @@ struct finite_element > { gradient(i) = dot(G, X[i]); } - // transpose the quadrature data into a tensor of tuples - tensor output; - for (int qx = 0; qx < q; qx++) { if constexpr (c == 1) { - get(output(qx)) = value(0, qx); - get(output(qx)) = gradient(0, qx); + get((*output_ptr)(qx)) = value(0, qx); + get((*output_ptr)(qx)) = gradient(0, qx); } else { for (int i = 0; i < c; i++) { - get(output(qx))[i] = value(i, qx); - get(output(qx))[i] = gradient(i, qx); + get((*output_ptr)(qx))[i] = value(i, qx); + get((*output_ptr)(qx))[i] = gradient(i, qx); } } } - - return output; } template SERAC_HOST_DEVICE static void integrate(const tensor, q>& qf_output, const TensorProductQuadratureRule&, dof_type* element_residual, - [[maybe_unused]] int step = 1) + RAJA::LaunchContext, [[maybe_unused]] int step = 1) { if constexpr (is_zero{} && is_zero{}) { return; diff --git a/src/serac/numerics/functional/detail/tetrahedron_H1.inl b/src/serac/numerics/functional/detail/tetrahedron_H1.inl index 2a7613cee..df229336d 100644 --- a/src/serac/numerics/functional/detail/tetrahedron_H1.inl +++ b/src/serac/numerics/functional/detail/tetrahedron_H1.inl @@ -10,6 +10,8 @@ * @brief Specialization of finite_element for H1 on tetrahedron geometry */ +#include "RAJA/RAJA.hpp" 
+ // this specialization defines shape functions (and their gradients) that // interpolate at Gauss-Lobatto nodes for the appropriate polynomial order // @@ -19,8 +21,8 @@ // for detailed information about node locations for different polynomial orders, see // simplex_basis_function_unit_tests.cpp /// @cond -template -struct finite_element > { +template +struct finite_element, exec> { static constexpr auto geometry = mfem::Geometry::TETRAHEDRON; static constexpr auto family = Family::H1; static constexpr int components = c; @@ -34,15 +36,22 @@ struct finite_element > { static constexpr int SOURCE = 0, FLUX = 1; using residual_type = - typename std::conditional, tensor >::type; + typename std::conditional, tensor>::type; using dof_type = tensor; - using value_type = typename std::conditional >::type; + using value_type = typename std::conditional>::type; using derivative_type = - typename std::conditional, tensor >::type; + typename std::conditional, tensor>::type; using qf_input_type = tuple; + template + struct batch_apply_shape_fn_output { + using source_t = decltype(get<0>(get<0>(in_t{})) + dot(get<1>(get<0>(in_t{})), tensor{})); + using flux_t = decltype(get<0>(get<1>(in_t{})) + dot(get<1>(get<1>(in_t{})), tensor{})); + using type = tensor, nqpts(q)>; + }; + SERAC_HOST_DEVICE static constexpr double shape_function([[maybe_unused]] tensor xi, int i) { if constexpr (p == 1) { @@ -336,16 +345,14 @@ struct finite_element > { } template - static auto batch_apply_shape_fn(int j, tensor input, const TensorProductQuadratureRule&) + static auto RAJA_HOST_DEVICE batch_apply_shape_fn(int j, tensor input, + typename batch_apply_shape_fn_output::type* output, + const TensorProductQuadratureRule&, RAJA::LaunchContext& ctx) { - using source_t = decltype(get<0>(get<0>(in_t{})) + dot(get<1>(get<0>(in_t{})), tensor{})); - using flux_t = decltype(get<0>(get<1>(in_t{})) + dot(get<1>(get<1>(in_t{})), tensor{})); - constexpr auto xi = GaussLegendreNodes(); - tensor, nqpts(q)> output; - - for (int i = 0; i < nqpts(q); i++) { + auto x_range = RAJA::RangeSegment(0, nqpts(q)); + RAJA::loop::threads_t>(ctx, x_range, [&](int i) { double phi_j = shape_function(xi[i], j); tensor dphi_j_dxi = shape_function_gradient(xi[i], j); @@ -354,21 +361,28 @@ struct finite_element > { auto& d10 = get<0>(get<1>(input(i))); auto& d11 = get<1>(get<1>(input(i))); - output[i] = {d00 * phi_j + dot(d01, dphi_j_dxi), d10 * phi_j + dot(d11, dphi_j_dxi)}; - } + (*output)[i] = {d00 * phi_j + dot(d01, dphi_j_dxi), d10 * phi_j + dot(d11, dphi_j_dxi)}; + }); return output; } template - SERAC_HOST_DEVICE static auto interpolate(const tensor& X, const TensorProductQuadratureRule&) + static auto interpolate_output_helper() + { + return tensor{}; + } + + template + SERAC_HOST_DEVICE static void interpolate(const tensor& X, const TensorProductQuadratureRule&, + tensor* output_ptr, RAJA::LaunchContext ctx) { constexpr auto xi = GaussLegendreNodes(); // transpose the quadrature data into a flat tensor of tuples union { - tensor, tensor >, nqpts(q)> unflattened; - tensor flattened; + tensor, tensor>, nqpts(q)> unflattened; + tensor flattened; } output{}; for (int i = 0; i < c; i++) { @@ -380,20 +394,27 @@ struct finite_element > { } } - return output.flattened; + RAJA::TypedRangeSegment x_range(0, BLOCK_SZ); + if (output_ptr) { + RAJA::loop>(ctx, x_range, [&](int tid) { + if (tid < serac::size(output.flattened)) { + (*output_ptr)[tid] = output.flattened[tid]; + } + }); + } } template SERAC_HOST_DEVICE static void integrate(const tensor, nqpts(q)>& 
qf_output, const TensorProductQuadratureRule&, - tensor* element_residual, int step = 1) + tensor* element_residual, RAJA::LaunchContext, int step = 1) { if constexpr (is_zero{} && is_zero{}) { return; } using source_component_type = std::conditional_t{}, zero, double>; - using flux_component_type = std::conditional_t{}, zero, tensor >; + using flux_component_type = std::conditional_t{}, zero, tensor>; constexpr int ntrial = std::max(size(source_type{}), size(flux_type{}) / dim) / c; constexpr auto integration_points = GaussLegendreNodes(); diff --git a/src/serac/numerics/functional/detail/tetrahedron_L2.inl b/src/serac/numerics/functional/detail/tetrahedron_L2.inl index 25bc8ffd3..092ca905b 100644 --- a/src/serac/numerics/functional/detail/tetrahedron_L2.inl +++ b/src/serac/numerics/functional/detail/tetrahedron_L2.inl @@ -10,6 +10,8 @@ * @brief Specialization of finite_element for L2 on tetrahedron geometry */ +#include "RAJA/RAJA.hpp" + // this specialization defines shape functions (and their gradients) that // interpolate at Gauss-Lobatto nodes for the appropriate polynomial order // @@ -18,8 +20,8 @@ // // for exact positions of nodes for different polynomial orders, see simplex_basis_function_unit_tests.cpp /// @cond -template -struct finite_element > { +template +struct finite_element, exec> { static constexpr auto geometry = mfem::Geometry::TETRAHEDRON; static constexpr auto family = Family::L2; static constexpr int components = c; @@ -32,15 +34,22 @@ struct finite_element > { static constexpr int SOURCE = 0, FLUX = 1; using residual_type = - typename std::conditional, tensor >::type; + typename std::conditional, tensor>::type; using dof_type = tensor; - using value_type = typename std::conditional >::type; + using value_type = typename std::conditional>::type; using derivative_type = - typename std::conditional, tensor >::type; + typename std::conditional, tensor>::type; using qf_input_type = tuple; + template + struct batch_apply_shape_fn_output { + using source_t = decltype(get<0>(get<0>(in_t{})) + dot(get<1>(get<0>(in_t{})), tensor{})); + using flux_t = decltype(get<0>(get<1>(in_t{})) + dot(get<1>(get<1>(in_t{})), tensor{})); + using type = tensor, nqpts(q)>; + }; + SERAC_HOST_DEVICE static constexpr double shape_function([[maybe_unused]] tensor xi, [[maybe_unused]] int i) { @@ -341,15 +350,12 @@ struct finite_element > { } template - static auto batch_apply_shape_fn(int j, tensor input, const TensorProductQuadratureRule&) + static auto RAJA_HOST_DEVICE batch_apply_shape_fn(int j, tensor input, + typename batch_apply_shape_fn_output::type* output, + const TensorProductQuadratureRule&, RAJA::LaunchContext) { - using source_t = decltype(get<0>(get<0>(in_t{})) + dot(get<1>(get<0>(in_t{})), tensor{})); - using flux_t = decltype(get<0>(get<1>(in_t{})) + dot(get<1>(get<1>(in_t{})), tensor{})); - constexpr auto xi = GaussLegendreNodes(); - tensor, nqpts(q)> output; - for (int i = 0; i < nqpts(q); i++) { double phi_j = shape_function(xi[i], j); tensor dphi_j_dxi = shape_function_gradient(xi[i], j); @@ -359,21 +365,28 @@ struct finite_element > { auto& d10 = get<0>(get<1>(input(i))); auto& d11 = get<1>(get<1>(input(i))); - output[i] = {d00 * phi_j + dot(d01, dphi_j_dxi), d10 * phi_j + dot(d11, dphi_j_dxi)}; + (*output)[i] = {d00 * phi_j + dot(d01, dphi_j_dxi), d10 * phi_j + dot(d11, dphi_j_dxi)}; } return output; } template - SERAC_HOST_DEVICE static auto interpolate(const tensor& X, const TensorProductQuadratureRule&) + static auto interpolate_output_helper() + { + return tensor{}; + 
} + + template + SERAC_HOST_DEVICE static void interpolate(const tensor& X, const TensorProductQuadratureRule&, + tensor* output_ptr, RAJA::LaunchContext ctx) { constexpr auto xi = GaussLegendreNodes(); // transpose the quadrature data into a flat tensor of tuples union { - tensor, tensor >, nqpts(q)> unflattened; - tensor flattened; + tensor, tensor>, nqpts(q)> unflattened; + tensor flattened; } output{}; for (int i = 0; i < c; i++) { @@ -385,20 +398,28 @@ struct finite_element > { } } - return output.flattened; + RAJA::TypedRangeSegment x_range(0, BLOCK_SZ); + if (output_ptr) { + RAJA::loop>(ctx, x_range, [&](int tid) { + if (tid < serac::size(output.flattened)) { + get(((*output_ptr))[tid]) = get(output.flattened[tid]); + get(((*output_ptr))[tid]) = get(output.flattened[tid]); + } + }); + } } template SERAC_HOST_DEVICE static void integrate(const tensor, nqpts(q)>& qf_output, const TensorProductQuadratureRule&, - tensor* element_residual, int step = 1) + tensor* element_residual, RAJA::LaunchContext, int step = 1) { if constexpr (is_zero{} && is_zero{}) { return; } using source_component_type = std::conditional_t{}, zero, double>; - using flux_component_type = std::conditional_t{}, zero, tensor >; + using flux_component_type = std::conditional_t{}, zero, tensor>; constexpr int ntrial = std::max(size(source_type{}), size(flux_type{}) / dim) / c; constexpr auto integration_points = GaussLegendreNodes(); diff --git a/src/serac/numerics/functional/detail/triangle_H1.inl b/src/serac/numerics/functional/detail/triangle_H1.inl index 37c1600fa..6b4b5623d 100644 --- a/src/serac/numerics/functional/detail/triangle_H1.inl +++ b/src/serac/numerics/functional/detail/triangle_H1.inl @@ -10,6 +10,8 @@ * @brief Specialization of finite_element for H1 on triangle geometry */ +#include "RAJA/RAJA.hpp" + // this specialization defines shape functions (and their gradients) that // interpolate at Gauss-Lobatto nodes for the appropriate polynomial order // @@ -19,8 +21,8 @@ // for detailed information about node locations for different polynomial orders, see // simplex_basis_function_unit_tests.cpp /// @cond -template -struct finite_element > { +template +struct finite_element, exec> { static constexpr auto geometry = mfem::Geometry::TRIANGLE; static constexpr auto family = Family::H1; static constexpr int components = c; @@ -33,15 +35,23 @@ struct finite_element > { static constexpr int SOURCE = 0, FLUX = 1; using residual_type = - typename std::conditional, tensor >::type; + typename std::conditional, tensor>::type; using dof_type = tensor; - using value_type = typename std::conditional >::type; + using value_type = typename std::conditional>::type; using derivative_type = - typename std::conditional, tensor >::type; + typename std::conditional, tensor>::type; using qf_input_type = tuple; + template + struct batch_apply_shape_fn_output { + using source_t = decltype(get<0>(get<0>(in_t{})) + dot(get<1>(get<0>(in_t{})), tensor{})); + using flux_t = decltype(get<0>(get<1>(in_t{})) + dot(get<1>(get<1>(in_t{})), tensor{})); + static constexpr int Q = q * (q + 1) / 2; + using type = tensor, Q>; + }; + /* interpolation nodes and their associated numbering: @@ -240,15 +250,12 @@ struct finite_element > { } template - static auto batch_apply_shape_fn(int j, tensor input, const TensorProductQuadratureRule&) + static void RAJA_HOST_DEVICE batch_apply_shape_fn(int j, tensor input, + typename batch_apply_shape_fn_output::type* output, + const TensorProductQuadratureRule&, RAJA::LaunchContext) { - using source_t = 
decltype(get<0>(get<0>(in_t{})) + dot(get<1>(get<0>(in_t{})), tensor{})); - using flux_t = decltype(get<0>(get<1>(in_t{})) + dot(get<1>(get<1>(in_t{})), tensor{})); - - constexpr auto xi = GaussLegendreNodes(); - - static constexpr int Q = q * (q + 1) / 2; - tensor, Q> output; + constexpr auto xi = GaussLegendreNodes(); + static constexpr int Q = q * (q + 1) / 2; for (int i = 0; i < Q; i++) { double phi_j = shape_function(xi[i], j); @@ -259,22 +266,27 @@ struct finite_element > { auto& d10 = get<0>(get<1>(input(i))); auto& d11 = get<1>(get<1>(input(i))); - output[i] = {d00 * phi_j + dot(d01, dphi_j_dxi), d10 * phi_j + dot(d11, dphi_j_dxi)}; + (*output)[i] = {d00 * phi_j + dot(d01, dphi_j_dxi), d10 * phi_j + dot(d11, dphi_j_dxi)}; } + } - return output; + template + static auto interpolate_output_helper() + { + return tensor{}; } template - SERAC_HOST_DEVICE static auto interpolate(const tensor& X, const TensorProductQuadratureRule&) + SERAC_HOST_DEVICE static void interpolate(const tensor& X, const TensorProductQuadratureRule&, + tensor* output_ptr, RAJA::LaunchContext ctx) { constexpr auto xi = GaussLegendreNodes(); static constexpr int num_quadrature_points = q * (q + 1) / 2; // transpose the quadrature data into a flat tensor of tuples union { - tensor, tensor >, num_quadrature_points> unflattened; - tensor flattened; + tensor, tensor>, num_quadrature_points> unflattened; + tensor flattened; } output{}; for (int i = 0; i < c; i++) { @@ -286,20 +298,29 @@ struct finite_element > { } } - return output.flattened; + RAJA::TypedRangeSegment x_range(0, BLOCK_SZ); + + if (output_ptr) { + RAJA::loop>(ctx, x_range, [&](int tid) { + if (tid < serac::size(output.flattened)) { + get(((*output_ptr))[tid]) = get(output.flattened[tid]); + get(((*output_ptr))[tid]) = get(output.flattened[tid]); + } + }); + } } template SERAC_HOST_DEVICE static void integrate(const tensor, q*(q + 1) / 2>& qf_output, const TensorProductQuadratureRule&, - tensor* element_residual, int step = 1) + tensor* element_residual, RAJA::LaunchContext, int step = 1) { if constexpr (is_zero{} && is_zero{}) { return; } using source_component_type = std::conditional_t{}, zero, double>; - using flux_component_type = std::conditional_t{}, zero, tensor >; + using flux_component_type = std::conditional_t{}, zero, tensor>; constexpr int num_quadrature_points = q * (q + 1) / 2; constexpr int ntrial = std::max(size(source_type{}), size(flux_type{}) / dim) / c; diff --git a/src/serac/numerics/functional/detail/triangle_L2.inl b/src/serac/numerics/functional/detail/triangle_L2.inl index 2030008da..2005ca150 100644 --- a/src/serac/numerics/functional/detail/triangle_L2.inl +++ b/src/serac/numerics/functional/detail/triangle_L2.inl @@ -10,6 +10,8 @@ * @brief Specialization of finite_element for L2 on triangle geometry */ +#include "RAJA/RAJA.hpp" + // this specialization defines shape functions (and their gradients) that // interpolate at Gauss-Lobatto nodes for the appropriate polynomial order // @@ -18,8 +20,8 @@ // // for exact positions of nodes for different polynomial orders, see simplex_basis_function_unit_tests.cpp /// @cond -template -struct finite_element > { +template +struct finite_element, exec> { static constexpr auto geometry = mfem::Geometry::TRIANGLE; static constexpr auto family = Family::L2; static constexpr int components = c; @@ -31,15 +33,23 @@ struct finite_element > { static constexpr int SOURCE = 0, FLUX = 1; using residual_type = - typename std::conditional, tensor >::type; + typename std::conditional, tensor>::type; 
using dof_type = tensor; - using value_type = typename std::conditional >::type; + using value_type = typename std::conditional>::type; using derivative_type = - typename std::conditional, tensor >::type; + typename std::conditional, tensor>::type; using qf_input_type = tuple; + template + struct batch_apply_shape_fn_output { + using source_t = decltype(get<0>(get<0>(in_t{})) + dot(get<1>(get<0>(in_t{})), tensor{})); + using flux_t = decltype(get<0>(get<1>(in_t{})) + dot(get<1>(get<1>(in_t{})), tensor{})); + static constexpr int Q = q * (q + 1) / 2; + using type = tensor, Q>; + }; + /* interpolation nodes and their associated numbering: @@ -249,16 +259,12 @@ struct finite_element > { } template - static auto batch_apply_shape_fn(int j, tensor input, const TensorProductQuadratureRule&) + static auto RAJA_HOST_DEVICE batch_apply_shape_fn(int j, tensor input, + typename batch_apply_shape_fn_output::type* output, + const TensorProductQuadratureRule&, RAJA::LaunchContext) { - using source_t = decltype(get<0>(get<0>(in_t{})) + dot(get<1>(get<0>(in_t{})), tensor{})); - using flux_t = decltype(get<0>(get<1>(in_t{})) + dot(get<1>(get<1>(in_t{})), tensor{})); - - constexpr auto xi = GaussLegendreNodes(); - - static constexpr int Q = q * (q + 1) / 2; - tensor, Q> output; - + constexpr auto xi = GaussLegendreNodes(); + static constexpr int Q = q * (q + 1) / 2; for (int i = 0; i < Q; i++) { double phi_j = shape_function(xi[i], j); tensor dphi_j_dxi = shape_function_gradient(xi[i], j); @@ -268,22 +274,27 @@ struct finite_element > { auto& d10 = get<0>(get<1>(input(i))); auto& d11 = get<1>(get<1>(input(i))); - output[i] = {d00 * phi_j + dot(d01, dphi_j_dxi), d10 * phi_j + dot(d11, dphi_j_dxi)}; + (*output)[i] = {d00 * phi_j + dot(d01, dphi_j_dxi), d10 * phi_j + dot(d11, dphi_j_dxi)}; } + } - return output; + template + static auto interpolate_output_helper() + { + return tensor{}; } template - SERAC_HOST_DEVICE static auto interpolate(const tensor& X, const TensorProductQuadratureRule&) + SERAC_HOST_DEVICE static void interpolate(const tensor& X, const TensorProductQuadratureRule&, + tensor* output_ptr, RAJA::LaunchContext ctx) { constexpr auto xi = GaussLegendreNodes(); static constexpr int num_quadrature_points = q * (q + 1) / 2; // transpose the quadrature data into a flat tensor of tuples union { - tensor, tensor >, num_quadrature_points> unflattened; - tensor flattened; + tensor, tensor>, num_quadrature_points> unflattened; + tensor flattened; } output{}; for (int i = 0; i < c; i++) { @@ -295,20 +306,28 @@ struct finite_element > { } } - return output.flattened; + RAJA::TypedRangeSegment x_range(0, BLOCK_SZ); + if (output_ptr) { + RAJA::loop>(ctx, x_range, [&](int tid) { + if (tid < serac::size(output.flattened)) { + get(((*output_ptr))[tid]) = get(output.flattened[tid]); + get(((*output_ptr))[tid]) = get(output.flattened[tid]); + } + }); + } } template SERAC_HOST_DEVICE static void integrate(const tensor, q*(q + 1) / 2>& qf_output, const TensorProductQuadratureRule&, - tensor* element_residual, int step = 1) + tensor* element_residual, RAJA::LaunchContext, int step = 1) { if constexpr (is_zero{} && is_zero{}) { return; } using source_component_type = std::conditional_t{}, zero, double>; - using flux_component_type = std::conditional_t{}, zero, tensor >; + using flux_component_type = std::conditional_t{}, zero, tensor>; constexpr int num_quadrature_points = q * (q + 1) / 2; constexpr int ntrial = std::max(size(source_type{}), size(flux_type{}) / dim) / c; diff --git 
a/src/serac/numerics/functional/dof_numbering.hpp b/src/serac/numerics/functional/dof_numbering.hpp index 089e3065c..ba607c5d1 100644 --- a/src/serac/numerics/functional/dof_numbering.hpp +++ b/src/serac/numerics/functional/dof_numbering.hpp @@ -333,6 +333,7 @@ struct GradientAssemblyLookupTables { col_ind.resize(nnz); row_ptr[0] = 0; + col_ind[0] = int(entries[0].column); for (uint32_t i = 1; i < nnz; i++) { diff --git a/src/serac/numerics/functional/domain.cpp b/src/serac/numerics/functional/domain.cpp index 3d39e2d0a..74ade55ec 100644 --- a/src/serac/numerics/functional/domain.cpp +++ b/src/serac/numerics/functional/domain.cpp @@ -240,10 +240,10 @@ static Domain domain_of_elems(const mfem::Mesh& mfem::Vector vertices; mesh.GetVertices(vertices); - int tri_id = 0; - int quad_id = 0; - int tet_id = 0; - int hex_id = 0; + [[maybe_unused]] int tri_id = 0; + [[maybe_unused]] int quad_id = 0; + [[maybe_unused]] int tet_id = 0; + [[maybe_unused]] int hex_id = 0; // elements that satisfy the predicate are added to the domain int num_elems = mesh.GetNE(); diff --git a/src/serac/numerics/functional/domain_integral_kernels.hpp b/src/serac/numerics/functional/domain_integral_kernels.hpp index c218025fb..d18bdf62b 100644 --- a/src/serac/numerics/functional/domain_integral_kernels.hpp +++ b/src/serac/numerics/functional/domain_integral_kernels.hpp @@ -12,6 +12,8 @@ #include "serac/numerics/functional/differentiate_wrt.hpp" #include "RAJA/RAJA.hpp" +#include +#include #include #include @@ -64,6 +66,41 @@ struct QFunctionArgument, Dimension<3>> { using type = tuple, tensor>; ///< what will be passed to the q-function }; +/** + * @tparam lambda the qfunction type + * @tparam state_type describes state type, if there is one + * @tparam dim describes whether the problem is 1D, 2D, or 3D + * @tparam n length of output tensor + * @tparam T... extra args + * + * @brief a struct used to deduce the return type of a q-function. + */ +template +struct QFunctionOutput { + /// @brief Typedef for the position argument of a qf func + using position_type = serac::tuple, tensor>; + /// @brief static method for deducing the return type of a given qf function. + /// Many objects of type lambda are actually functors that aren't trivially + /// constructable with closed brace constructor {}, so qf must be passed in + /// to a function. + static auto get_type(const lambda& qf) + { + state_type state_l_value{}; + return tensor{}; + } +}; + +/// @overload +template +struct QFunctionOutput { + /// @brief Typedef for the position argument of a qf func + using position_type = serac::tuple, tensor>; + /// @brief This method is identical to the above get_type method but doesn't + /// trivially construct a state, because this partial specialization is for + /// Nothing type states. + static auto get_type(const lambda& qf) { return tensor{}; } +}; + /// @brief layer of indirection needed to unpack the entries of the argument tuple SERAC_SUPPRESS_NVCC_HOSTDEVICE_WARNING template @@ -102,14 +139,14 @@ auto get_derivative_type(const lambda& qf, qpt_data_type qpt_data) make_dual_wrt(qf_arguments{}))); }; -template -SERAC_HOST_DEVICE auto batch_apply_qf_no_qdata(const lambda& qf, double t, const tensor& x, - const tensor& J, const T&... inputs) +template +SERAC_HOST_DEVICE auto batch_apply_qf_no_qdata( + const lambda& qf, double t, const tensor x, const tensor J, + decltype(QFunctionOutput::get_type(qf))* qf_output, RAJA::LaunchContext ctx, + const T&... 
inputs) { - using position_t = serac::tuple, tensor>; - using return_type = decltype(qf(double{}, position_t{}, T{}[0]...)); - tensor outputs{}; - for (int i = 0; i < n; i++) { + RAJA::RangeSegment x_range(0, n); + RAJA::loop::threads_t>(ctx, x_range, [&](int i) { tensor x_q; tensor J_q; for (int j = 0; j < dim; j++) { @@ -118,20 +155,18 @@ SERAC_HOST_DEVICE auto batch_apply_qf_no_qdata(const lambda& qf, double t, const } x_q[j] = x(j, i); } - outputs[i] = qf(t, serac::tuple{x_q, J_q}, inputs[i]...); - } - return outputs; + (*qf_output)[i] = qf(t, serac::tuple{x_q, J_q}, inputs[i]...); + }); } -template -SERAC_HOST_DEVICE auto batch_apply_qf(const lambda& qf, double t, const tensor& x, - const tensor& J, qpt_data_type* qpt_data, bool update_state, - const T&... inputs) +template +SERAC_HOST_DEVICE auto batch_apply_qf( + const lambda& qf, double t, const tensor x, const tensor J, + decltype(QFunctionOutput::get_type(qf))* qf_output, qpt_data_type* qpt_data, + bool update_state, RAJA::LaunchContext ctx, const T&... inputs) { - using position_t = serac::tuple, tensor>; - using return_type = decltype(qf(double{}, position_t{}, qpt_data[0], T{}[0]...)); - tensor outputs{}; - for (int i = 0; i < n; i++) { + RAJA::RangeSegment x_range(0, n); + RAJA::loop::threads_t>(ctx, x_range, [&](int i) { tensor x_q; tensor J_q; for (int j = 0; j < dim; j++) { @@ -140,19 +175,18 @@ SERAC_HOST_DEVICE auto batch_apply_qf(const lambda& qf, double t, const tensor -void evaluation_kernel_impl(trial_element_tuple trial_elements, test_element, double t, +template +void evaluation_kernel_impl(trial_element_tuple_type trial_elements, test_element_type, double t, const std::vector& inputs, double* outputs, const double* positions, const double* jacobians, lambda_type qf, [[maybe_unused]] axom::ArrayView qf_state, @@ -161,62 +195,117 @@ void evaluation_kernel_impl(trial_element_tuple trial_elements, test_element, do { // mfem provides this information as opaque arrays of doubles, // so we reinterpret the pointer with - auto r = reinterpret_cast(outputs); - auto x = reinterpret_cast::type*>(positions); - auto J = reinterpret_cast::type*>(jacobians); + using X_Type = typename batched_position::type; + using J_Type = typename batched_jacobian::type; + auto r = reinterpret_cast(outputs); + auto x = const_cast(reinterpret_cast(positions)); + auto J = const_cast(reinterpret_cast(jacobians)); + TensorProductQuadratureRule rule{}; - [[maybe_unused]] auto qpts_per_elem = num_quadrature_points(geom, Q); + auto qpts_per_elem = num_quadrature_points(geom, Q); + constexpr int dim = dimension<0>(X_Type{}); + constexpr int n = dimension<1>(X_Type{}); [[maybe_unused]] tuple u = { reinterpret_cast(trial_elements))::dof_type*>(inputs[indices])...}; - - // for each element in the domain - for (uint32_t e = 0; e < num_elements; ++e) { - // load the jacobians and positions for each quadrature point in this element - auto J_e = J[e]; - auto x_e = x[e]; - - //[[maybe_unused]] static constexpr trial_element_tuple trial_element_tuple{}; - // batch-calculate values / derivatives of each trial space, at each quadrature point - [[maybe_unused]] tuple qf_inputs = {promote_each_to_dual_when( - get(trial_elements).interpolate(get(u)[elements[e]], rule))...}; - - // use J_e to transform values / derivatives on the parent element - // to the to the corresponding values / derivatives on the physical element - (parent_to_physical(trial_elements).family>(get(qf_inputs), J_e), ...); - - // (batch) evalute the q-function at each quadrature point - // - // 
note: the weird immediately-invoked lambda expression is - // a workaround for a bug in GCC(<12.0) where it fails to - // decide which function overload to use, and crashes - auto qf_outputs = [&]() { - if constexpr (std::is_same_v) { - return batch_apply_qf_no_qdata(qf, t, x_e, J_e, get(qf_inputs)...); - } else { - return batch_apply_qf(qf, t, x_e, J_e, &qf_state(e, 0), update_state, get(qf_inputs)...); - } - }(); - - // use J to transform sources / fluxes on the physical element - // back to the corresponding sources / fluxes on the parent element - physical_to_parent(qf_outputs, J_e); - - // write out the q-function derivatives after applying the - // physical_to_parent transformation, so that those transformations - // won't need to be applied in the action_of_gradient and element_gradient kernels - if constexpr (differentiation_index != serac::NO_DIFFERENTIATION) { - for (int q = 0; q < leading_dimension(qf_outputs); q++) { - qf_derivatives[e * uint32_t(qpts_per_elem) + uint32_t(q)] = get_gradient(qf_outputs[q]); - } - } - - // (batch) integrate the material response against the test-space basis functions - test_element::integrate(get_value(qf_outputs), rule, &r[elements[e]]); - } - - return; + // Type definitions for the intermediate data structures used during kernel evalutaion + using interpolate_out_type = decltype(tuple{get(trial_elements).template interpolate_output_helper()...}); + using qf_inputs_type = decltype(tuple{promote_each_to_dual_when_output_helper( + get(trial_elements).template interpolate_output_helper())...}); + using qf_outputs_type = + decltype(QFunctionOutput(qf_inputs_type{}))...>::get_type( + qf)); + + // Determine allocation destination based on exec parameter + std::string device_name = exec == ExecutionSpace::GPU ? "DEVICE" : "HOST"; + auto& rm = umpire::ResourceManager::getInstance(); + auto allocator = rm.getAllocator(device_name); + // Perform allocations + qf_inputs_type* qf_inputs = static_cast(allocator.allocate(sizeof(qf_inputs_type) * num_elements)); + interpolate_out_type* interpolate_result = + static_cast(allocator.allocate(sizeof(interpolate_out_type) * num_elements)); + qf_outputs_type* qf_outputs = + static_cast(allocator.allocate(sizeof(qf_outputs_type) * num_elements)); + // Zero out memory + rm.memset(qf_inputs, 0); + rm.memset(qf_outputs, 0); + rm.memset(interpolate_result, 0); + + // Launch the execution kernel. On GPU hardware this kernel dispatches one element per thread block and executes + // as many computations in shared memory as possible. + auto e_range = RAJA::TypedRangeSegment(0, num_elements); + RAJA::launch::launch_t>( + RAJA::LaunchParams(RAJA::Teams(static_cast(num_elements)), RAJA::Threads(BLOCK_SZ)), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + RAJA::loop::teams_t>( + ctx, e_range, + // The explicit capture list is needed here because the capture occurs in a function template with a + // variadic non-type parameter. 
+ [&ctx, t, J, x, u, trial_elements, qf, qpts_per_elem, rule, r, qf_state, elements, qf_derivatives, + qf_inputs, qf_outputs, interpolate_result, update_state](uint32_t e) { + detail::suppress_capture_warnings(qf_state, qf_derivatives, update_state, qpts_per_elem, trial_elements, + qf_inputs, interpolate_result, u); + + [[maybe_unused]] static constexpr trial_element_tuple_type empty_trial_element{}; + // batch-calculate values / derivatives of each trial space, at each quadrature point + (get(trial_elements) + .interpolate(get(u)[elements[e]], rule, &get(interpolate_result[e]), ctx), + ...); + ctx.teamSync(); + + (promote_each_to_dual_when(get(interpolate_result[e]), + &get(qf_inputs[e]), ctx), + ...); + ctx.teamSync(); + + // use J_e to transform values / derivatives on the parent element + // to the to the corresponding values / derivatives on the physical element + (parent_to_physical(empty_trial_element).family, exec>(get(qf_inputs[e]), J, e, + ctx), + ...); + ctx.teamSync(); + + // (batch) evalute the q-function at each quadrature point + // + // note: the weird immediately-invoked lambda expression is + // a workaround for a bug in GCC(<12.0) where it fails to + // decide which function overload to use, and crashes + if constexpr (std::is_same_v) { + batch_apply_qf_no_qdata(qf, t, x[e], J[e], &(qf_outputs[e]), ctx, get(qf_inputs[e])...); + } + + if constexpr (!std::is_same_v) { + batch_apply_qf(qf, t, x[e], J[e], &(qf_outputs[e]), &qf_state(e, 0), update_state, ctx, + get(qf_inputs[e])...); + } + ctx.teamSync(); + + // use J to transform sources / fluxes on the physical element + // back to the corresponding sources / fluxes on the parent element + physical_to_parent(qf_outputs[e], J, e, ctx); + ctx.teamSync(); + + // write out the q-function derivatives after applying the + // physical_to_parent transformation, so that those transformations + // won't need to be applied in the action_of_gradient and element_gradient kernels + if constexpr (differentiation_index != serac::NO_DIFFERENTIATION) { + RAJA::RangeSegment x_range(0, leading_dimension(qf_outputs[e])); + RAJA::loop::threads_t>(ctx, x_range, [&](int q) { + qf_derivatives[e * uint32_t(qpts_per_elem) + uint32_t(q)] = get_gradient(qf_outputs[e][q]); + }); + } + ctx.teamSync(); + + // (batch) integrate the material response against the test-space basis functions + test_element_type::integrate(get_value(qf_outputs[e]), rule, &r[elements[e]], ctx); + ctx.teamSync(); + }); + }); + + allocator.deallocate(qf_inputs); + allocator.deallocate(interpolate_result); + allocator.deallocate(qf_outputs); } //clang-format off @@ -237,15 +326,14 @@ SERAC_HOST_DEVICE auto chain_rule(const S& dfdx, const T& dx) } //clang-format on -template -SERAC_HOST_DEVICE auto batch_apply_chain_rule(derivative_type* qf_derivatives, const tensor& inputs) +template +SERAC_HOST_DEVICE void batch_apply_chain_rule(derivative_type* qf_derivatives, const tensor& inputs, + tensor(derivative_type{}, T{})), n>& outputs, + const RAJA::LaunchContext& ctx) { - using return_type = decltype(chain_rule(derivative_type{}, T{})); - tensor outputs{}; - for (int i = 0; i < n; i++) { - outputs[i] = chain_rule(qf_derivatives[i], inputs[i]); - } - return outputs; + RAJA::RangeSegment i_range(0, n); + RAJA::loop::threads_t>( + ctx, i_range, [&](int i) { outputs[i] = chain_rule(qf_derivatives[i], inputs[i]); }); } /** @@ -274,33 +362,55 @@ SERAC_HOST_DEVICE auto batch_apply_chain_rule(derivative_type* qf_derivatives, c * @param[in] num_elements The number of elements in the mesh */ 
-template +template void action_of_gradient_kernel(const double* dU, double* dR, derivatives_type* qf_derivatives, const int* elements, std::size_t num_elements) { - using test_element = finite_element; - using trial_element = finite_element; + std::string device_name = exec == ExecutionSpace::GPU ? "DEVICE" : "HOST"; + + using test_element = finite_element; + using trial_element = finite_element; + using qf_inputs_type = decltype(trial_element::template interpolate_output_helper()); constexpr bool is_QOI = (test::family == Family::QOI); constexpr int num_qpts = num_quadrature_points(g, Q); - // mfem provides this information in 1D arrays, so we reshape it - // into strided multidimensional arrays before using + // mfem provides this information in 1D arrays, so we reshape it into strided multidimensional arrays before using auto du = reinterpret_cast(dU); auto dr = reinterpret_cast(dR); constexpr TensorProductQuadratureRule rule{}; - // for each element in the domain - for (uint32_t e = 0; e < num_elements; e++) { - // (batch) interpolate each quadrature point's value - auto qf_inputs = trial_element::interpolate(du[elements[e]], rule); - - // (batch) evalute the q-function at each quadrature point - auto qf_outputs = batch_apply_chain_rule(qf_derivatives + e * num_qpts, qf_inputs); - - // (batch) integrate the material response against the test-space basis functions - test_element::integrate(qf_outputs, rule, &dr[elements[e]]); - } + // Determine allocation destination based on exec parameter + auto& rm = umpire::ResourceManager::getInstance(); + // Perform allocations + auto allocator = rm.getAllocator(device_name); + qf_inputs_type* qf_inputs = static_cast(allocator.allocate(sizeof(qf_inputs_type) * num_elements)); + // Zero out memory + rm.memset(qf_inputs, 0); + + // This typedef is needed to declare qf_outputs in shared memory. + using chain_rule_dtype = decltype(chain_rule(derivatives_type{}, typename trial_element::qf_input_type{})); + using qf_outputs_type = tensor; + // Launch the execution kernel. On GPU hardware this kernel dispatches one element per thread block and executes + // as many computations in shared memory as possible. 
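+  // One team is launched per element, with BLOCK_SZ threads cooperating inside each team:
+  // the interpolated quadrature values are staged in the Umpire-allocated qf_inputs buffer,
+  // the chain-rule result lives in RAJA_TEAM_SHARED storage, and ctx.teamSync() separates
+  // the interpolate / chain-rule / integrate phases so no thread reads partially written data.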
+ auto e_range = RAJA::TypedRangeSegment(0, num_elements); + RAJA::launch::launch_t>( + RAJA::LaunchParams(RAJA::Teams(static_cast(num_elements)), RAJA::Threads(BLOCK_SZ)), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + RAJA::loop::teams_t>(ctx, e_range, [&](int e) { + // (batch) interpolate each quadrature point's value + trial_element::interpolate(du[elements[e]], rule, &(qf_inputs[e]), ctx); + ctx.teamSync(); + + RAJA_TEAM_SHARED qf_outputs_type qf_outputs; + batch_apply_chain_rule(qf_derivatives + e * num_qpts, qf_inputs[e], qf_outputs, ctx); + ctx.teamSync(); + + test_element::integrate(qf_outputs, rule, &dr[elements[e]], ctx); + ctx.teamSync(); + }); + }); + rm.deallocate(qf_inputs); } /** @@ -324,76 +434,102 @@ void action_of_gradient_kernel(const double* dU, double* dR, derivatives_type* q * @see mfem::GeometricFactors * @param[in] num_elements The number of elements in the mesh */ -template -void element_gradient_kernel(ExecArrayView dK, derivatives_type* qf_derivatives, - const int* elements, std::size_t num_elements) +template +void element_gradient_kernel(ExecArrayView dK, derivatives_type* qf_derivatives, const int* elements, + std::size_t num_elements) { // quantities of interest have no flux term, so we pad the derivative // tuple with a "zero" type in the second position to treat it like the standard case - constexpr bool is_QOI = test::family == Family::QOI; + constexpr bool is_QOI = test::family == Family::QOI; + constexpr int nquad = num_quadrature_points(g, Q); + std::string device_name = exec == ExecutionSpace::GPU ? "DEVICE" : "HOST"; + constexpr TensorProductQuadratureRule rule{}; using padded_derivative_type = std::conditional_t, derivatives_type>; - - using test_element = finite_element; - using trial_element = finite_element; - - constexpr int nquad = num_quadrature_points(g, Q); - - static constexpr TensorProductQuadratureRule rule{}; - - // for each element in the domain - for (uint32_t e = 0; e < num_elements; e++) { - auto* output_ptr = reinterpret_cast(&dK(elements[e], 0, 0)); - - tensor derivatives{}; - for (int q = 0; q < nquad; q++) { - if constexpr (is_QOI) { - get<0>(derivatives(q)) = qf_derivatives[e * nquad + uint32_t(q)]; - } else { - derivatives(q) = qf_derivatives[e * nquad + uint32_t(q)]; - } - } - - for (int J = 0; J < trial_element::ndof; J++) { - auto source_and_flux = trial_element::batch_apply_shape_fn(J, derivatives, rule); - test_element::integrate(source_and_flux, rule, output_ptr + J, trial_element::ndof); - } - } + using test_element = finite_element; + using trial_element = finite_element; + + auto& rm = umpire::ResourceManager::getInstance(); + auto allocator = rm.getAllocator(device_name); + // TODO(cuda): Use a dynamic shared memory buffer to avoid allocating device memory here. + tensor* derivatives = static_cast*>( + allocator.allocate(sizeof(tensor) * num_elements)); + + // Launch the execution kernel. On GPU hardware this kernel dispatches one element per thread block and executes + // as many computations in shared memory as possible. 
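+  // One team per element here as well: the threads of a team first stage that element's
+  // qf_derivatives into the scratch `derivatives` buffer, then loop over the trial dofs J,
+  // applying the shape functions into the RAJA_TEAM_SHARED source_and_flux tensor and
+  // integrating it against the test basis, with ctx.teamSync() between the two steps.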
+ RAJA::TypedRangeSegment elements_range(0, num_elements); + RAJA::launch::launch_t>( + RAJA::LaunchParams(RAJA::Teams(static_cast(num_elements)), RAJA::Threads(BLOCK_SZ)), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + RAJA::loop::teams_t>(ctx, elements_range, [&](uint32_t e) { + static constexpr bool is_QOI_2 = test::family == Family::QOI; + [[maybe_unused]] auto* output_ptr = + reinterpret_cast(&dK(elements[e], 0, 0)); + + RAJA::RangeSegment x_range(0, nquad); + RAJA::loop::threads_t>( + ctx, x_range, [&derivatives, qf_derivatives, nquad, e](int q) { + detail::suppress_capture_warnings(nquad); + (void)nquad; + if constexpr (is_QOI_2) { + get<0>(derivatives[e](q)) = qf_derivatives[e * nquad + uint32_t(q)]; + } + if constexpr (!is_QOI_2) { + derivatives[e](q) = qf_derivatives[e * nquad + uint32_t(q)]; + } + }); + + ctx.teamSync(); + + RAJA_TEAM_SHARED typename trial_element::template batch_apply_shape_fn_output::type + source_and_flux; + for (int J = 0; J < trial_element::ndof; ++J) { + trial_element::batch_apply_shape_fn(J, derivatives[e], &source_and_flux, rule, ctx); + ctx.teamSync(); + + test_element::integrate(source_and_flux, rule, output_ptr + J, ctx, trial_element::ndof); + ctx.teamSync(); + } + }); + }); + rm.deallocate(derivatives); } -template +template auto evaluation_kernel(signature s, const lambda_type& qf, const double* positions, const double* jacobians, std::shared_ptr> qf_state, std::shared_ptr qf_derivatives, const int* elements, uint32_t num_elements) { - auto trial_elements = trial_elements_tuple(s); - auto test_element = get_test_element(s); + auto trial_element_tuple = trial_elements_tuple(s); + auto test_element = get_test_element(s); return [=](double time, const std::vector& inputs, double* outputs, bool update_state) { - domain_integral::evaluation_kernel_impl( - trial_elements, test_element, time, inputs, outputs, positions, jacobians, qf, (*qf_state)[geom], + domain_integral::evaluation_kernel_impl( + trial_element_tuple, test_element, time, inputs, outputs, positions, jacobians, qf, (*qf_state)[geom], qf_derivatives.get(), elements, num_elements, update_state, s.index_seq); }; } -template +template std::function jacobian_vector_product_kernel( signature, std::shared_ptr qf_derivatives, const int* elements, uint32_t num_elements) { return [=](const double* du, double* dr) { using test_space = typename signature::return_type; using trial_space = typename std::tuple_element::type; - action_of_gradient_kernel(du, dr, qf_derivatives.get(), elements, num_elements); + action_of_gradient_kernel(du, dr, qf_derivatives.get(), elements, + num_elements); }; } -template -std::function)> element_gradient_kernel( +template +std::function)> element_gradient_kernel( signature, std::shared_ptr qf_derivatives, const int* elements, uint32_t num_elements) { - return [=](ExecArrayView K_elem) { + return [=](ExecArrayView K_elem) { using test_space = typename signature::return_type; using trial_space = typename std::tuple_element::type; - element_gradient_kernel(K_elem, qf_derivatives.get(), elements, num_elements); + element_gradient_kernel(K_elem, qf_derivatives.get(), elements, + num_elements); }; } diff --git a/src/serac/numerics/functional/element_restriction.cpp b/src/serac/numerics/functional/element_restriction.cpp index 9068da405..4fe638f35 100644 --- a/src/serac/numerics/functional/element_restriction.cpp +++ b/src/serac/numerics/functional/element_restriction.cpp @@ -1,10 +1,18 @@ +// Copyright (c) 2019-2023, Lawrence Livermore National Security, LLC and +// other 
Serac Project Developers. See the top-level LICENSE file for +// details. +// +// SPDX-License-Identifier: (BSD-3-Clause) + +#include "serac/serac_config.hpp" +#include "serac/infrastructure/accelerator.hpp" #include "serac/numerics/functional/element_restriction.hpp" #include "mfem.hpp" #include "serac/numerics/functional/geometry.hpp" -std::vector > lexicographic_permutations(int p) +std::vector> lexicographic_permutations(int p) { // p == 0 is admissible for L2 spaces, but lexicographic permutations // aren't needed in that corner case @@ -12,7 +20,7 @@ std::vector > lexicographic_permutations(int p) return {}; } - std::vector > output(mfem::Geometry::Type::NUM_GEOMETRIES); + std::vector> output(mfem::Geometry::Type::NUM_GEOMETRIES); { auto P = mfem::H1_SegmentElement(p).GetLexicographicOrdering(); @@ -126,7 +134,7 @@ Array2D face_permutations(mfem::Geometry::Type geom, int p) exit(1); } -std::vector > geom_local_face_dofs(int p) +std::vector> geom_local_face_dofs(int p) { // FullSimplify[InterpolatingPolynomial[{ // {{0, 2}, (p + 1) + p}, @@ -153,7 +161,7 @@ std::vector > geom_local_face_dofs(int p) auto hex_id = [p](int x, int y, int z) { return (p + 1) * ((p + 1) * z + y) + x; }; - std::vector > output(mfem::Geometry::Type::NUM_GEOMETRIES); + std::vector> output(mfem::Geometry::Type::NUM_GEOMETRIES); Array2D tris(3, p + 1); for (int k = 0; k <= p; k++) { @@ -220,8 +228,8 @@ axom::Array GetElementRestriction(const mfem::F mfem::Mesh* mesh = fes->GetMesh(); // note: this assumes that all the elements are the same polynomial order - int p = fes->GetElementOrder(0); - std::vector > lex_perm = lexicographic_permutations(p); + int p = fes->GetElementOrder(0); + std::vector> lex_perm = lexicographic_permutations(p); uint64_t n = 0; @@ -284,10 +292,10 @@ axom::Array GetFaceDofs(const mfem::FiniteEleme mfem::Table* face_to_elem = mesh->GetFaceToElementTable(); // note: this assumes that all the elements are the same polynomial order - int p = fes->GetElementOrder(0); - Array2D face_perm = face_permutations(face_geom, p); - std::vector > local_face_dofs = geom_local_face_dofs(p); - std::vector > lex_perm = lexicographic_permutations(p); + int p = fes->GetElementOrder(0); + Array2D face_perm = face_permutations(face_geom, p); + std::vector> local_face_dofs = geom_local_face_dofs(p); + std::vector> lex_perm = lexicographic_permutations(p); uint64_t n = 0; @@ -440,28 +448,11 @@ void ElementRestriction::GetElementVDofs(int i, std::vector& vdofs) const } } -void ElementRestriction::Gather(const mfem::Vector& L_vector, mfem::Vector& E_vector) const +void ElementRestriction::GetElementVDofs(int i, DoF* vdofs) const { - for (uint64_t i = 0; i < num_elements; i++) { - for (uint64_t c = 0; c < components; c++) { - for (uint64_t j = 0; j < nodes_per_elem; j++) { - uint64_t E_id = (i * components + c) * nodes_per_elem + j; - uint64_t L_id = GetVDof(dof_info(i, j), c).index(); - E_vector[int(E_id)] = L_vector[int(L_id)]; - } - } - } -} - -void ElementRestriction::ScatterAdd(const mfem::Vector& E_vector, mfem::Vector& L_vector) const -{ - for (uint64_t i = 0; i < num_elements; i++) { - for (uint64_t c = 0; c < components; c++) { - for (uint64_t j = 0; j < nodes_per_elem; j++) { - uint64_t E_id = (i * components + c) * nodes_per_elem + j; - uint64_t L_id = GetVDof(dof_info(i, j), c).index(); - L_vector[int(L_id)] += E_vector[int(E_id)]; - } + for (uint64_t c = 0; c < components; c++) { + for (uint64_t j = 0; j < nodes_per_elem; j++) { + vdofs[c * nodes_per_elem + j] = GetVDof(dof_info(i, j), c); } } } @@ 
-521,18 +512,4 @@ mfem::Array BlockElementRestriction::bOffsets() const return offsets; }; -void BlockElementRestriction::Gather(const mfem::Vector& L_vector, mfem::BlockVector& E_block_vector) const -{ - for (auto [geom, restriction] : restrictions) { - restriction.Gather(L_vector, E_block_vector.GetBlock(geom)); - } -} - -void BlockElementRestriction::ScatterAdd(const mfem::BlockVector& E_block_vector, mfem::Vector& L_vector) const -{ - for (auto [geom, restriction] : restrictions) { - restriction.ScatterAdd(E_block_vector.GetBlock(geom), L_vector); - } -} - } // namespace serac diff --git a/src/serac/numerics/functional/element_restriction.hpp b/src/serac/numerics/functional/element_restriction.hpp index 7f12dca59..796311075 100644 --- a/src/serac/numerics/functional/element_restriction.hpp +++ b/src/serac/numerics/functional/element_restriction.hpp @@ -5,6 +5,7 @@ #include "mfem.hpp" #include "axom/core.hpp" #include "geometry.hpp" +#include "serac/infrastructure/accelerator.hpp" inline bool isH1(const mfem::FiniteElementSpace& fes) { @@ -55,28 +56,29 @@ struct DoF { uint64_t bits; /// default ctor - DoF() : bits{} {} + SERAC_HOST_DEVICE DoF() : bits{} {} /// copy ctor - DoF(const DoF& other) : bits{other.bits} {} + SERAC_HOST_DEVICE DoF(const DoF& other) : bits{other.bits} {} /// create a `DoF` from the given index, sign and orientation values - DoF(uint64_t index, uint64_t sign = 0, uint64_t orientation = 0) + SERAC_HOST_DEVICE DoF(uint64_t index, uint64_t sign = 0, uint64_t orientation = 0) : bits((sign & 0x1ULL << sign_shift) + ((orientation & 0x7ULL) << orientation_shift) + index) { } /// copy assignment operator - void operator=(const DoF& other) { bits = other.bits; } + SERAC_HOST_DEVICE void operator=(const DoF& other) { bits = other.bits; } /// get the sign field of this `DoF` - int sign() const { return (bits & sign_mask) ? -1 : 1; } + SERAC_HOST_DEVICE int sign() const { return (bits & sign_mask) ? -1 : 1; } /// get the orientation field of this `DoF` - uint64_t orientation() const { return ((bits & orientation_mask) >> orientation_shift); } + SERAC_HOST_DEVICE uint64_t orientation() const { return ((bits & orientation_mask) >> orientation_shift); } /// get the index field of this `DoF` - uint64_t index() const { return (bits & index_mask); } + + SERAC_HOST_DEVICE uint64_t index() const { return (bits & index_mask); } }; /// a small struct used to enable range-based for loops in `Array2D` @@ -167,14 +169,95 @@ struct ElementRestriction { */ void GetElementVDofs(int i, std::vector& dofs) const; + /** + * @brief Overload for device code. + * + * @param i the index of the element + * @param vdofs (output) the DoFs associated with element `i` + */ + void GetElementVDofs(int i, DoF* vdofs) const; + /// get the dof information for a given node / component + DoF GetVDof(DoF node, uint64_t component) const; + /** + * @brief Static version of GetVDof for device code. + * + * @param node the node to get vdofs for + * @param ordering_ element ordering type to be provided from ElementRestriction struct + * @param component component index to be provided from ElementRestriction struct + * @param num_nodes_ number of nodes in ElementRestriction + * @param components_ number of components to be provided from ElementRestriction struct + */ + SERAC_HOST_DEVICE static DoF GetVDofDevice(DoF node, mfem::Ordering::Type ordering_, uint64_t component, + uint64_t num_nodes_, uint64_t components_) + { + if (ordering_ == mfem::Ordering::Type::byNODES) { + return DoF{component * num_nodes_ + node.index(), (node.sign() == 1) ?
0ull : 1ull, node.orientation()}; + } else { + return DoF{node.index() * components_ + component, (node.sign() == 1) ? 0ull : 1ull, node.orientation()}; + } + } + /// "L->E" in mfem parlance, each element gathers the values that belong to it, and stores them in the "E-vector" - void Gather(const mfem::Vector& L_vector, mfem::Vector& E_vector) const; + template + void Gather(const mfem::Vector& L_vector, mfem::Vector& E_vector) const + { + auto l_ptr = L_vector.Read(); + auto e_ptr = E_vector.ReadWrite(); + + uint64_t d_components = components; + uint64_t d_nodes_per_elem = nodes_per_elem; + uint64_t d_num_nodes = num_nodes; + auto d_ordering = ordering; + constexpr axom::MemorySpace mem_space = detail::execution_to_memory::value; + axom::Array tmp = dof_info; + axom::ArrayView d_dof_info = tmp; + + RAJA::forall::forall_t>( + RAJA::TypedRangeSegment(0, num_elements), [d_components, d_nodes_per_elem, d_num_nodes, d_ordering, + e_ptr, l_ptr, d_dof_info] SERAC_HOST_DEVICE(uint64_t i) { + for (uint64_t c = 0; c < d_components; c++) { + for (uint64_t j = 0; j < d_nodes_per_elem; j++) { + uint64_t E_id = (i * d_components + c) * d_nodes_per_elem + j; + uint64_t L_id = GetVDofDevice(d_dof_info(i, j), d_ordering, c, d_num_nodes, d_components).index(); + e_ptr[int(E_id)] = l_ptr[int(L_id)]; + } + } + }); + } /// "E->L" in mfem parlance, each element scatter-adds its local vector into the appropriate place in the "L-vector" - void ScatterAdd(const mfem::Vector& E_vector, mfem::Vector& L_vector) const; + template + void ScatterAdd(const mfem::Vector& E_vector, mfem::Vector& L_vector) const + { + auto l_ptr = L_vector.ReadWrite(true); + auto e_ptr = E_vector.Read(true); + + uint64_t d_components = components; + uint64_t d_nodes_per_elem = nodes_per_elem; + uint64_t d_num_nodes = num_nodes; + uint64_t d_num_elements = num_elements; + auto d_ordering = ordering; + constexpr axom::MemorySpace mem_space = detail::execution_to_memory::value; + axom::Array tmp = dof_info; + axom::ArrayView d_dof_info = tmp; + + RAJA::forall::forall_t>( + RAJA::TypedRangeSegment(0, 1), [d_components, d_nodes_per_elem, d_num_elements, d_num_nodes, + d_ordering, e_ptr, l_ptr, d_dof_info] SERAC_HOST_DEVICE(uint64_t) { + for (uint64_t i = 0; i < d_num_elements; ++i) { + for (uint64_t c = 0; c < d_components; c++) { + for (uint64_t j = 0; j < d_nodes_per_elem; j++) { + uint64_t E_id = (i * d_components + c) * d_nodes_per_elem + j; + uint64_t L_id = GetVDofDevice(d_dof_info(i, j), d_ordering, c, d_num_nodes, d_components).index(); + l_ptr[int(L_id)] += e_ptr[int(E_id)]; + } + } + } + }); + } /// the size of the "E-vector" uint64_t esize; @@ -206,6 +289,7 @@ struct ElementRestriction { * Instead of doing the "E->L" (gather) and "L->E" (scatter) operations for only one element geometry, this * class does them with block "E-vectors", where each element geometry is a separate block. 
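The templated Gather and ScatterAdd above replace the host-only loops deleted from element_restriction.cpp, so the execution space is now chosen at the call site. A minimal usage sketch, not part of the patch; it assumes the template parameter is a serac::ExecutionSpace and that `fes` is a nodal H1 space on a hex mesh:

    #include "mfem.hpp"
    #include "serac/numerics/functional/element_restriction.hpp"

    void gather_example(mfem::FiniteElementSpace* fes)
    {
      serac::ElementRestriction restriction(fes, mfem::Geometry::CUBE);

      mfem::Vector L(fes->GetVSize());           // "L-vector": one entry per local vdof
      mfem::Vector E(int(restriction.ESize()));  // "E-vector": per-element copies of those vdofs
      L = 0.0;

      restriction.Gather<serac::ExecutionSpace::CPU>(L, E);      // sequential RAJA::forall
      restriction.ScatterAdd<serac::ExecutionSpace::CPU>(E, L);  // transpose of the gather

    #if defined(__CUDACC__)
      restriction.Gather<serac::ExecutionSpace::GPU>(L, E);      // same call, RAJA::cuda_exec policy
    #endif
    }
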
*/ +// template struct BlockElementRestriction { /// default ctor leaves this object uninitialized BlockElementRestriction() {} @@ -226,10 +310,22 @@ struct BlockElementRestriction { mfem::Array bOffsets() const; /// "L->E" in mfem parlance, each element gathers the values that belong to it, and stores them in the "E-vector" - void Gather(const mfem::Vector& L_vector, mfem::BlockVector& E_block_vector) const; + template + void Gather(const mfem::Vector& L_vector, mfem::BlockVector& E_block_vector) const + { + for (auto [geom, restriction] : restrictions) { + restriction.template Gather(L_vector, E_block_vector.GetBlock(geom)); + } + } /// "E->L" in mfem parlance, each element scatter-adds its local vector into the appropriate place in the "L-vector" - void ScatterAdd(const mfem::BlockVector& E_block_vector, mfem::Vector& L_vector) const; + template + void ScatterAdd(const mfem::BlockVector& E_block_vector, mfem::Vector& L_vector) const + { + for (auto [geom, restriction] : restrictions) { + restriction.template ScatterAdd(E_block_vector.GetBlock(geom), L_vector); + } + } /// the individual ElementRestriction operators for each element geometry std::map restrictions; diff --git a/src/serac/numerics/functional/finite_element.hpp b/src/serac/numerics/functional/finite_element.hpp index 12b87f684..d0d4e88fd 100644 --- a/src/serac/numerics/functional/finite_element.hpp +++ b/src/serac/numerics/functional/finite_element.hpp @@ -12,10 +12,13 @@ */ #pragma once +#include "serac/infrastructure/accelerator.hpp" +#include "serac/serac_config.hpp" #include "tuple.hpp" #include "tensor.hpp" #include "geometry.hpp" #include "polynomials.hpp" +#include "RAJA/RAJA.hpp" namespace serac { @@ -40,10 +43,6 @@ struct TensorProductQuadratureRule { } }; -template -struct CompileTimeValue { -}; - /** * @brief this struct is used to look up mfem's memory layout of * the quadrature point jacobian matrices @@ -241,19 +240,28 @@ struct QOI { * @tparam q how many values need to be transformed * @tparam dim the spatial dimension * @param qf_input the values to be transformed from parent to physical space - * @param jacobians the jacobians of the isoparametric map from parent to physical space of each quadrature point + * @param jacobians array of the jacobians of the isoparametric map from parent to physical space of each quadrature + * point + * @param block_idx index into the array of jacobians + * @param ctx the RAJA launch context used to synchronize threads and required by the RAJA API. 
*/ -template -SERAC_HOST_DEVICE void parent_to_physical(tensor& qf_input, const tensor& jacobians) +template +SERAC_HOST_DEVICE void parent_to_physical(tensor& qf_input, const tensor* jacobians, + uint32_t block_idx, RAJA::LaunchContext ctx) { [[maybe_unused]] constexpr int VALUE = 0; [[maybe_unused]] constexpr int DERIVATIVE = 1; - for (int k = 0; k < q; k++) { + RAJA::RangeSegment k_range(0, BLOCK_SZ); + + RAJA::loop::threads_t>(ctx, k_range, [&](int k) { + if (k >= q) { + return; + } tensor J; for (int row = 0; row < dim; row++) { for (int col = 0; col < dim; col++) { - J[row][col] = jacobians(col, row, k); + J[row][col] = jacobians[block_idx](col, row, k); } } @@ -269,7 +277,7 @@ SERAC_HOST_DEVICE void parent_to_physical(tensor& qf_input, const tensor(qf_input[k]) = dot(get(qf_input[k]), transpose(J)); } } - } + }); } /** @@ -282,19 +290,27 @@ SERAC_HOST_DEVICE void parent_to_physical(tensor& qf_input, const tensor -SERAC_HOST_DEVICE void physical_to_parent(tensor& qf_output, const tensor& jacobians) +template +SERAC_HOST_DEVICE void physical_to_parent(tensor& qf_output, const tensor* jacobians, + uint32_t block_idx, RAJA::LaunchContext ctx) { [[maybe_unused]] constexpr int SOURCE = 0; [[maybe_unused]] constexpr int FLUX = 1; - for (int k = 0; k < q; k++) { + RAJA::RangeSegment k_range(0, BLOCK_SZ); + RAJA::loop::threads_t>(ctx, k_range, [&](int k) { + if (k >= q) { + return; + } tensor J_T; for (int row = 0; row < dim; row++) { for (int col = 0; col < dim; col++) { - J_T[row][col] = jacobians(row, col, k); + J_T[row][col] = jacobians[block_idx](row, col, k); } } @@ -318,7 +334,7 @@ SERAC_HOST_DEVICE void physical_to_parent(tensor& qf_output, const tensor< if constexpr (f == Family::QOI) { qf_output[k] = qf_output[k] * dv; } - } + }); } /** @@ -346,7 +362,7 @@ SERAC_HOST_DEVICE void physical_to_parent(tensor& qf_output, const tensor< * }; * */ -template +template struct finite_element; #include "detail/segment_H1.inl" diff --git a/src/serac/numerics/functional/function_signature.hpp b/src/serac/numerics/functional/function_signature.hpp index e5ce5d9bb..628090f07 100644 --- a/src/serac/numerics/functional/function_signature.hpp +++ b/src/serac/numerics/functional/function_signature.hpp @@ -1,5 +1,14 @@ +// Copyright (c) 2019-2023, Lawrence Livermore National Security, LLC and +// other Serac Project Developers. See the top-level LICENSE file for +// details. +// +// SPDX-License-Identifier: (BSD-3-Clause) + #pragma once -#include +#include "camp/camp.hpp" + +#include "serac/infrastructure/accelerator.hpp" +#include "serac/serac_config.hpp" #include "serac/numerics/functional/finite_element.hpp" template @@ -32,10 +41,10 @@ struct FunctionSignature { * template parameters in evaluation_kernel_impl. See section 14.7.3 of the * CUDA programming guide, item 9. */ -template +template auto trial_elements_tuple(FunctionSignature) { - return serac::tuple...>{}; + return serac::tuple...>{}; } /** @@ -45,8 +54,8 @@ auto trial_elements_tuple(FunctionSignature) * See the comments for trial_elements_tuple to understand why this is needed in * domain_integral_kernels::evaluation_kernel and boundary_integral_kernels::evaluation_kernel. 
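The parent_to_physical and physical_to_parent rewrites above use the same launch structure as the kernels: one RAJA team per element, BLOCK_SZ threads per team, and an early return once the thread index passes the number of quadrature points. A stripped-down sketch of that pattern (the element and quadrature counts are made up; the policy aliases and BLOCK_SZ come from this change):

    #include "RAJA/RAJA.hpp"
    #include "serac/infrastructure/accelerator.hpp"
    #include "serac/numerics/functional/tensor.hpp"  // for BLOCK_SZ

    void launch_pattern_sketch()
    {
      constexpr int num_elements = 64;
      constexpr int nquad        = 27;  // e.g. a 3x3x3 rule on hexes

      using policy = serac::EvaluationSpacePolicy<serac::ExecutionSpace::CPU>;

      RAJA::launch<policy::launch_t>(
          RAJA::LaunchParams(RAJA::Teams(num_elements), RAJA::Threads(BLOCK_SZ)),
          [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) {
            RAJA::loop<policy::teams_t>(ctx, RAJA::RangeSegment(0, num_elements), [&](int /*e*/) {
              RAJA::loop<policy::threads_t>(ctx, RAJA::RangeSegment(0, BLOCK_SZ), [&](int q) {
                if (q >= nquad) {
                  return;  // only the first nquad of the BLOCK_SZ threads do work
                }
                // per-quadrature-point work for this element goes here
              });
              ctx.teamSync();
            });
          });
    }
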
*/ -template +template auto get_test_element(FunctionSignature) { - return serac::finite_element{}; + return serac::finite_element{}; } diff --git a/src/serac/numerics/functional/functional.hpp b/src/serac/numerics/functional/functional.hpp index 402cab9b3..cb713fa45 100644 --- a/src/serac/numerics/functional/functional.hpp +++ b/src/serac/numerics/functional/functional.hpp @@ -14,6 +14,7 @@ #include "mfem.hpp" +#include "serac/infrastructure/accelerator.hpp" #include "serac/serac_config.hpp" #include "serac/infrastructure/logger.hpp" #include "serac/numerics/functional/tensor.hpp" @@ -27,7 +28,10 @@ #include "serac/numerics/functional/domain.hpp" +#include +#include #include +#include #include namespace serac { @@ -142,7 +146,7 @@ generateParFiniteElementSpace(mfem::ParMesh* mesh) } /// @cond -template +template class Functional; /// @endcond @@ -295,8 +299,8 @@ class Functional { check_for_missing_nodal_gridfunc(domain); using signature = test(decltype(serac::type(trial_spaces))...); - integrals_.push_back( - MakeDomainIntegral(EntireDomain(domain), integrand, qdata, std::vector{args...})); + integrals_.push_back(MakeDomainIntegral(EntireDomain(domain), integrand, qdata, + std::vector{args...})); } /// @overload @@ -312,8 +316,8 @@ class Functional { check_for_missing_nodal_gridfunc(domain.mesh_); using signature = test(decltype(serac::type(trial_spaces))...); - integrals_.push_back( - MakeDomainIntegral(domain, integrand, qdata, std::vector{args...})); + integrals_.push_back(MakeDomainIntegral(std::move(domain), integrand, qdata, + std::vector{args...})); } /** @@ -334,8 +338,8 @@ class Functional { check_for_missing_nodal_gridfunc(domain); using signature = test(decltype(serac::type(trial_spaces))...); - integrals_.push_back( - MakeBoundaryIntegral(EntireBoundary(domain), integrand, std::vector{args...})); + integrals_.push_back(MakeBoundaryIntegral(EntireBoundary(domain), integrand, + std::vector{args...})); } /// @overload @@ -350,7 +354,8 @@ class Functional { check_for_missing_nodal_gridfunc(domain.mesh_); using signature = test(decltype(serac::type(trial_spaces))...); - integrals_.push_back(MakeBoundaryIntegral(domain, integrand, std::vector{args...})); + integrals_.push_back( + MakeBoundaryIntegral(std::move(domain), integrand, std::vector{args...})); } /** @@ -417,14 +422,14 @@ class Functional { auto type = integral.domain_.type_; if (!already_computed[type]) { - G_trial_[type][which].Gather(input_L_[which], input_E_[type][which]); + (G_trial_[type][which]).template Gather(input_L_[which], input_E_[type][which]); already_computed[type] = true; } integral.GradientMult(input_E_[type][which], output_E_[type], which); // scatter-add to compute residuals on the local processor - G_test_[type].ScatterAdd(output_E_[type], output_L_); + G_test_[type].template ScatterAdd(output_E_[type], output_L_); } // scatter-add to compute global residuals @@ -464,7 +469,7 @@ class Functional { for (auto i : integral.active_trial_spaces_) { if (!already_computed[type][i]) { - G_trial_[type][i].Gather(input_L_[i], input_E_[type][i]); + G_trial_[type][i].template Gather(input_L_[i], input_E_[type][i]); already_computed[type][i] = true; } } @@ -472,7 +477,7 @@ class Functional { integral.Mult(t, input_E_[type], output_E_[type], wrt, update_qdata_); // scatter-add to compute residuals on the local processor - G_test_[type].ScatterAdd(output_E_[type], output_L_); + G_test_[type].template ScatterAdd(output_E_[type], output_L_); } // scatter-add to compute global residuals @@ -616,7 +621,7 @@ class 
Functional { for (auto [geom, elem_matrices] : K_elem) { std::vector test_vdofs(test_restrictions[geom].nodes_per_elem * test_restrictions[geom].components); std::vector trial_vdofs(trial_restrictions[geom].nodes_per_elem * trial_restrictions[geom].components); - + axom::Array elem_matrices_host = elem_matrices; for (axom::IndexType e = 0; e < elem_matrices.shape()[0]; e++) { test_restrictions[geom].GetElementVDofs(e, test_vdofs); trial_restrictions[geom].GetElementVDofs(e, trial_vdofs); @@ -634,8 +639,7 @@ class Functional { // // This is kind of confusing, and will be fixed in a future refactor // of the element gradient kernel implementation - [[maybe_unused]] auto nz = lookup_tables(row, col); - values[lookup_tables(row, col)] += sign * elem_matrices(e, i, j); + values[lookup_tables(row, col)] += sign * elem_matrices_host(e, i, j); } } } @@ -726,7 +730,7 @@ class Functional { mutable std::vector input_E_[Domain::num_types]; - std::vector integrals_; + std::vector> integrals_; mutable mfem::BlockVector output_E_[Domain::num_types]; diff --git a/src/serac/numerics/functional/functional_qoi.inl b/src/serac/numerics/functional/functional_qoi.inl index 2c0cea577..79a46f1dd 100644 --- a/src/serac/numerics/functional/functional_qoi.inl +++ b/src/serac/numerics/functional/functional_qoi.inl @@ -168,8 +168,8 @@ public: check_for_missing_nodal_gridfunc(mesh); using signature = test(decltype(serac::type(trial_spaces))...); - integrals_.push_back( - MakeDomainIntegral(EntireDomain(mesh), integrand, qdata, std::vector{args...})); + integrals_.push_back(MakeDomainIntegral(EntireDomain(mesh), integrand, qdata, + std::vector{args...})); } /// @overload @@ -186,7 +186,7 @@ public: using signature = test(decltype(serac::type(trial_spaces))...); integrals_.push_back( - MakeDomainIntegral(domain, integrand, qdata, std::vector{args...})); + MakeDomainIntegral(domain, integrand, qdata, std::vector{args...})); } /** @@ -210,7 +210,7 @@ public: using signature = test(decltype(serac::type(trial_spaces))...); integrals_.push_back( - MakeBoundaryIntegral(EntireBoundary(mesh), integrand, std::vector{args...})); + MakeBoundaryIntegral(EntireBoundary(mesh), integrand, std::vector{args...})); } /// @overload @@ -225,7 +225,8 @@ public: check_for_missing_nodal_gridfunc(domain.mesh_); using signature = test(decltype(serac::type(trial_spaces))...); - integrals_.push_back(MakeBoundaryIntegral(domain, integrand, std::vector{args...})); + integrals_.push_back( + MakeBoundaryIntegral(domain, integrand, std::vector{args...})); } /** @@ -293,7 +294,7 @@ public: auto type = integral.domain_.type_; if (!already_computed[type]) { - G_trial_[type][which].Gather(input_L_[which], input_E_[type][which]); + G_trial_[type][which].template Gather(input_L_[which], input_E_[type][which]); already_computed[type] = true; } @@ -340,7 +341,7 @@ public: for (auto i : integral.active_trial_spaces_) { if (!already_computed[type][i]) { - G_trial_[type][i].Gather(input_L_[i], input_E_[type][i]); + G_trial_[type][i].template Gather(input_L_[i], input_E_[type][i]); already_computed[type][i] = true; } } @@ -507,7 +508,7 @@ private: mutable std::vector input_E_[Domain::num_types]; - std::vector integrals_; + std::vector> integrals_; mutable mfem::BlockVector output_E_[Domain::num_types]; diff --git a/src/serac/numerics/functional/geometric_factors.cpp b/src/serac/numerics/functional/geometric_factors.cpp deleted file mode 100644 index 7e8897fa5..000000000 --- a/src/serac/numerics/functional/geometric_factors.cpp +++ /dev/null @@ -1,222 +0,0 @@ 
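The geometric_factors.cpp deletion that follows does not remove functionality: because GeometricFactors and compute_geometric_factors now carry an execution-space template parameter, their definitions must be visible wherever they are instantiated, so the same code reappears in geometric_factors.hpp later in this diff. The underlying C++ constraint, in a tiny illustration (ExampleFactors is hypothetical):

    template <serac::ExecutionSpace exec>
    struct ExampleFactors {
      // member definitions of a class template live in the header so that both
      // ExampleFactors<ExecutionSpace::CPU> and ExampleFactors<ExecutionSpace::GPU>
      // can be instantiated from any translation unit that needs them
      void compute() { /* ... */ }
    };
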
-#include "serac/numerics/functional/geometric_factors.hpp" -#include "serac/numerics/functional/finite_element.hpp" - -namespace serac { - -/** - * @brief a kernel to compute the positions and jacobians at each quadrature point (mfem calls this "geometric factors") - * @tparam Q a parameter controlling the number of quadrature points an element - * @tparam function_space the polynomial order and kind of function space used to interpolate - * @tparam geom the element geometry - * @param positions_q (output) the positions for each quadrature point - * @param jacobians_q (output) the jacobians for each quadrature point - * @param positions_e (input) the "e-vector" of position data - * @param elements (input) the list of element indices that are part of this domain - */ -template -void compute_geometric_factors(mfem::Vector& positions_q, mfem::Vector& jacobians_q, const mfem::Vector& positions_e, - const std::vector& elements) -{ - static constexpr TensorProductQuadratureRule rule{}; - - constexpr int spatial_dim = function_space::components; - constexpr int geometry_dim = dimension_of(geom); - constexpr int qpts_per_elem = num_quadrature_points(geom, Q); - - using element_type = finite_element; - using position_type = tensor; - using jacobian_type = tensor; - - auto X_q = reinterpret_cast(positions_q.ReadWrite()); - auto J_q = reinterpret_cast(jacobians_q.ReadWrite()); - auto X = reinterpret_cast(positions_e.Read()); - - std::size_t num_elements = elements.size(); - - // for each element in the domain - for (uint32_t e = 0; e < num_elements; e++) { - // load the positions for the nodes in this element - auto X_e = X[elements[e]]; - - // calculate the values and derivatives (w.r.t. xi) of X at each quadrature point - auto quadrature_values = element_type::interpolate(X_e, rule); - - // mfem wants to store this data in a different layout, so we have to transpose it - for (int q = 0; q < qpts_per_elem; q++) { - auto [value, gradient] = quadrature_values[q]; - for (int i = 0; i < spatial_dim; i++) { - X_q[e](i, q) = value[i]; - if constexpr (std::is_same_v) { - J_q[e](0, i, q) = gradient[i]; - } - if constexpr (!std::is_same_v) { - for (int j = 0; j < geometry_dim; j++) { - J_q[e](j, i, q) = gradient(i, j); - } - } - } - } - } -} - -GeometricFactors::GeometricFactors(const Domain& d, int q, mfem::Geometry::Type g) -{ - auto* nodes = d.mesh_.GetNodes(); - auto* fes = nodes->FESpace(); - - auto restriction = serac::ElementRestriction(fes, g); - mfem::Vector X_e(int(restriction.ESize())); - restriction.Gather(*nodes, X_e); - - // assumes all elements are the same order - int p = fes->GetElementOrder(0); - - int spatial_dim = d.mesh_.SpaceDimension(); - int geometry_dim = dimension_of(g); - int qpts_per_elem = num_quadrature_points(g, q); - - if (g == mfem::Geometry::TRIANGLE) elements = d.tri_ids_; - if (g == mfem::Geometry::SQUARE) elements = d.quad_ids_; - if (g == mfem::Geometry::TETRAHEDRON) elements = d.tet_ids_; - if (g == mfem::Geometry::CUBE) elements = d.hex_ids_; - - num_elements = elements.size(); - - X = mfem::Vector(int(num_elements) * qpts_per_elem * spatial_dim); - J = mfem::Vector(int(num_elements) * qpts_per_elem * spatial_dim * geometry_dim); - -#define DISPATCH_KERNEL(GEOM, P, Q) \ - if (g == mfem::Geometry::GEOM && p == P && q == Q) { \ - compute_geometric_factors >(X, J, X_e, \ - elements); \ - return; \ - } - - DISPATCH_KERNEL(TRIANGLE, 1, 1); - DISPATCH_KERNEL(TRIANGLE, 1, 2); - DISPATCH_KERNEL(TRIANGLE, 1, 3); - DISPATCH_KERNEL(TRIANGLE, 1, 4); - - DISPATCH_KERNEL(SQUARE, 1, 
1); - DISPATCH_KERNEL(SQUARE, 1, 2); - DISPATCH_KERNEL(SQUARE, 1, 3); - DISPATCH_KERNEL(SQUARE, 1, 4); - - DISPATCH_KERNEL(SQUARE, 2, 1); - DISPATCH_KERNEL(SQUARE, 2, 2); - DISPATCH_KERNEL(SQUARE, 2, 3); - DISPATCH_KERNEL(SQUARE, 2, 4); - - DISPATCH_KERNEL(SQUARE, 3, 1); - DISPATCH_KERNEL(SQUARE, 3, 2); - DISPATCH_KERNEL(SQUARE, 3, 3); - DISPATCH_KERNEL(SQUARE, 3, 4); - - DISPATCH_KERNEL(TETRAHEDRON, 1, 1); - DISPATCH_KERNEL(TETRAHEDRON, 1, 2); - DISPATCH_KERNEL(TETRAHEDRON, 1, 3); - DISPATCH_KERNEL(TETRAHEDRON, 1, 4); - - DISPATCH_KERNEL(CUBE, 1, 1); - DISPATCH_KERNEL(CUBE, 1, 2); - DISPATCH_KERNEL(CUBE, 1, 3); - DISPATCH_KERNEL(CUBE, 1, 4); - - DISPATCH_KERNEL(CUBE, 2, 1); - DISPATCH_KERNEL(CUBE, 2, 2); - DISPATCH_KERNEL(CUBE, 2, 3); - DISPATCH_KERNEL(CUBE, 2, 4); - - DISPATCH_KERNEL(CUBE, 3, 1); - DISPATCH_KERNEL(CUBE, 3, 2); - DISPATCH_KERNEL(CUBE, 3, 3); - DISPATCH_KERNEL(CUBE, 3, 4); - -#undef DISPATCH_KERNEL - - std::cout << "should never be reached " << std::endl; -} - -GeometricFactors::GeometricFactors(const Domain& d, int q, mfem::Geometry::Type g, FaceType type) -{ - auto* nodes = d.mesh_.GetNodes(); - auto* fes = nodes->FESpace(); - - auto restriction = serac::ElementRestriction(fes, g, type); - mfem::Vector X_e(int(restriction.ESize())); - restriction.Gather(*nodes, X_e); - - // assumes all elements are the same order - int p = fes->GetElementOrder(0); - - int spatial_dim = d.mesh_.SpaceDimension(); - int geometry_dim = dimension_of(g); - int qpts_per_elem = num_quadrature_points(g, q); - - // NB: we only want the number of elements with the specified - // geometry, which is not the same as mesh->GetNE() in general - elements = d.get(g); - - num_elements = std::size_t(elements.size()); - - X = mfem::Vector(int(num_elements) * qpts_per_elem * spatial_dim); - J = mfem::Vector(int(num_elements) * qpts_per_elem * spatial_dim * geometry_dim); - -#define DISPATCH_KERNEL(GEOM, P, Q) \ - if (g == mfem::Geometry::GEOM && p == P && q == Q) { \ - compute_geometric_factors >(X, J, X_e, \ - elements); \ - return; \ - } - - DISPATCH_KERNEL(SEGMENT, 1, 1); - DISPATCH_KERNEL(SEGMENT, 1, 2); - DISPATCH_KERNEL(SEGMENT, 1, 3); - DISPATCH_KERNEL(SEGMENT, 1, 4); - - DISPATCH_KERNEL(SEGMENT, 2, 1); - DISPATCH_KERNEL(SEGMENT, 2, 2); - DISPATCH_KERNEL(SEGMENT, 2, 3); - DISPATCH_KERNEL(SEGMENT, 2, 4); - - DISPATCH_KERNEL(SEGMENT, 3, 1); - DISPATCH_KERNEL(SEGMENT, 3, 2); - DISPATCH_KERNEL(SEGMENT, 3, 3); - DISPATCH_KERNEL(SEGMENT, 3, 4); - - DISPATCH_KERNEL(TRIANGLE, 1, 1); - DISPATCH_KERNEL(TRIANGLE, 1, 2); - DISPATCH_KERNEL(TRIANGLE, 1, 3); - DISPATCH_KERNEL(TRIANGLE, 1, 4); - - DISPATCH_KERNEL(TRIANGLE, 2, 1); - DISPATCH_KERNEL(TRIANGLE, 2, 2); - DISPATCH_KERNEL(TRIANGLE, 2, 3); - DISPATCH_KERNEL(TRIANGLE, 2, 4); - - DISPATCH_KERNEL(TRIANGLE, 3, 1); - DISPATCH_KERNEL(TRIANGLE, 3, 2); - DISPATCH_KERNEL(TRIANGLE, 3, 3); - DISPATCH_KERNEL(TRIANGLE, 3, 4); - - DISPATCH_KERNEL(SQUARE, 1, 1); - DISPATCH_KERNEL(SQUARE, 1, 2); - DISPATCH_KERNEL(SQUARE, 1, 3); - DISPATCH_KERNEL(SQUARE, 1, 4); - - DISPATCH_KERNEL(SQUARE, 2, 1); - DISPATCH_KERNEL(SQUARE, 2, 2); - DISPATCH_KERNEL(SQUARE, 2, 3); - DISPATCH_KERNEL(SQUARE, 2, 4); - - DISPATCH_KERNEL(SQUARE, 3, 1); - DISPATCH_KERNEL(SQUARE, 3, 2); - DISPATCH_KERNEL(SQUARE, 3, 3); - DISPATCH_KERNEL(SQUARE, 3, 4); - -#undef DISPATCH_KERNEL - - std::cout << "should never be reached" << std::endl; -} - -} // namespace serac diff --git a/src/serac/numerics/functional/geometric_factors.hpp b/src/serac/numerics/functional/geometric_factors.hpp index 5d33ab256..18c0704e7 
100644 --- a/src/serac/numerics/functional/geometric_factors.hpp +++ b/src/serac/numerics/functional/geometric_factors.hpp @@ -1,5 +1,6 @@ #pragma once +#include "serac/infrastructure/accelerator.hpp" #include "serac/numerics/functional/element_restriction.hpp" // for FaceType #include "serac/numerics/functional/finite_element.hpp" // for Geometry #include "serac/numerics/functional/domain.hpp" @@ -7,12 +8,73 @@ #include "mfem.hpp" namespace serac { +/** + * @brief a kernel to compute the positions and jacobians at each quadrature point (mfem calls this "geometric factors") + * @tparam Q a parameter controlling the number of quadrature points an element + * @tparam function_space the polynomial order and kind of function space used to interpolate + * @tparam geom the element geometry + * @param positions_q (output) the positions for each quadrature point + * @param jacobians_q (output) the jacobians for each quadrature point + * @param positions_e (input) the "e-vector" of position data + * @param elements (input) the list of element indices that are part of this domain + */ +template +void compute_geometric_factors(mfem::Vector& positions_q, mfem::Vector& jacobians_q, const mfem::Vector& positions_e, + const std::vector& elements) +{ + constexpr TensorProductQuadratureRule rule{}; + + constexpr int spatial_dim = function_space::components; + constexpr int geometry_dim = dimension_of(geom); + constexpr int qpts_per_elem = num_quadrature_points(geom, Q); + + using element_type = finite_element; + using position_type = tensor; + using jacobian_type = tensor; + + auto X_q = reinterpret_cast(positions_q.HostReadWrite()); + auto J_q = reinterpret_cast(jacobians_q.HostReadWrite()); + auto X = reinterpret_cast(positions_e.HostRead()); + + std::size_t num_elements = elements.size(); + + // for each element in the domain + using qf_inputs_type = decltype(element_type{}.template interpolate_output_helper()); + auto& rm = umpire::ResourceManager::getInstance(); + auto host_allocator = rm.getAllocator("HOST"); + qf_inputs_type* quadrature_values = static_cast(host_allocator.allocate(sizeof(qf_inputs_type))); + + for (uint32_t e = 0; e < num_elements; ++e) { + // load the positions for the nodes in this element + auto X_e = X[elements[e]]; + // calculate the values and derivatives (w.r.t. xi) of X at each quadrature point + element_type::interpolate(X_e, rule, quadrature_values, RAJA::LaunchContext{}); + + // mfem wants to store this data in a different layout, so we have to transpose it + for (int q = 0; q < qpts_per_elem; q++) { + auto [value, gradient] = (*quadrature_values)[q]; + for (int i = 0; i < spatial_dim; i++) { + X_q[e](i, q) = value[i]; + if constexpr (std::is_same_v) { + J_q[e](0, i, q) = gradient[i]; + } + if constexpr (!std::is_same_v) { + for (int j = 0; j < geometry_dim; j++) { + J_q[e](j, i, q) = gradient(i, j); + } + } + } + } + } + host_allocator.deallocate(quadrature_values); +} /** * @brief a class that computes and stores positions and jacobians at each quadrature point * @note analogous to mfem::GeometricFactors, except that it implements the position/jacobian * calculations on boundary elements and on simplex elements */ +template struct GeometricFactors { /// @brief default ctor, leaving this object uninitialized GeometricFactors(){}; @@ -21,22 +83,178 @@ struct GeometricFactors { * @brief calculate positions and jacobians for quadrature points belonging to * elements with the specified geometry, belonging to the provided mesh. 
* - * @param domain the domain of integration + * @param d the domain of integration * @param q a parameter controlling the number of quadrature points per element - * @param elem_geom which kind of element geometry to select + * @param g which kind of element geometry to select */ - GeometricFactors(const Domain& domain, int q, mfem::Geometry::Type elem_geom); + GeometricFactors(const Domain& d, int q, mfem::Geometry::Type g) + { + auto* nodes = d.mesh_.GetNodes(); + auto* fes = nodes->FESpace(); + + auto restriction = serac::ElementRestriction(fes, g); + mfem::Vector X_e(int(restriction.ESize())); + restriction.template Gather(*nodes, X_e); + + // assumes all elements are the same order + int p = fes->GetElementOrder(0); + + int spatial_dim = d.mesh_.SpaceDimension(); + int geometry_dim = dimension_of(g); + int qpts_per_elem = num_quadrature_points(g, q); + + if (g == mfem::Geometry::TRIANGLE) elements = d.tri_ids_; + if (g == mfem::Geometry::SQUARE) elements = d.quad_ids_; + if (g == mfem::Geometry::TETRAHEDRON) elements = d.tet_ids_; + if (g == mfem::Geometry::CUBE) elements = d.hex_ids_; + + num_elements = elements.size(); + + X = mfem::Vector(int(num_elements) * qpts_per_elem * spatial_dim); + J = mfem::Vector(int(num_elements) * qpts_per_elem * spatial_dim * geometry_dim); + +#define DISPATCH_KERNEL(GEOM, P, Q) \ + if (g == mfem::Geometry::GEOM && p == P && q == Q) { \ + compute_geometric_factors>(X, J, X_e, elements); \ + return; \ + } + + DISPATCH_KERNEL(TRIANGLE, 1, 1); + DISPATCH_KERNEL(TRIANGLE, 1, 2); + DISPATCH_KERNEL(TRIANGLE, 1, 3); + DISPATCH_KERNEL(TRIANGLE, 1, 4); + + DISPATCH_KERNEL(SQUARE, 1, 1); + DISPATCH_KERNEL(SQUARE, 1, 2); + DISPATCH_KERNEL(SQUARE, 1, 3); + DISPATCH_KERNEL(SQUARE, 1, 4); + + DISPATCH_KERNEL(SQUARE, 2, 1); + DISPATCH_KERNEL(SQUARE, 2, 2); + DISPATCH_KERNEL(SQUARE, 2, 3); + DISPATCH_KERNEL(SQUARE, 2, 4); + + DISPATCH_KERNEL(SQUARE, 3, 1); + DISPATCH_KERNEL(SQUARE, 3, 2); + DISPATCH_KERNEL(SQUARE, 3, 3); + DISPATCH_KERNEL(SQUARE, 3, 4); + + DISPATCH_KERNEL(TETRAHEDRON, 1, 1); + DISPATCH_KERNEL(TETRAHEDRON, 1, 2); + DISPATCH_KERNEL(TETRAHEDRON, 1, 3); + DISPATCH_KERNEL(TETRAHEDRON, 1, 4); + + DISPATCH_KERNEL(CUBE, 1, 1); + DISPATCH_KERNEL(CUBE, 1, 2); + DISPATCH_KERNEL(CUBE, 1, 3); + DISPATCH_KERNEL(CUBE, 1, 4); + + DISPATCH_KERNEL(CUBE, 2, 1); + DISPATCH_KERNEL(CUBE, 2, 2); + DISPATCH_KERNEL(CUBE, 2, 3); + DISPATCH_KERNEL(CUBE, 2, 4); + + DISPATCH_KERNEL(CUBE, 3, 1); + DISPATCH_KERNEL(CUBE, 3, 2); + DISPATCH_KERNEL(CUBE, 3, 3); + DISPATCH_KERNEL(CUBE, 3, 4); + +#undef DISPATCH_KERNEL + + std::cout << "should never be reached " << std::endl; + } /** * @brief calculate positions and jacobians for quadrature points belonging to * boundary elements with the specified geometry, belonging to the provided mesh. 
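The domain constructor above can be exercised directly; a minimal sketch, assuming CPU execution and a nodal hex mesh (all names below are illustrative, not part of the patch):

    #include <iostream>
    #include "mfem.hpp"
    #include "serac/numerics/functional/geometric_factors.hpp"

    void geometric_factors_example(mfem::ParMesh& pmesh)
    {
      pmesh.EnsureNodes();  // the constructor reads the mesh's nodal grid function

      serac::Domain whole = serac::EntireDomain(pmesh);

      // positions X and jacobians J for a q = 2 rule on every hex in the domain
      serac::GeometricFactors<serac::ExecutionSpace::CPU> gf(whole, 2, mfem::Geometry::CUBE);

      // X holds num_elements * qpts_per_elem * spatial_dim entries,
      // J holds num_elements * qpts_per_elem * spatial_dim * geometry_dim entries
      std::cout << "hex elements in domain: " << gf.num_elements << std::endl;
    }
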
* - * @param domain the domain of integration + * @param d the domain of integration * @param q a parameter controlling the number of quadrature points per element - * @param elem_geom which kind of element geometry to select + * @param g which kind of element geometry to select * @param type whether or not the faces are on the boundary (supported) or interior (unsupported) */ - GeometricFactors(const Domain& domain, int q, mfem::Geometry::Type elem_geom, FaceType type); + GeometricFactors(const Domain& d, int q, mfem::Geometry::Type g, FaceType type) + { + auto* nodes = d.mesh_.GetNodes(); + auto* fes = nodes->FESpace(); + + auto restriction = serac::ElementRestriction(fes, g, type); + mfem::Vector X_e(int(restriction.ESize())); + restriction.template Gather(*nodes, X_e); + + // assumes all elements are the same order + int p = fes->GetElementOrder(0); + + int spatial_dim = d.mesh_.SpaceDimension(); + int geometry_dim = dimension_of(g); + int qpts_per_elem = num_quadrature_points(g, q); + + // NB: we only want the number of elements with the specified + // geometry, which is not the same as mesh->GetNE() in general + elements = d.get(g); + + num_elements = std::size_t(elements.size()); + + X = mfem::Vector(int(num_elements) * qpts_per_elem * spatial_dim); + J = mfem::Vector(int(num_elements) * qpts_per_elem * spatial_dim * geometry_dim); + +#define DISPATCH_KERNEL(GEOM, P, Q) \ + if (g == mfem::Geometry::GEOM && p == P && q == Q) { \ + compute_geometric_factors>(X, J, X_e, \ + elements); \ + return; \ + } + + DISPATCH_KERNEL(SEGMENT, 1, 1); + DISPATCH_KERNEL(SEGMENT, 1, 2); + DISPATCH_KERNEL(SEGMENT, 1, 3); + DISPATCH_KERNEL(SEGMENT, 1, 4); + + DISPATCH_KERNEL(SEGMENT, 2, 1); + DISPATCH_KERNEL(SEGMENT, 2, 2); + DISPATCH_KERNEL(SEGMENT, 2, 3); + DISPATCH_KERNEL(SEGMENT, 2, 4); + + DISPATCH_KERNEL(SEGMENT, 3, 1); + DISPATCH_KERNEL(SEGMENT, 3, 2); + DISPATCH_KERNEL(SEGMENT, 3, 3); + DISPATCH_KERNEL(SEGMENT, 3, 4); + + DISPATCH_KERNEL(TRIANGLE, 1, 1); + DISPATCH_KERNEL(TRIANGLE, 1, 2); + DISPATCH_KERNEL(TRIANGLE, 1, 3); + DISPATCH_KERNEL(TRIANGLE, 1, 4); + + DISPATCH_KERNEL(TRIANGLE, 2, 1); + DISPATCH_KERNEL(TRIANGLE, 2, 2); + DISPATCH_KERNEL(TRIANGLE, 2, 3); + DISPATCH_KERNEL(TRIANGLE, 2, 4); + + DISPATCH_KERNEL(TRIANGLE, 3, 1); + DISPATCH_KERNEL(TRIANGLE, 3, 2); + DISPATCH_KERNEL(TRIANGLE, 3, 3); + DISPATCH_KERNEL(TRIANGLE, 3, 4); + + DISPATCH_KERNEL(SQUARE, 1, 1); + DISPATCH_KERNEL(SQUARE, 1, 2); + DISPATCH_KERNEL(SQUARE, 1, 3); + DISPATCH_KERNEL(SQUARE, 1, 4); + + DISPATCH_KERNEL(SQUARE, 2, 1); + DISPATCH_KERNEL(SQUARE, 2, 2); + DISPATCH_KERNEL(SQUARE, 2, 3); + DISPATCH_KERNEL(SQUARE, 2, 4); + + DISPATCH_KERNEL(SQUARE, 3, 1); + DISPATCH_KERNEL(SQUARE, 3, 2); + DISPATCH_KERNEL(SQUARE, 3, 3); + DISPATCH_KERNEL(SQUARE, 3, 4); + +#undef DISPATCH_KERNEL + + std::cout << "should never be reached" << std::endl; + } // descriptions copied from mfem diff --git a/src/serac/numerics/functional/integral.hpp b/src/serac/numerics/functional/integral.hpp index 337118a7d..88d6f95ba 100644 --- a/src/serac/numerics/functional/integral.hpp +++ b/src/serac/numerics/functional/integral.hpp @@ -11,6 +11,7 @@ #include "mfem.hpp" +#include "serac/serac_config.hpp" #include "serac/infrastructure/accelerator.hpp" #include "serac/numerics/functional/geometric_factors.hpp" #include "serac/numerics/functional/function_signature.hpp" @@ -21,6 +22,7 @@ namespace serac { /// @brief a class for representing a Integral calculations and their derivatives +template struct Integral { /// @brief the number of different kinds 
of integration domains static constexpr std::size_t num_types = 2; @@ -76,6 +78,7 @@ struct Integral { for (std::size_t i = 0; i < active_trial_spaces_.size(); i++) { inputs[i] = input_E[uint32_t(active_trial_spaces_[i])].GetBlock(geometry).Read(); } + output_E.GetBlock(geometry) = 0.0; func(t, inputs, output_E.GetBlock(geometry).ReadWrite(), update_state); } } @@ -97,6 +100,7 @@ struct Integral { // if this integral actually depends on the specified variable if (functional_to_integral_index_.count(differentiation_index) > 0) { for (auto& [geometry, func] : jvp_[functional_to_integral_index_.at(differentiation_index)]) { + output_E.GetBlock(geometry) = 0.0; func(input_E.GetBlock(geometry).Read(), output_E.GetBlock(geometry).ReadWrite()); } } @@ -109,8 +113,8 @@ struct Integral { * test_dofs_per_elem) * @param differentiation_index the index of the trial space being differentiated */ - void ComputeElementGradients(std::map >& K_e, - uint32_t differentiation_index) const + void ComputeElementGradients(std::map>& K_e, + uint32_t differentiation_index) const { // if this integral actually depends on the specified variable if (functional_to_integral_index_.count(differentiation_index) > 0) { @@ -130,19 +134,19 @@ struct Integral { std::map evaluation_; /// @brief kernels for integral evaluation + derivative w.r.t. specified argument over each type of element - std::vector > evaluation_with_AD_; + std::vector> evaluation_with_AD_; /// @brief signature of element jvp kernel using jacobian_vector_product_func = std::function; /// @brief kernels for jacobian-vector product of integral calculation - std::vector > jvp_; + std::vector> jvp_; /// @brief signature of element gradient kernel - using grad_func = std::function)>; + using grad_func = std::function)>; /// @brief kernels for calculation of element jacobians - std::vector > element_gradient_; + std::vector> element_gradient_; /// @brief a list of the trial spaces that take part in this integrand std::vector active_trial_spaces_; @@ -164,7 +168,7 @@ struct Integral { std::map functional_to_integral_index_; /// @brief the spatial positions and jacobians (dx_dxi) for each element type and quadrature point - std::map geometric_factors_; + std::map> geometric_factors_; }; /** @@ -181,23 +185,32 @@ struct Integral { * @param qf the quadrature function * @param qdata the values of any quadrature point data for the material */ -template -void generate_kernels(FunctionSignature s, Integral& integral, const lambda_type& qf, - std::shared_ptr > qdata) +template +void generate_kernels(FunctionSignature s, Integral& integral, const lambda_type& qf, + std::shared_ptr> qdata) { - integral.geometric_factors_[geom] = GeometricFactors(integral.domain_, Q, geom); - GeometricFactors& gf = integral.geometric_factors_[geom]; + integral.geometric_factors_[geom] = GeometricFactors(integral.domain_, Q, geom); + GeometricFactors& gf = integral.geometric_factors_[geom]; if (gf.num_elements == 0) return; - const double* positions = gf.X.Read(); - const double* jacobians = gf.J.Read(); - const int* elements = &integral.domain_.get(geom)[0]; + gf.X.UseDevice(true); + gf.J.UseDevice(true); + const double* positions = gf.X.Read(true); + const double* jacobians = gf.J.Read(true); const uint32_t num_elements = uint32_t(gf.num_elements); const uint32_t qpts_per_element = num_quadrature_points(geom, Q); std::shared_ptr dummy_derivatives; - integral.evaluation_[geom] = domain_integral::evaluation_kernel( + // Copy elements to device if using CUDA + const int* elements = nullptr; 
+ if constexpr (exec == ExecutionSpace::GPU) { + elements = copy_data(const_cast(integral.domain_.get(geom).data()), + sizeof(int) * integral.domain_.get(geom).size(), "DEVICE"); + } else { + elements = integral.domain_.get(geom).data(); + } + integral.evaluation_[geom] = domain_integral::evaluation_kernel( s, qf, positions, jacobians, qdata, dummy_derivatives, elements, num_elements); constexpr std::size_t num_args = s.num_args; @@ -209,15 +222,16 @@ void generate_kernels(FunctionSignature s, Integral& integral, // action_of_gradient functor below to augment the reference count, and extend its lifetime to match // that of the DomainIntegral that allocated it. using derivative_type = decltype(domain_integral::get_derivative_type(qf, qpt_data_type{})); - auto ptr = accelerator::make_shared_array(num_elements * qpts_per_element); - integral.evaluation_with_AD_[index][geom] = domain_integral::evaluation_kernel( + auto ptr = accelerator::make_shared_array(num_elements * qpts_per_element); + + integral.evaluation_with_AD_[index][geom] = domain_integral::evaluation_kernel( s, qf, positions, jacobians, qdata, ptr, elements, num_elements); integral.jvp_[index][geom] = - domain_integral::jacobian_vector_product_kernel(s, ptr, elements, num_elements); + domain_integral::jacobian_vector_product_kernel(s, ptr, elements, num_elements); integral.element_gradient_[index][geom] = - domain_integral::element_gradient_kernel(s, ptr, elements, num_elements); + domain_integral::element_gradient_kernel(s, ptr, elements, num_elements); }); } @@ -235,25 +249,25 @@ void generate_kernels(FunctionSignature s, Integral& integral, * @param argument_indices the indices of trial space arguments used in the Integral * @return Integral the initialized `Integral` object */ -template -Integral MakeDomainIntegral(const Domain& domain, const lambda_type& qf, - std::shared_ptr > qdata, - std::vector argument_indices) +template +Integral MakeDomainIntegral(const Domain& domain, const lambda_type& qf, + std::shared_ptr> qdata, + std::vector argument_indices) { FunctionSignature signature; SLIC_ERROR_IF(domain.type_ != Domain::Type::Elements, "Error: trying to evaluate a domain integral over a boundary"); - Integral integral(domain, argument_indices); + Integral integral(domain, argument_indices); if constexpr (dim == 2) { - generate_kernels(signature, integral, qf, qdata); - generate_kernels(signature, integral, qf, qdata); + generate_kernels(signature, integral, qf, qdata); + generate_kernels(signature, integral, qf, qdata); } if constexpr (dim == 3) { - generate_kernels(signature, integral, qf, qdata); - generate_kernels(signature, integral, qf, qdata); + generate_kernels(signature, integral, qf, qdata); + generate_kernels(signature, integral, qf, qdata); } return integral; @@ -272,21 +286,28 @@ Integral MakeDomainIntegral(const Domain& domain, const lambda_type& qf, * @param integral the Integral object to initialize * @param qf the quadrature function */ -template -void generate_bdr_kernels(FunctionSignature s, Integral& integral, const lambda_type& qf) +template +void generate_bdr_kernels(FunctionSignature s, Integral& integral, const lambda_type& qf) { - integral.geometric_factors_[geom] = GeometricFactors(integral.domain_, Q, geom, FaceType::BOUNDARY); - GeometricFactors& gf = integral.geometric_factors_[geom]; + integral.geometric_factors_[geom] = GeometricFactors(integral.domain_, Q, geom, FaceType::BOUNDARY); + GeometricFactors& gf = integral.geometric_factors_[geom]; if (gf.num_elements == 0) return; - const double* 
positions = gf.X.Read(); - const double* jacobians = gf.J.Read(); + const double* positions = gf.X.Read(true); + const double* jacobians = gf.J.Read(true); const uint32_t num_elements = uint32_t(gf.num_elements); const uint32_t qpts_per_element = num_quadrature_points(geom, Q); - const int* elements = &gf.elements[0]; + + int* elements = nullptr; + if constexpr (exec == ExecutionSpace::GPU) { + elements = copy_data(const_cast(&gf.elements[0]), sizeof(int) * integral.domain_.get(geom).size(), "DEVICE"); + } else { + elements = &gf.elements[0]; + } std::shared_ptr dummy_derivatives; - integral.evaluation_[geom] = boundary_integral::evaluation_kernel( + integral.evaluation_[geom] = boundary_integral::evaluation_kernel( s, qf, positions, jacobians, dummy_derivatives, elements, num_elements); constexpr std::size_t num_args = s.num_args; @@ -298,15 +319,15 @@ void generate_bdr_kernels(FunctionSignature s, Integral& integr // action_of_gradient functor below to augment the reference count, and extend its lifetime to match // that of the boundaryIntegral that allocated it. using derivative_type = decltype(boundary_integral::get_derivative_type(qf)); - auto ptr = accelerator::make_shared_array(num_elements * qpts_per_element); + auto ptr = accelerator::make_shared_array(num_elements * qpts_per_element); - integral.evaluation_with_AD_[index][geom] = - boundary_integral::evaluation_kernel(s, qf, positions, jacobians, ptr, elements, num_elements); + integral.evaluation_with_AD_[index][geom] = boundary_integral::evaluation_kernel( + s, qf, positions, jacobians, ptr, elements, num_elements); integral.jvp_[index][geom] = - boundary_integral::jacobian_vector_product_kernel(s, ptr, elements, num_elements); + boundary_integral::jacobian_vector_product_kernel(s, ptr, elements, num_elements); integral.element_gradient_[index][geom] = - boundary_integral::element_gradient_kernel(s, ptr, elements, num_elements); + boundary_integral::element_gradient_kernel(s, ptr, elements, num_elements); }); } @@ -324,23 +345,23 @@ void generate_bdr_kernels(FunctionSignature s, Integral& integr * * @note this function is not meant to be called by users */ -template -Integral MakeBoundaryIntegral(const Domain& domain, const lambda_type& qf, std::vector argument_indices) +template +Integral MakeBoundaryIntegral(const Domain& domain, const lambda_type& qf, std::vector argument_indices) { FunctionSignature signature; SLIC_ERROR_IF(domain.type_ != Domain::Type::BoundaryElements, "Error: trying to evaluate a boundary integral over a non-boundary domain of integration"); - Integral integral(domain, argument_indices); + Integral integral(domain, argument_indices); if constexpr (dim == 1) { - generate_bdr_kernels(signature, integral, qf); + generate_bdr_kernels(signature, integral, qf); } if constexpr (dim == 2) { - generate_bdr_kernels(signature, integral, qf); - generate_bdr_kernels(signature, integral, qf); + generate_bdr_kernels(signature, integral, qf); + generate_bdr_kernels(signature, integral, qf); } return integral; diff --git a/src/serac/numerics/functional/quadrature_data.hpp b/src/serac/numerics/functional/quadrature_data.hpp index b3ace3c4e..570874b86 100644 --- a/src/serac/numerics/functional/quadrature_data.hpp +++ b/src/serac/numerics/functional/quadrature_data.hpp @@ -40,7 +40,17 @@ struct Nothing {}; /** * @brief see `Nothing` for a complete description of this class and when to use it */ -struct Empty {}; +struct Empty { + /** + * @brief operator= overload + * @tparam T anything can be set to Empty + */ + template + 
SERAC_HOST_DEVICE auto operator=(T) const + { + return Empty{}; + } +}; template struct QuadratureData; diff --git a/src/serac/numerics/functional/tensor.hpp b/src/serac/numerics/functional/tensor.hpp index e215e486e..1b769bc46 100644 --- a/src/serac/numerics/functional/tensor.hpp +++ b/src/serac/numerics/functional/tensor.hpp @@ -12,6 +12,24 @@ #pragma once +/** + * @brief Macro defining block size for CUDA/HIP targets + */ +#define BLOCK_SZ 128 +/** + * @brief Macro defining block dimension for CUDA/HIP targets + */ +#define BLOCK_X 8 +/** + * @brief Macro defining block dimension for CUDA/HIP targets + */ +#define BLOCK_Y 4 +/** + * @brief Macro defining block dimension for CUDA/HIP targets + */ +#define BLOCK_Z 4 + +#include "serac/serac_config.hpp" #include "serac/infrastructure/accelerator.hpp" #include "detail/metaprogramming.hpp" @@ -708,6 +726,7 @@ template SERAC_HOST_DEVICE constexpr auto dot(const tensor& A, const tensor& B) { tensor AB{}; + for (int i = 0; i < m; i++) { for (int j = 0; j < p; j++) { for (int k = 0; k < n; k++) { @@ -903,7 +922,7 @@ SERAC_HOST_DEVICE constexpr auto dot(const tensor& A, const tenso /// compute the cross product of the columns of A: A(:,1) x A(:,2) template -auto cross(const tensor& A) +auto SERAC_HOST_DEVICE cross(const tensor& A) { return tensor{A(1, 0) * A(2, 1) - A(2, 0) * A(1, 1), A(2, 0) * A(0, 1) - A(0, 0) * A(2, 1), A(0, 0) * A(1, 1) - A(1, 0) * A(0, 1)}; @@ -911,21 +930,21 @@ auto cross(const tensor& A) /// return the in-plane components of the cross product of {v[0], v[1], 0} x {0, 0, 1} template -auto cross(const tensor& v) +auto SERAC_HOST_DEVICE cross(const tensor& v) { return tensor{v(1, 0), -v(0, 0)}; } /// return the in-plane components of the cross product of {v[0], v[1], 0} x {0, 0, 1} template -auto cross(const tensor& v) +auto SERAC_HOST_DEVICE cross(const tensor& v) { return tensor{v[1], -v[0]}; } /// compute the (right handed) cross product of two 3-vectors template -auto cross(const tensor& u, const tensor& v) +auto SERAC_HOST_DEVICE cross(const tensor& u, const tensor& v) { return tensor{u(1) * v(2) - u(2) * v(1), u(2) * v(0) - u(0) * v(2), u(0) * v(1) - u(1) * v(0)}; @@ -1282,7 +1301,7 @@ SERAC_HOST_DEVICE constexpr auto detApIm1(const tensor& A) // clang-format off // equivalent to tr(A) + I2(A) + det(A) - return A(0, 0) + A(1, 1) + A(2, 2) + return A(0, 0) + A(1, 1) + A(2, 2) - A(0, 1) * A(1, 0) * (1 + A(2, 2)) + A(0, 0) * A(1, 1) * (1 + A(2, 2)) - A(0, 2) * A(2, 0) * (1 + A(1, 1)) @@ -1358,12 +1377,8 @@ SERAC_HOST_DEVICE auto contract(const tensor& A, const tensor{}; - if constexpr (d3 != 0) return tensor{}; - }(); - if constexpr (d3 == 0) { + auto C = tensor{}; for (int i = 0; i < d1; i++) { for (int j = 0; j < d2; j++) { U sum{}; @@ -1376,7 +1391,11 @@ SERAC_HOST_DEVICE auto contract(const tensor& A, const tensor{}; for (int i = 0; i < d1; i++) { for (int j = 0; j < d2; j++) { for (int k = 0; k < d3; k++) { @@ -1393,9 +1412,211 @@ SERAC_HOST_DEVICE auto contract(const tensor& A, const tensor +SERAC_HOST_DEVICE constexpr auto deduce_contract_return_type(const tensor&, const tensor&) +{ + constexpr int Adims[] = {m, n...}; + constexpr int Bdims[] = {p, q}; + static_assert(sizeof...(n) < 3); + static_assert(Adims[i1] == Bdims[i2], "error: incompatible tensor dimensions"); + + // first, we have to figure out the dimensions of the output tensor + constexpr int new_dim = (i2 == 0) ? q : p; + constexpr int d1 = (i1 == 0) ? new_dim : Adims[0]; + constexpr int d2 = (i1 == 1) ? 
new_dim : Adims[1]; + constexpr int d3 = sizeof...(n) == 1 ? 0 : ((i1 == 2) ? new_dim : Adims[2]); + using U = decltype(S{} * T{}); + if constexpr (d3 == 0) { + return serac::tensor{}; + } + return tensor{}; +} + +/// @overload +template +SERAC_HOST_DEVICE constexpr auto deduce_contract_return_type(const zero&, const tensor&) +{ + return zero{}; +} + +// TODO(bowen): Figure out using decltype vs helper structs for shared memory buffers +/* template + class contract_type { + +}; + + template + class contract_type, tensor> { + SERAC_HOST_DEVICE contract_type(const tensor&, const tensor&) {} + + static constexpr int Adims[] = {m, n...}; + static constexpr int Bdims[] = {p, q}; + static_assert(sizeof...(n) < 3); + static_assert(Adims[i1] == Bdims[i2], "error: incompatible tensor dimensions"); + + // first, we have to figure out the dimensions of the output tensor + static constexpr int new_dim = (i2 == 0) ? q : p; + static constexpr int d1 = (i1 == 0) ? new_dim : Adims[0]; + static constexpr int d2 = (i1 == 1) ? new_dim : Adims[1]; + static constexpr int d3 = sizeof...(n) == 1 ? 0 : ((i1 == 2) ? new_dim : Adims[2]); + using U = decltype(S{} * T{}); + // if constexpr (d3 == 0) { + // return serac::tensor{}; + // } + public: + using type = typename std::conditional, tensor>::type; + }; + + template + contract_type(tensor, tensor) -> contract_type<1, 1, tensor, tensor>; + + template + struct contract_type { + SERAC_HOST_DEVICE contract_type(const zero&, const S&) {} + using type = zero; + }; + + template + struct contract_type { + SERAC_HOST_DEVICE contract_type(const S&, const zero&) {} + using type = zero; + }; + */ + +/** + * @brief When deducing the result of a sum of contractions, e.g. the type + * using type = decltype(contract(A, B) + contract(C, D)) + * it is necessary to determine if both contractions result in a zero tensor. + * This simple struct has + * type = zero if S and T are identically zero + * type = S if T is zero + * type = T if S is zero + * type = S if neither is zero (S == T always in this case) + */ +template +struct is_zero_and { +private: + static constexpr bool is_S_zero = std::is_same_v; + static constexpr bool is_T_zero = std::is_same_v; + using helper_t = typename std::conditional::type; + +public: + /// @brief type corresponding to the rules described in brief above. + using type = typename std::conditional::type; +}; + +/** + * @brief a convenience function that computes a dot product between + * two tensors, and allows the user to specify which indices to sum + * over. This overload is void, and accepts a pointer to the resultant + * tensor. Useful for GPU execution, where buffers must be filled + * manually. 
+ * + * @tparam i1 the index of contraction for the left operand + * @tparam i2 the index of contraction for the right operand + * @tparam S the datatype stored in the left operand + * @tparam m leading dimension of the left operand + * @tparam n the trailing dimensions of the left operand + * @tparam T the datatype stored in the right operand + * @tparam p the number of rows in the right operand + * @tparam q the number of columns in the right operand + * @tparam n0 the first dimension of the result + * @tparam n1 the second dimension of the result + * @tparam n2 the third dimension of the result + * @param A the left operand + * @param B the right operand + * @param C the result + * @param qx thread index for GPU backends + * @param qy thread index for GPU backends + * @param qz thread index for GPU backends + * @param accumulate control whether contract adds the result to C, or sets C. + */ +template +SERAC_HOST_DEVICE void contract(const tensor& A, const tensor& B, tensor* C, int qx, + int qy, int qz, bool accumulate = false) +{ + constexpr int Adims[] = {m, n...}; + constexpr int Bdims[] = {p, q}; + static_assert(sizeof...(n) < 3); + static_assert(Adims[l1] == Bdims[l2], "error: incompatible tensor dimensions"); + static_assert(sizeof...(n) != 1); + using U = decltype(S{} * T{}); + for (int i = qz; i < n0; i += BLOCK_Z) { + for (int j = qy; j < n1; j += BLOCK_Y) { + for (int k = qx; k < n2; k += BLOCK_X) { + U sum = 0.0; + + for (int l = 0; l < Adims[l1]; l++) { + if constexpr (l1 == 0 && l2 == 0) { + sum += A(l, j, k) * B(l, i); + } + if constexpr (l1 == 1 && l2 == 0) { + sum += A(i, l, k) * B(l, j); + } + if constexpr (l1 == 2 && l2 == 0) { + sum += A(i, j, l) * B(l, k); + } + if constexpr (l1 == 0 && l2 == 1) { + sum += A(l, j, k) * B(i, l); + } + if constexpr (l1 == 1 && l2 == 1) { + sum += A(i, l, k) * B(j, l); + } + if constexpr (l1 == 2 && l2 == 1) { + sum += A(i, j, l) * B(k, l); + } + } + if (accumulate) { + (*C)(i, j, k) += sum; + } else { + (*C)(i, j, k) = sum; + } + } + } + } +} + +/// @overload +template +SERAC_HOST_DEVICE void contract(const zero&, const tensor&, zero*, int, int, int, + [[maybe_unused]] bool accumulate = false) +{ + return; +} + +/// @overload +template +SERAC_HOST_DEVICE void contract(const zero&, const zero&, zero*, int, int, int, + [[maybe_unused]] bool accumulate = false) +{ + return; +} + +/// @overload +template +SERAC_HOST_DEVICE void contract(const tensor&, const zero&, zero*, int, int, int, + [[maybe_unused]] bool accumulate = false) +{ + return; +} + +/// @overload +template +SERAC_HOST_DEVICE void contract(const zero&, const tensor&, tensor*, int, int, int, + [[maybe_unused]] bool accumulate = false) +{ + return; } /// @overload @@ -1625,6 +1846,7 @@ SERAC_HOST_DEVICE constexpr tensor inv(const tensor& return invA; } + /** * @overload * @note For N-by-N matrices with N > 3, requires Gaussian elimination @@ -1678,8 +1900,8 @@ inline SERAC_HOST_DEVICE void print(double value) { printf("%f", value); } * @brief print a tensor using `printf`, so that it is suitable for use inside cuda kernels. 
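The pointer-based contract overloads above partition the output entries across a BLOCK_X x BLOCK_Y x BLOCK_Z tile of threads, each thread striding over its own share and writing into a caller-provided buffer. On the host, the same coverage can be reproduced by enumerating every (qx, qy, qz) combination; a small sketch with made-up dimensions:

    #include "serac/numerics/functional/tensor.hpp"

    void contract_sketch()
    {
      serac::tensor<double, 4, 4, 8> A{};
      serac::tensor<double, 4, 8>    B{};
      serac::tensor<double, 4, 4, 4> C{};  // result of contracting index 2 of A with index 1 of B

      // on the GPU, (qx, qy, qz) are thread indices; here we simply enumerate them,
      // and each call fills a disjoint, strided subset of C's entries
      for (int qz = 0; qz < BLOCK_Z; qz++) {
        for (int qy = 0; qy < BLOCK_Y; qy++) {
          for (int qx = 0; qx < BLOCK_X; qx++) {
            serac::contract<2, 1>(A, B, &C, qx, qy, qz, /* accumulate = */ false);
          }
        }
      }
    }
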
* @param[in] A The tensor to write out */ -template -SERAC_HOST_DEVICE void print(const tensor& A) +template +SERAC_HOST_DEVICE void print(const tensor& A) { printf("{"); print(A[0]); @@ -1687,7 +1909,58 @@ SERAC_HOST_DEVICE void print(const tensor& A) printf(","); print(A[i]); } - printf("}"); + printf("}\n"); +} + +/** + * @brief print a tensor's shape using printf for use inside CUDA kernels + */ +template +SERAC_HOST_DEVICE void print_shape(const tensor&) +{ + printf("shape [ %d", m); + (printf(", %d", n), ...); + printf(" ]\n"); +} + +/** + * @brief helper function for setting a tensor to a value on GPU + * @param[in] A the tensor + * @param value the value to fill the tensor with + * @param qx thread index + * @param qy thread index + * @param qz thread index + */ +template +SERAC_HOST_DEVICE void memset_tensor(tensor& A, T value, int qx, int qy, int qz) +{ + for (int i = 0; i < m; ++i) { + memset_tensor(A[i], value, qx, qy, qz); + } +} + +/// @overload +template +SERAC_HOST_DEVICE void memset_tensor(serac::zero&, T, int, int, int) +{ +} + +/// @overload +template +SERAC_HOST_DEVICE void memset_tensor(tensor& A, T value, int qx, int qy, int) +{ + if (qx < m && qy < n) { + A(qx, qy) = value; + } +} + +/// @overload +template +SERAC_HOST_DEVICE void memset_tensor(tensor& A, T value, int qx, int qy, int qz) +{ + if (qx < m && qy < n && qz < l) { + A(qx, qy, qz) = value; + } } /** @@ -1881,6 +2154,28 @@ SERAC_HOST_DEVICE constexpr int size(const tensor&) return (n * ... * 1); } +/** + * @brief returns the total number of stored values in a tensor + * + * @tparam T the datatype stored in the tensor + * @tparam n the extents of each dimension + * @return the total number of values stored in the tensor + */ +template +SERAC_HOST_DEVICE constexpr int size(const tensor*) +{ + return (n * ... * 1); +} + +/** + * @brief returns the total number of stored values in a tensor + * + * @tparam T the datatype stored in the tensor + * @tparam n the extents of each dimension + * @return the total number of values stored in the tensor + */ +SERAC_HOST_DEVICE constexpr int size(const zero*) { return 1; } + /** * @overload * @brief overload of size() for `double`, we say a double "stores" 1 value @@ -1888,7 +2183,7 @@ SERAC_HOST_DEVICE constexpr int size(const tensor&) SERAC_HOST_DEVICE constexpr int size(const double&) { return 1; } /// @overload -SERAC_HOST_DEVICE constexpr int size(zero) { return 0; } +SERAC_HOST_DEVICE constexpr int size(zero) { return 1; } /** * @brief a function for querying the ith dimension of a tensor @@ -1931,6 +2226,37 @@ bool isnan(const tensor& A) /// @overload inline bool isnan(const zero&) { return false; } +/** + * @brief Helper function that uses umpire to move memory between HOST and DEVICE. + * destination must be either "HOST" or "DEVICE". + */ +template +DataType* copy_data(DataType* source_data, std::size_t size, const std::string& destination) +{ + auto& rm = umpire::ResourceManager::getInstance(); + auto dest_allocator = rm.getAllocator(destination); + + DataType* dest_data = static_cast(dest_allocator.allocate(size)); + + umpire::register_external_allocation( + source_data, umpire::util::AllocationRecord(source_data, size, rm.getAllocator("HOST").getAllocationStrategy(), + std::string("external array"))); + rm.copy(dest_data, source_data); + + return dest_data; +} + +/** + * @brief Helper function to deallocate memory allocated with umpire. 
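copy_data and the deallocate helper completed just below wrap umpire's ResourceManager so that element index arrays and similar buffers can be mirrored on the device. A usage sketch, assuming a device-enabled umpire build that provides the "DEVICE" allocator (the function and variable names in the sketch are illustrative):

    #include <vector>
    #include "serac/numerics/functional/tensor.hpp"

    void copy_data_example()
    {
      std::vector<int> element_ids = {0, 1, 2, 3};

      // registers the host pointer with umpire, allocates a DEVICE buffer of the
      // same size (in bytes), and copies the host data into it
      int* device_ids = serac::copy_data(element_ids.data(), sizeof(int) * element_ids.size(), "DEVICE");

      // ... hand device_ids to kernels running on the GPU ...

      serac::deallocate(device_ids, "DEVICE");
    }
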
+ */ +template +void deallocate(DataType* data, const std::string& destination) +{ + auto& rm = umpire::ResourceManager::getInstance(); + auto dest_allocator = rm.getAllocator(destination); + dest_allocator.deallocate(data); +} + } // namespace serac #if 0 @@ -2033,7 +2359,7 @@ inline mat < 2, 2 > look_at(const vec < 2 > & direction) { inline mat < 3, 3 > R3_basis(const vec3 & n) { float sign = (n[2] >= 0.0f) ? 1.0f : -1.0f; - float a = -1.0f / (sign + n[2]); + float a = -1.0f / (sign + n[2]); float b = n[0] * n[1] * a; return mat < 3, 3 >{ diff --git a/src/serac/numerics/functional/tests/CMakeLists.txt b/src/serac/numerics/functional/tests/CMakeLists.txt index 9118582f2..6fe996c24 100644 --- a/src/serac/numerics/functional/tests/CMakeLists.txt +++ b/src/serac/numerics/functional/tests/CMakeLists.txt @@ -5,6 +5,7 @@ # SPDX-License-Identifier: (BSD-3-Clause) set(functional_test_depends gtest serac_functional ${functional_depends}) +blt_list_append(TO functional_depends ELEMENTS blt::cuda IF ENABLE_CUDA) # This test is all constexpr-evaluated, so it doesn't # actually need to run, if it compiles without error, the tests have passed @@ -58,15 +59,21 @@ target_link_libraries(bug_boundary_qoi PUBLIC serac_physics) if(ENABLE_CUDA) - set(functional_cuda_test_sources - tensor_unit_tests_cuda.cu -# some of the GPU functionality is temporarily disabled to -# help incrementally roll-out the variadic implementation of Functional -# TODO: re-enable GPU kernels in a follow-up PR -# functional_comparisons_cuda.cu - ) + set(functional_tests_cuda + # functional_basic_hcurl.cpp + # functional_with_domain.cpp + functional_basic_h1_scalar.cpp + # functional_basic_h1_vector.cpp + # functional_multiphysics.cpp + # functional_qoi.cpp + # functional_nonlinear.cpp + # functional_boundary_test.cpp + functional_comparisons.cpp + # functional_comparison_L2.cpp + ) - serac_add_tests( SOURCES ${functional_cuda_test_sources} - DEPENDS_ON ${functional_test_depends}) + serac_add_tests( SOURCES ${functional_tests_cuda} + USE_CUDA ON + DEPENDS_ON gtest serac_functional serac_state ${functional_depends} blt::cuda) endif() diff --git a/src/serac/numerics/functional/tests/check_gradient.hpp b/src/serac/numerics/functional/tests/check_gradient.hpp index fb48070f3..c3cabe2a6 100644 --- a/src/serac/numerics/functional/tests/check_gradient.hpp +++ b/src/serac/numerics/functional/tests/check_gradient.hpp @@ -25,7 +25,8 @@ void check_gradient(serac::Functional& f, double t, mfem::Vector& U, do } dU.Randomize(seed); - auto [value, dfdU] = f(t, serac::differentiate_wrt(U)); + auto [value, dfdU] = f(t, serac::differentiate_wrt(U)); + std::unique_ptr dfdU_matrix = assemble(dfdU); // jacobian vector products @@ -54,6 +55,8 @@ void check_gradient(serac::Functional& f, double t, mfem::Vector& U, do // forward-difference approximations mfem::Vector df1_fd[2]; + df1_fd[0].UseDevice(true); + df1_fd[1].UseDevice(true); df1_fd[0] = f_values[4]; df1_fd[0] -= f_values[2]; df1_fd[0] /= (2.0 * epsilon); @@ -64,6 +67,8 @@ void check_gradient(serac::Functional& f, double t, mfem::Vector& U, do // center-difference approximations mfem::Vector df1_cd[2]; + df1_cd[0].UseDevice(true); + df1_cd[1].UseDevice(true); df1_cd[0] = f_values[4]; df1_cd[0] -= f_values[0]; df1_cd[0] /= (4.0 * epsilon); @@ -78,14 +83,14 @@ void check_gradient(serac::Functional& f, double t, mfem::Vector& U, do // halving epsilon should make the error decrease // by about a factor of two for the forward-difference stencil - double e1 = df1_fd[0].DistanceTo(df_jvp1.GetData()) / 
denominator; - double e2 = df1_fd[1].DistanceTo(df_jvp1.GetData()) / denominator; + double e1 = df_jvp1.DistanceTo(df1_fd[0].HostRead()) / denominator; + double e2 = df_jvp1.DistanceTo(df1_fd[1].HostRead()) / denominator; EXPECT_TRUE(fabs(e1 / e2 - 2.0) < 0.1 || fmin(e1, e2) < 1.0e-9); // halving epsilon should make the error decrease // by about a factor of four for the center-difference stencil - double e3 = df1_cd[0].DistanceTo(df_jvp1.GetData()) / denominator; - double e4 = df1_cd[1].DistanceTo(df_jvp1.GetData()) / denominator; + double e3 = df_jvp1.DistanceTo(df1_cd[0].HostRead()) / denominator; + double e4 = df_jvp1.DistanceTo(df1_cd[1].HostRead()) / denominator; EXPECT_TRUE((fabs(e3 / e4 - 4.0) < 0.1) || fmin(e3, e4) < 1.0e-9); } @@ -96,9 +101,11 @@ void check_gradient(serac::Functional& f, double t, const mfem::Vector& U, co int seed = 42; mfem::Vector dU(U.Size()); + dU.UseDevice(true); dU.Randomize(seed); mfem::Vector ddU_dt(U.Size()); + ddU_dt.UseDevice(true); ddU_dt.Randomize(seed + 1); { diff --git a/src/serac/numerics/functional/tests/functional_basic_h1_scalar.cpp b/src/serac/numerics/functional/tests/functional_basic_h1_scalar.cpp index 7a4fc6118..c3466add6 100644 --- a/src/serac/numerics/functional/tests/functional_basic_h1_scalar.cpp +++ b/src/serac/numerics/functional/tests/functional_basic_h1_scalar.cpp @@ -69,7 +69,7 @@ void thermal_test_impl(std::unique_ptr& mesh) auto [trial_fespace, trial_fec] = serac::generateParFiniteElementSpace(mesh.get()); mfem::Vector U(trial_fespace->TrueVSize()); - + U.UseDevice(true); mfem::ParGridFunction U_gf(trial_fespace.get()); mfem::FunctionCoefficient x_squared([](mfem::Vector x) { return x[0] * x[0]; }); U_gf.ProjectCoefficient(x_squared); @@ -115,7 +115,7 @@ TEST(mixed, thermal_tets_and_hexes) { thermal_test<2, 1>("/data/meshes/patch3D_t int main(int argc, char* argv[]) { - serac::accelerator::initializeDevice(); + serac::accelerator::initializeDevice(exec_space); ::testing::InitGoogleTest(&argc, argv); int num_procs, myid; @@ -128,7 +128,7 @@ int main(int argc, char* argv[]) int result = RUN_ALL_TESTS(); MPI_Finalize(); - serac::accelerator::terminateDevice(); + serac::accelerator::terminateDevice(exec_space); return result; } diff --git a/src/serac/numerics/functional/tests/functional_comparisons.cpp b/src/serac/numerics/functional/tests/functional_comparisons.cpp index 99f4ae9a2..952edf416 100644 --- a/src/serac/numerics/functional/tests/functional_comparisons.cpp +++ b/src/serac/numerics/functional/tests/functional_comparisons.cpp @@ -200,6 +200,9 @@ void functional_test(mfem::ParMesh& mesh, H1
<p, dim> test, H1<p, dim> trial, Dim EXPECT_NEAR(0., diff1.Norml2() / g1.Norml2(), 1.e-14); EXPECT_NEAR(0., diff2.Norml2() / g1.Norml2(), 1.e-14); +#ifdef SERAC_USE_CUDA_KERNEL_EVALUATION + printCUDAMemUsage(); +#endif } // this test sets up part of a toy "magnetic diffusion" problem where the residual includes contributions @@ -425,6 +431,9 @@ void functional_test(mfem::ParMesh& mesh, Hcurl<p> test, Hcurl<p>
trial, Dimensi } EXPECT_NEAR(0., diff1.Norml2() / g1.Norml2(), 1.e-13); EXPECT_NEAR(0., diff2.Norml2() / g1.Norml2(), 1.e-13); +#ifdef SERAC_USE_CUDA_KERNEL_EVALUATION + printCUDAMemUsage(); +#endif } #ifndef SERAC_USE_CUDA_KERNEL_EVALUATION @@ -455,7 +464,7 @@ TEST(Elasticity, 3DCubic) { functional_test(*mesh3D, H1<3, 3>{}, H1<3, 3>{}, Dim int main(int argc, char* argv[]) { - serac::accelerator::initializeDevice(); + serac::accelerator::initializeDevice(exec_space); ::testing::InitGoogleTest(&argc, argv); MPI_Init(&argc, &argv); MPI_Comm_size(MPI_COMM_WORLD, &num_procs); @@ -494,6 +503,6 @@ int main(int argc, char* argv[]) int result = RUN_ALL_TESTS(); MPI_Finalize(); - serac::accelerator::terminateDevice(); + serac::accelerator::terminateDevice(exec_space); return result; } diff --git a/src/serac/numerics/functional/tests/functional_shape_derivatives.cpp b/src/serac/numerics/functional/tests/functional_shape_derivatives.cpp index b79841bc9..e8ac9b70a 100644 --- a/src/serac/numerics/functional/tests/functional_shape_derivatives.cpp +++ b/src/serac/numerics/functional/tests/functional_shape_derivatives.cpp @@ -67,6 +67,7 @@ #include "mfem.hpp" #include "axom/slic/core/SimpleLogger.hpp" +#include "serac/infrastructure/accelerator.hpp" #include "serac/infrastructure/input.hpp" #include "serac/serac_config.hpp" #include "serac/mesh/mesh_utils_base.hpp" @@ -88,7 +89,7 @@ std::unique_ptr mesh2D; std::unique_ptr mesh3D; template -auto monomials(tensor X) +SERAC_HOST_DEVICE auto monomials(tensor X) { if constexpr (dim == 2) { tensor output; @@ -122,7 +123,7 @@ auto monomials(tensor X) } template -auto grad_monomials([[maybe_unused]] tensor X) +SERAC_HOST_DEVICE auto grad_monomials([[maybe_unused]] tensor X) { if constexpr (dim == 2) { tensor output; diff --git a/src/serac/numerics/functional/tests/functional_with_domain.cpp b/src/serac/numerics/functional/tests/functional_with_domain.cpp index cdefd63c5..4b0193ddf 100644 --- a/src/serac/numerics/functional/tests/functional_with_domain.cpp +++ b/src/serac/numerics/functional/tests/functional_with_domain.cpp @@ -159,6 +159,8 @@ void whole_mesh_comparison_test(std::string meshfile) } } +TEST(basic, whole_mesh_comparison_hexes) { whole_mesh_comparison_test<1, 1>("/data/meshes/patch3D_hexes.mesh"); } +#ifndef SERAC_USE_CUDA_KERNEL_EVALUATION TEST(basic, whole_mesh_comparison_tris) { whole_mesh_comparison_test<1, 1>("/data/meshes/patch2D_tris.mesh"); } TEST(basic, whole_mesh_comparison_quads) { whole_mesh_comparison_test<1, 1>("/data/meshes/patch2D_quads.mesh"); } TEST(basic, whole_mesh_comparison_tris_and_quads) @@ -167,7 +169,6 @@ TEST(basic, whole_mesh_comparison_tris_and_quads) } TEST(basic, whole_mesh_comparison_tets) { whole_mesh_comparison_test<1, 1>("/data/meshes/patch3D_tets.mesh"); } -TEST(basic, whole_mesh_comparison_hexes) { whole_mesh_comparison_test<1, 1>("/data/meshes/patch3D_hexes.mesh"); } TEST(basic, whole_mesh_comparison_tets_and_hexes) { whole_mesh_comparison_test<1, 1>("/data/meshes/patch3D_tets_and_hexes.mesh"); @@ -316,11 +317,14 @@ TEST(qoi, partial_domain) EXPECT_NEAR(volume, 4.0, 1.0e-14); } +#endif int main(int argc, char* argv[]) { ::testing::InitGoogleTest(&argc, argv); - +#ifdef SERAC_USE_CUDA_KERNEL_EVALUATION + serac::accelerator::initializeDevice(); +#endif int num_procs, myid; MPI_Init(&argc, &argv); MPI_Comm_size(MPI_COMM_WORLD, &num_procs); @@ -331,6 +335,8 @@ int main(int argc, char* argv[]) int result = RUN_ALL_TESTS(); MPI_Finalize(); - +#ifdef SERAC_USE_CUDA_KERNEL_EVALUATION + 
serac::accelerator::terminateDevice(); +#endif return result; } diff --git a/src/serac/numerics/functional/tuple.hpp b/src/serac/numerics/functional/tuple.hpp index bc2b571d0..c78a365cf 100644 --- a/src/serac/numerics/functional/tuple.hpp +++ b/src/serac/numerics/functional/tuple.hpp @@ -13,6 +13,7 @@ #include +#include "serac/serac_config.hpp" #include "serac/infrastructure/accelerator.hpp" namespace serac { @@ -186,10 +187,44 @@ template struct tuple_size { }; +template +struct tuple_size_ptr { +}; + template struct tuple_size> : std::integral_constant { }; +template +struct tuple_size_ptr> : std::integral_constant { +}; + +/** + * @brief returns the total number of stored values in a tensor + * + * @tparam T the datatype stored in the tensor + * @tparam n the extents of each dimension + * @return the total number of values stored in the tensor + */ +template +SERAC_HOST_DEVICE constexpr int size(const serac::tuple) +{ + return tuple_size>::value; +} + +/** + * @brief returns the total number of stored values in a tensor + * + * @tparam T the datatype stored in the tensor + * @tparam n the extents of each dimension + * @return the total number of values stored in the tensor + */ +template +SERAC_HOST_DEVICE constexpr int size(const serac::tuple) +{ + return tuple_size_ptr>::value; +} + /** * @tparam i the tuple index to access * @tparam T the types stored in the tuple @@ -566,8 +601,8 @@ SERAC_HOST_DEVICE constexpr auto operator*(const tuple& x, const tuple -SERAC_HOST_DEVICE constexpr auto mult_helper(const double a, const tuple& x, std::integer_sequence) +template +SERAC_HOST_DEVICE constexpr auto mult_helper(const double a, const TupleType& x, std::integer_sequence) { return tuple{a * get(x)...}; } @@ -581,8 +616,8 @@ SERAC_HOST_DEVICE constexpr auto mult_helper(const double a, const tuple& * @param a a constant multiplier * @return the returned tuple product */ -template -SERAC_HOST_DEVICE constexpr auto mult_helper(const tuple& x, const double a, std::integer_sequence) +template +SERAC_HOST_DEVICE constexpr auto mult_helper(const TupleType& x, const double a, std::integer_sequence) { return tuple{get(x) * a...}; } @@ -639,6 +674,33 @@ auto& operator<<(std::ostream& out, const serac::tuple& A) return print_helper(out, A, std::make_integer_sequence()); } +/** + * @brief Overload for the print function to print tuples or tensors of tuples. + Enables calling print(tensor, ...> a); + + * @tparam T The tuple types + * @tparam i The integer sequence to i + * @param A the tensor to print +*/ +template +SERAC_HOST_DEVICE void print(const serac::tuple& A, std::integer_sequence) +{ + (print(get(A)), ...); +} + +/** + * @brief Helper function for the above overload + + * @tparam T The tuple types + * @tparam i The integer sequence to i + * @param A the tensor to print +*/ +template +SERAC_HOST_DEVICE void print(const serac::tuple& A) +{ + print(A, std::make_integer_sequence()); +} + /** * @brief A helper to apply a lambda to a tuple * diff --git a/src/serac/numerics/functional/tuple_tensor_dual_functions.hpp b/src/serac/numerics/functional/tuple_tensor_dual_functions.hpp index e44a31597..3d335fb87 100644 --- a/src/serac/numerics/functional/tuple_tensor_dual_functions.hpp +++ b/src/serac/numerics/functional/tuple_tensor_dual_functions.hpp @@ -1,8 +1,17 @@ +// Copyright (c) 2019-2023, Lawrence Livermore National Security, LLC and +// other Serac Project Developers. See the top-level LICENSE file for +// details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) + #pragma once +#include "serac/infrastructure/accelerator.hpp" +#include "serac/serac_config.hpp" #include "serac/numerics/functional/tuple.hpp" #include "serac/numerics/functional/tensor.hpp" #include "serac/numerics/functional/dual.hpp" +#include "RAJA/RAJA.hpp" #include "mfem.hpp" @@ -212,26 +221,51 @@ SERAC_HOST_DEVICE auto promote_to_dual_when(const T& x) } /** - * @brief a function that optionally (decided at compile time) converts a list of values to their dual types + * @brief Deduce return type of promote_each_to_dual_when. Useful for type deduction while + * allocating memory * * @tparam dualify specify whether or not the input should be made into its dual type * @tparam T the type of the values passed in * @tparam n how many values were passed in - * @param x the values to be promoted */ template -SERAC_HOST_DEVICE auto promote_each_to_dual_when(const tensor& x) +auto promote_each_to_dual_when_output_helper(const tensor&) { if constexpr (dualify) { using return_type = decltype(make_dual(T{})); - tensor output; - for (int i = 0; i < n; i++) { - output[i] = make_dual(x[i]); - } - return output; + return tensor{}; } if constexpr (!dualify) { - return x; + return tensor{}; + } +} + +/** + * @brief a function that optionally (decided at compile time) converts a list of values to their dual types + * + * @tparam dualify specify whether or not the input should be made into its dual type + * @tparam T the type of the values passed in + * @tparam n how many values were passed in + * @param x the values to be promoted + * @param output_ptr pointer to output array + * @param ctx the RAJA launch context used to synchronize threads and required by the RAJA API. + */ +template +SERAC_HOST_DEVICE void promote_each_to_dual_when(const tensor& x, void* output_ptr, + RAJA::LaunchContext ctx = RAJA::LaunchContext{}) +{ + if constexpr (dualify) { + RAJA::RangeSegment x_range(0, n); + using return_type = decltype(make_dual(T{})); + auto casted_output_ptr = static_cast*>(output_ptr); + RAJA::loop::threads_t>( + ctx, x_range, [&](int i) { (*casted_output_ptr)[i] = make_dual(x[i]); }); + } + if constexpr (!dualify) { + RAJA::RangeSegment x_range(0, n); + auto casted_output_ptr = static_cast*>(output_ptr); + RAJA::loop::threads_t>(ctx, x_range, + [&](int i) { (*casted_output_ptr)[i] = x[i]; }); } } @@ -438,7 +472,8 @@ SERAC_HOST_DEVICE constexpr auto linear_solve(const tensor& A, const te if constexpr (is_zero{}) { return x; - } else { + } + if constexpr (!is_zero{}) { return make_dual(x, dx); } } @@ -799,7 +834,7 @@ SERAC_HOST_DEVICE tensor argsort(const tensor& v) * @note based on "A robust algorithm for finding the eigenvalues and * eigenvectors of 3x3 symmetric matrices", by Scherzinger & Dohrmann */ -inline SERAC_HOST_DEVICE tuple eig_symm(const mat3& A) +inline tuple eig_symm(const mat3& A) { // We know of optimizations for this routine. When this becomes the // bottleneck, we can revisit. See OptimiSM for details. 
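The linear_solve hunk above and the symmetric_mat3_function hunk that follows both replace an if constexpr / else pair with two mutually exclusive if constexpr blocks. The standalone sketch below shows the resulting control-flow shape with toy scalar types in place of serac's dual numbers; the motivation is not stated in the patch, but the pattern keeps each compile-time branch textually independent.

    // Illustrative only: mirrors the branch structure adopted above, not serac code.
    template <bool rhs_is_zero>
    constexpr auto solve_like(double x, double dx)
    {
      if constexpr (rhs_is_zero) {
        return x;  // analogous to returning the undifferentiated solution
      }
      if constexpr (!rhs_is_zero) {
        return x + dx;  // stand-in for make_dual(x, dx)
      }
    }

    static_assert(solve_like<true>(1.0, 2.0) == 1.0);   // first branch taken
    static_assert(solve_like<false>(1.0, 2.0) == 3.0);  // second branch taken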
@@ -971,7 +1006,8 @@ auto symmetric_mat3_function(tensor A, const Function& f, const EigvalS if constexpr (!is_dual_number::value) { return f_A; - } else { + } + if constexpr (is_dual_number::value) { return symmetric_mat3_function_with_derivative(A, f_A, lambda, Q, g); } } diff --git a/src/serac/physics/materials/liquid_crystal_elastomer.hpp b/src/serac/physics/materials/liquid_crystal_elastomer.hpp index 17c30f82a..b57686e9f 100644 --- a/src/serac/physics/materials/liquid_crystal_elastomer.hpp +++ b/src/serac/physics/materials/liquid_crystal_elastomer.hpp @@ -255,7 +255,7 @@ struct LiquidCrystalElastomerBertoldi { * @return The calculated material response (Cauchy stress) for the material */ template - SERAC_HOST_DEVICE auto operator()(State& /*state*/, const tensor& displacement_grad, + SERAC_HOST_DEVICE auto operator()(const State& /*state*/, const tensor& displacement_grad, OrderParamType inst_order_param_tuple, GammaAngleType gamma_tuple, EtaAngleType eta_tuple) const { diff --git a/src/serac/physics/materials/parameterized_solid_material.hpp b/src/serac/physics/materials/parameterized_solid_material.hpp index f812c4b3a..c5f963d57 100644 --- a/src/serac/physics/materials/parameterized_solid_material.hpp +++ b/src/serac/physics/materials/parameterized_solid_material.hpp @@ -38,7 +38,7 @@ struct ParameterizedLinearIsotropicSolid { * @return The calculated material response (Cauchy stress) for the material */ template - SERAC_HOST_DEVICE auto operator()(State& /*state*/, const serac::tensor& du_dX, + SERAC_HOST_DEVICE auto operator()(const State& /*state*/, const serac::tensor& du_dX, const BulkType& DeltaK, const ShearType& DeltaG) const { constexpr auto I = Identity(); @@ -81,7 +81,7 @@ struct ParameterizedNeoHookeanSolid { * @return The calculated material response (Cauchy stress) for the material */ template - SERAC_HOST_DEVICE auto operator()(State& /*state*/, const serac::tensor& du_dX, + SERAC_HOST_DEVICE auto operator()(const State& /*state*/, const serac::tensor& du_dX, const BulkType& DeltaK, const ShearType& DeltaG) const { using std::log1p; diff --git a/src/serac/physics/materials/solid_material.hpp b/src/serac/physics/materials/solid_material.hpp index 1b0b2d7b8..5fac54b87 100644 --- a/src/serac/physics/materials/solid_material.hpp +++ b/src/serac/physics/materials/solid_material.hpp @@ -36,7 +36,7 @@ struct LinearIsotropic { * @return The Cauchy stress */ template - SERAC_HOST_DEVICE auto operator()(State& /* state */, const tensor& du_dX) const + SERAC_HOST_DEVICE auto operator()(const State& /* state */, const tensor& du_dX) const { auto I = Identity(); auto lambda = K - (2.0 / 3.0) * G; @@ -72,7 +72,7 @@ struct StVenantKirchhoff { * @return The Cauchy stress */ template - auto operator()(State&, const tensor& grad_u) const + auto operator()(const State&, const tensor& grad_u) const { static constexpr auto I = Identity(); auto F = grad_u + I; @@ -110,7 +110,7 @@ struct NeoHookean { * @return The Cauchy stress */ template - SERAC_HOST_DEVICE auto operator()(State& /* state */, const tensor& du_dX) const + SERAC_HOST_DEVICE auto operator()(const State& /* state */, const tensor& du_dX) const { using std::log1p; constexpr auto I = Identity(); diff --git a/src/serac/physics/tests/quasistatic_solid_adjoint.cpp b/src/serac/physics/tests/quasistatic_solid_adjoint.cpp index bf13b2868..5962f5686 100644 --- a/src/serac/physics/tests/quasistatic_solid_adjoint.cpp +++ b/src/serac/physics/tests/quasistatic_solid_adjoint.cpp @@ -22,7 +22,7 @@ struct ParameterizedLinearIsotropicSolid { 
using State = ::serac::Empty; ///< this material has no internal variables template - SERAC_HOST_DEVICE auto operator()(State&, const ::serac::tensor& u_grad, const T2& E_tuple, + SERAC_HOST_DEVICE auto operator()(const State&, const ::serac::tensor& u_grad, const T2& E_tuple, const T3& v_tuple) const { auto E = ::serac::get<0>(E_tuple); // Young's modulus VALUE @@ -40,7 +40,7 @@ struct ParameterizedNeoHookeanSolid { using State = ::serac::Empty; // this material has no internal variables template - SERAC_HOST_DEVICE auto operator()(State&, const ::serac::tensor& du_dX, const T2& E_tuple, + SERAC_HOST_DEVICE auto operator()(const State&, const ::serac::tensor& du_dX, const T2& E_tuple, const T3& v_tuple) const { using std::log1p;
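The hunks in quasistatic_solid_adjoint.cpp, like the material headers before them, make the same mechanical change: the state argument of the material response operator becomes const State&. Below is a toy functor in that style, loosely modeled on the LinearIsotropic response earlier in the patch; the struct name, the include, and the exact stress expression are illustrative assumptions, not serac's material model.

    #include "serac/physics/materials/solid_material.hpp"  // assumed: brings in serac::Empty, tensor utilities, SERAC_HOST_DEVICE

    struct ToyLinearMaterial {
      using State = serac::Empty;  // no internal variables, mirroring the materials above
      double K;                    // bulk modulus
      double G;                    // shear modulus

      template <typename T, int dim>
      SERAC_HOST_DEVICE auto operator()(const State& /* state */, const serac::tensor<T, dim, dim>& du_dX) const
      {
        auto I      = serac::Identity<dim>();
        auto lambda = K - (2.0 / 3.0) * G;
        auto strain = 0.5 * (du_dX + serac::transpose(du_dX));
        return lambda * serac::tr(strain) * I + 2.0 * G * strain;
      }
    };

Taking the state by const reference simply documents that these particular models never write to their internal-variable argument.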