From c71e181d961f733b5b8092fd07903f2bece5e27d Mon Sep 17 00:00:00 2001 From: Jake Koester Date: Wed, 2 Oct 2024 16:14:00 -0600 Subject: [PATCH 01/13] update log text --- include/ElementProcessor.h | 12 +++-- include/ElementReproducingKernel.h | 5 ++ include/NeighborSearchProcessor.h | 78 ++++++++---------------------- protego-mech | 2 +- src/Application.cpp | 39 +++++++++++++-- src/InitialConditionUtil.cpp | 4 +- src/MassUtils.cpp | 2 +- src/Solver.cpp | 14 ++++-- src/main.cpp | 7 +++ 9 files changed, 90 insertions(+), 73 deletions(-) diff --git a/include/ElementProcessor.h b/include/ElementProcessor.h index 82184cd7..0ca4cb8c 100644 --- a/include/ElementProcessor.h +++ b/include/ElementProcessor.h @@ -2,6 +2,7 @@ #include #include +#include #include #include #include @@ -565,7 +566,8 @@ class StrainSmoothingProcessor { - The node function derivatives need to be computed. */ - aperi::CoutP0() << "Building smoothed cell data." << std::endl; + aperi::CoutP0() << " - Building Smoothed Cell Data." << std::endl; + auto start_time = std::chrono::high_resolution_clock::now(); // Create the cells selector std::vector cells_sets; @@ -799,10 +801,10 @@ class StrainSmoothingProcessor { } } average_num_nodes /= static_cast(num_cells); - aperi::CoutP0() << "Average number of points defining a cell: " << average_num_nodes << std::endl; + aperi::CoutP0() << " - Average number of points defining a cell: " << average_num_nodes << std::endl; if (one_pass_method) { average_num_neighbors /= static_cast(num_cells); - aperi::CoutP0() << "Average number of neighbors for a cell: " << average_num_neighbors << std::endl; + aperi::CoutP0() << " - Average number of neighbors for a cell: " << average_num_neighbors << std::endl; } bool set_start_from_lengths = false; // The start array is already set above. This can be done as we are on host and looping through sequentially. smoothed_cell_data->CompleteAddingCellNodeIndicesOnHost(set_start_from_lengths); @@ -810,6 +812,10 @@ class StrainSmoothingProcessor { assert(CheckPartitionOfNullity(smoothed_cell_data)); + auto end_time = std::chrono::high_resolution_clock::now(); + auto duration = std::chrono::duration_cast(end_time - start_time); + aperi::CoutP0() << " Finished building Smoothed Cell Data. Time: " << duration.count() << " ms." << std::endl; + return smoothed_cell_data; } diff --git a/include/ElementReproducingKernel.h b/include/ElementReproducingKernel.h index 202b4ace..b6ec3337 100644 --- a/include/ElementReproducingKernel.h +++ b/include/ElementReproducingKernel.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include #include #include @@ -65,6 +66,8 @@ class ElementReproducingKernel : public ElementBase { } void ComputeAndStoreFunctionValues() { + aperi::CoutP0() << " - Computing and storing function values" << std::endl; + auto start_function_values = std::chrono::high_resolution_clock::now(); // Functor for computing shape function values at nodes size_t compute_node_functions_functor_size = sizeof(ShapeFunctionsFunctorReproducingKernel); auto compute_node_functions_functor = (ShapeFunctionsFunctorReproducingKernel *)Kokkos::kokkos_malloc(compute_node_functions_functor_size); @@ -86,6 +89,8 @@ class ElementReproducingKernel : public ElementBase { }); Kokkos::kokkos_free(compute_node_functions_functor); + auto end_function_values = std::chrono::high_resolution_clock::now(); + aperi::CoutP0() << " Finished Computing and Storing Function Values. 
Time: " << std::chrono::duration_cast(end_function_values - start_function_values).count() << " ms" << std::endl; } /** diff --git a/include/NeighborSearchProcessor.h b/include/NeighborSearchProcessor.h index dc31e7fb..2c496e07 100644 --- a/include/NeighborSearchProcessor.h +++ b/include/NeighborSearchProcessor.h @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include @@ -470,6 +471,7 @@ class NeighborSearchProcessor { } void DoBallSearch(bool populate_debug_fields = false) { + auto start_search_time = std::chrono::high_resolution_clock::now(); DomainViewType node_points = CreateNodePoints(); RangeViewType node_spheres = CreateNodeSpheres(); @@ -480,36 +482,21 @@ class NeighborSearchProcessor { const bool results_parallel_symmetry = true; stk::search::coarse_search(node_points, node_spheres, search_method, m_bulk_data->parallel(), search_results, exec_space, results_parallel_symmetry); + auto end_search_time = std::chrono::high_resolution_clock::now(); + aperi::CoutP0() << " - Time to search: " << std::chrono::duration_cast(end_search_time - start_search_time).count() << " ms" << std::endl; + auto start_copy_and_ghost_time = std::chrono::high_resolution_clock::now(); ResultViewType::HostMirror host_search_results = Kokkos::create_mirror_view(search_results); Kokkos::deep_copy(host_search_results, search_results); - // Print sizes - aperi::CoutP0() << "Neighborhood Search Information:" << std::endl; - aperi::Cout() << "\n Search Point-Sphere Pair Results Size: " << host_search_results.size() - << "\n Evaluation Points Size: " << node_points.size() - << "\n Neighbor Spheres Size: " << node_spheres.size() << std::endl; - - // Print for debugging - // aperi::CoutP0() << "Search points:" << std::endl; - // for (size_t i = 0; i < node_points.size(); ++i) { - // auto point = node_points(i); - // aperi::CoutP0() << "point: " << point.box << std::endl; - // } - // aperi::CoutP0() << "Search spheres:" << std::endl; - // for (size_t i = 0; i < node_spheres.size(); ++i) { - // auto sphere = node_spheres(i); - // aperi::CoutP0() << "sphere: " << sphere.box.center() << " radius: " << sphere.box.radius() << std::endl; - // } - // aperi::CoutP0() << "Search results:" << std::endl; - // for (size_t i = 0; i < host_search_results.size(); ++i) { - // auto result = host_search_results(i); - // aperi::CoutP0() << "domain: " << result.domainIdentProc.id() << " range: " << result.rangeIdentProc.id() << std::endl; - // } - GhostNodeNeighbors(host_search_results); + auto end_copy_and_ghost_time = std::chrono::high_resolution_clock::now(); + aperi::CoutP0() << " - Time to copy and ghost: " << std::chrono::duration_cast(end_copy_and_ghost_time - start_copy_and_ghost_time).count() << " ms" << std::endl; + auto start_unpack_time = std::chrono::high_resolution_clock::now(); UnpackSearchResultsIntoField(host_search_results); + auto end_unpack_time = std::chrono::high_resolution_clock::now(); + aperi::CoutP0() << " - Time to unpack: " << std::chrono::duration_cast(end_unpack_time - start_unpack_time).count() << " ms" << std::endl; // Check the validity of the neighbors field assert(CheckAllNeighborsAreWithinKernelRadius()); @@ -519,39 +506,15 @@ class NeighborSearchProcessor { if (populate_debug_fields) { PopulateDebugFields(); } - - // FastMeshIndicesViewType node_indices = GetLocalEntityIndices(stk::topology::NODE_RANK, m_selector); - - // int rank; - // MPI_Comm_rank(MPI_COMM_WORLD, &rank); - // int num_procs; - // MPI_Comm_size(MPI_COMM_WORLD, &num_procs); - - // auto ngp_mesh = m_ngp_mesh; - 
// for (int i = 0; i < num_procs; ++i){ - // if (rank == i){ - // // Print local offset - // std::cout << "NSP Rank: " << rank << std::endl; - // stk::mesh::for_each_entity_run( - // ngp_mesh, stk::topology::NODE_RANK, m_selector, - // KOKKOS_LAMBDA(const stk::mesh::FastMeshIndex &node_index) { - // stk::mesh::Entity node = ngp_mesh.get_entity(stk::topology::NODE_RANK, node_index); - // std::cout << "local_offset: " << node.local_offset() << std::endl; - // std::cout << "node_id: " << ngp_mesh.identifier(node) << std::endl; - // }); - - // // Print node_indices - // for (size_t i = 0; i < node_indices.size(); ++i) { - // std::cout << "node_indices: " << node_indices(i).bucket_id << "-" << node_indices(i).bucket_ord << std::endl; - // } - // } - // MPI_Barrier(MPI_COMM_WORLD); - // } } void add_nodes_neighbors_within_variable_ball(double scale_factor, bool populate_debug_fields = false) { + aperi::CoutP0() << " - Finding Neighbors." << std::endl; + auto start_time = std::chrono::high_resolution_clock::now(); ComputeKernelRadius(scale_factor); DoBallSearch(populate_debug_fields); + auto end_time = std::chrono::high_resolution_clock::now(); + aperi::CoutP0() << " - Time to find neighbors: " << std::chrono::duration_cast(end_time - start_time).count() << " ms" << std::endl; } void add_nodes_neighbors_within_constant_ball(double ball_radius, bool populate_debug_fields = false) { @@ -608,13 +571,12 @@ class NeighborSearchProcessor { // Node std::map node_stats = GetNumNeighborStats(); - aperi::CoutP0() << "Node Stats: " << std::endl; - aperi::CoutP0() << " Total Num Nodes: " << node_stats["num_entities"] << std::endl; - aperi::CoutP0() << " Max Num Neighbors: " << node_stats["max_num_neighbors"] << std::endl; - aperi::CoutP0() << " Min Num Neighbors: " << node_stats["min_num_neighbors"] << std::endl; - aperi::CoutP0() << " Avg Num Neighbors: " << node_stats["avg_num_neighbors"] << std::endl; - aperi::CoutP0() << " Reserved Memory Utilization: " << node_stats["reserved_memory_utilization"] << "%" << std::endl - << std::endl; // Add a new line for readability + aperi::CoutP0() << " - Neighbor Stats: " << std::endl; + aperi::CoutP0() << " - Total Num Nodes: " << node_stats["num_entities"] << std::endl; + aperi::CoutP0() << " - Max Num Neighbors: " << node_stats["max_num_neighbors"] << std::endl; + aperi::CoutP0() << " - Min Num Neighbors: " << node_stats["min_num_neighbors"] << std::endl; + aperi::CoutP0() << " - Avg Num Neighbors: " << node_stats["avg_num_neighbors"] << std::endl; + aperi::CoutP0() << " - Reserved Memory Utilization: " << node_stats["reserved_memory_utilization"] << "%" << std::endl; } void SyncFieldsToHost() { diff --git a/protego-mech b/protego-mech index 4a259d91..e07d0f18 160000 --- a/protego-mech +++ b/protego-mech @@ -1 +1 @@ -Subproject commit 4a259d9171f9a792e98eabd09dbb5555fcde758c +Subproject commit e07d0f18f3504089eeb5313df1e23d3e00aa3dbb diff --git a/src/Application.cpp b/src/Application.cpp index 25e4f64c..0f17af96 100644 --- a/src/Application.cpp +++ b/src/Application.cpp @@ -2,6 +2,8 @@ #include +#include + #include "BoundaryCondition.h" #include "ExternalForceContribution.h" #include "FieldData.h" @@ -20,6 +22,10 @@ namespace aperi { void Application::Run(const std::string& input_filename) { + aperi::CoutP0() << "############################################" << std::endl; + aperi::CoutP0() << "Starting Application" << std::endl; + aperi::CoutP0() << " - Reading Input File and Mesh" << std::endl; + auto start_mesh_read = std::chrono::high_resolution_clock::now(); // 
TODO(jake): hard coding to 1 procedure for now. Fix this when we have multiple procedures. int procedure_id = 0; @@ -48,12 +54,18 @@ void Application::Run(const std::string& input_filename) { // Read the mesh m_io_mesh->ReadMesh(m_io_input_file->GetMeshFile(procedure_id), part_names); + auto end_mesh_read = std::chrono::high_resolution_clock::now(); + aperi::CoutP0() << " Finished Reading Input and Mesh. Time: " << std::chrono::duration_cast(end_mesh_read - start_mesh_read).count() << " ms" << std::endl; + aperi::CoutP0() << " - Setting up for the Solver" << std::endl; + auto start_solver_setup = std::chrono::high_resolution_clock::now(); bool uses_generalized_fields = false; // Loop over parts, create materials, and add parts to force contributions + aperi::CoutP0() << " - Adding parts to force contributions: " << std::endl; for (const auto& part : parts) { // Create InternalForceContributionParameters + aperi::CoutP0() << " " << part["set"].as() << std::endl; InternalForceContributionParameters internal_force_contribution_parameters(part, m_io_input_file, m_io_mesh->GetMeshData()); m_internal_force_contributions.push_back(CreateInternalForceContribution(internal_force_contribution_parameters)); uses_generalized_fields = internal_force_contribution_parameters.approximation_space_parameters->UsesGeneralizedFields() || uses_generalized_fields; @@ -69,25 +81,38 @@ void Application::Run(const std::string& input_filename) { field_data.insert(field_data.end(), mesh_labeler_field_data.begin(), mesh_labeler_field_data.end()); // Add fields to the mesh and complete initialization + aperi::CoutP0() << " - Adding fields to the mesh and completing initialization" << std::endl; + auto start_complete_initialization = std::chrono::high_resolution_clock::now(); m_io_mesh->AddFields(field_data); m_io_mesh->CompleteInitialization(); + auto end_complete_initialization = std::chrono::high_resolution_clock::now(); + aperi::CoutP0() << " Finished adding fields to the mesh and completing initialization. Time: " << std::chrono::duration_cast(end_complete_initialization - start_complete_initialization).count() << " ms" << std::endl; // Label the mesh + aperi::CoutP0() << " - Labeling the mesh" << std::endl; + auto start_labeling = std::chrono::high_resolution_clock::now(); for (const auto& part : parts) { MeshLabelerParameters mesh_labeler_parameters(part, m_io_mesh->GetMeshData()); mesh_labeler->LabelPart(mesh_labeler_parameters); } + auto end_labeling = std::chrono::high_resolution_clock::now(); + aperi::CoutP0() << " Finished labeling the mesh. Time: " << std::chrono::duration_cast(end_labeling - start_labeling).count() << " ms" << std::endl; // Create the field results file + aperi::CoutP0() << " - Creating the field results file" << std::endl; + auto start_field_results_file = std::chrono::high_resolution_clock::now(); m_io_mesh->CreateFieldResultsFile(m_io_input_file->GetOutputFile(procedure_id), field_data); + auto end_field_results_file = std::chrono::high_resolution_clock::now(); + aperi::CoutP0() << " Finished creating the field results file. 
Time: " << std::chrono::duration_cast(end_field_results_file - start_field_results_file).count() << " ms" << std::endl; // Get loads std::vector loads = m_io_input_file->GetLoads(procedure_id); // Loop over loads and add them to force contributions + aperi::CoutP0() << " - Adding loads to force contributions: " << std::endl; for (auto load : loads) { auto name = load.begin()->first.as(); - aperi::CoutP0() << "Adding load " << name << " to force contributions" << std::endl; + aperi::CoutP0() << " " << name << std::endl; m_external_force_contributions.push_back(CreateExternalForceContribution(load, m_io_mesh->GetMeshData())); } @@ -99,9 +124,10 @@ void Application::Run(const std::string& input_filename) { std::vector boundary_conditions = m_io_input_file->GetBoundaryConditions(procedure_id); // Loop over boundary conditions and add them to the vector of boundary conditions + aperi::CoutP0() << " - Adding boundary conditions: " << std::endl; for (auto boundary_condition : boundary_conditions) { auto name = boundary_condition.begin()->first.as(); - aperi::CoutP0() << "Adding boundary condition " << name << " to boundary conditions" << std::endl; + aperi::CoutP0() << " " << name << std::endl; m_boundary_conditions.push_back(aperi::CreateBoundaryCondition(boundary_condition, m_io_mesh->GetMeshData())); } @@ -116,11 +142,15 @@ void Application::Run(const std::string& input_filename) { // Create solver m_solver = aperi::CreateSolver(m_io_mesh, m_internal_force_contributions, m_external_force_contributions, m_boundary_conditions, time_stepper, output_scheduler); + auto end_solver_setup = std::chrono::high_resolution_clock::now(); + aperi::CoutP0() << " Finished Setting up for the Solver. Time: " << std::chrono::duration_cast(end_solver_setup - start_solver_setup).count() << " ms" << std::endl; // Run solver - aperi::CoutP0() << "Starting Solver" << std::endl; + aperi::CoutP0() << " - Starting Solver" << std::endl; + auto start_solver = std::chrono::high_resolution_clock::now(); m_solver->Solve(); - aperi::CoutP0() << "Finished Solver" << std::endl; + auto end_solver = std::chrono::high_resolution_clock::now(); + aperi::CoutP0() << " Finished Solver. 
Time: " << std::chrono::duration_cast(end_solver - start_solver).count() << " ms" << std::endl; // Finalize Finalize(); @@ -130,6 +160,7 @@ void Application::Finalize() { MPI_Barrier(m_comm); m_io_mesh->Finalize(); aperi::CoutP0() << "Application Finalized" << std::endl; + aperi::CoutP0() << "############################################" << std::endl; } } // namespace aperi diff --git a/src/InitialConditionUtil.cpp b/src/InitialConditionUtil.cpp index 0853dd16..4c6a99b6 100644 --- a/src/InitialConditionUtil.cpp +++ b/src/InitialConditionUtil.cpp @@ -51,11 +51,11 @@ void AddInitialConditions(std::vector& initial_conditions, const std const auto field = initial_condition.begin()->first.as() + "_coefficients"; // Loop over sets from initial condition - aperi::CoutP0() << "Adding initial condition for sets:" << std::endl; + aperi::CoutP0() << " - Adding initial condition for sets:" << std::endl; std::vector sets; if (initial_condition_node["sets"]) { sets = initial_condition_node["sets"].as>(); - aperi::CoutP0() << " " << sets.back() << std::endl; + aperi::CoutP0() << " " << sets.back() << std::endl; } // Set the initial field values diff --git a/src/MassUtils.cpp b/src/MassUtils.cpp index 30390df2..511d1499 100644 --- a/src/MassUtils.cpp +++ b/src/MassUtils.cpp @@ -154,7 +154,7 @@ double ComputeMassMatrix(const std::shared_ptr &mesh_data, cons // Parallel sum double mass_sum_global = node_processor.GetFieldSumHost(0) / 3.0; // Divide by 3 to get the mass per node as the mass is on the 3 DOFs assert(CheckMassSumsAreEqual(mass_sum_global, node_processor.GetFieldSumHost(1) / 3.0)); - aperi::CoutP0() << "Total Mass for Part " << part_name << ": " << mass_sum_global << std::endl; + aperi::CoutP0() << " " << part_name << ", Mass: " << mass_sum_global << std::endl; return mass_sum_global; } diff --git a/src/Solver.cpp b/src/Solver.cpp index 6ec8701d..d70b7e91 100644 --- a/src/Solver.cpp +++ b/src/Solver.cpp @@ -214,10 +214,18 @@ void LogEvent(const size_t n, const double time, const double average_runtime, c } double ExplicitSolver::Solve() { + // Print the number of nodes + size_t num_nodes = mp_mesh_data->GetNumNodes(); + aperi::CoutP0() << " - Number of nodes: " << num_nodes << std::endl; + // Compute mass matrix + aperi::CoutP0() << " - Computing mass matrix for parts:" << std::endl; + auto start_mass_matrix = std::chrono::high_resolution_clock::now(); for (const auto &internal_force_contribution : m_internal_force_contributions) { ComputeMassMatrix(mp_mesh_data, internal_force_contribution->GetPartName(), internal_force_contribution->GetMaterial()->GetDensity(), internal_force_contribution->UsesGeneralizedFields()); } + auto end_mass_matrix = std::chrono::high_resolution_clock::now(); + aperi::CoutP0() << " Finished Computing Mass Matrix. 
Time: " << std::chrono::duration_cast(end_mass_matrix - start_mass_matrix).count() << " ms" << std::endl; // Create node processors for each step of the time integration algorithm // The node processors are used to loop over the degrees of freedom (dofs) of the mesh and apply the time integration algorithm to each dof @@ -245,11 +253,9 @@ double ExplicitSolver::Solve() { double total_runtime = 0.0; double average_runtime = 0.0; - // Print the number of nodes - size_t num_nodes = mp_mesh_data->GetNumNodes(); - aperi::CoutP0() << "Number of Nodes: " << num_nodes << std::endl; - // Print the table header before the loop + aperi::CoutP0() << std::endl + << "Marching through time steps:" << std::endl; LogHeader(); // Create a scheduler for logging, outputting every 2 seconds. TODO(jake): Make this configurable in input file diff --git a/src/main.cpp b/src/main.cpp index deb5f6e9..bd837f51 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -66,6 +66,9 @@ int main(int argc, char* argv[]) { // Print header and number of processes PrintHeader(); aperi::CoutP0() << "Running on " << size << " processes." << std::endl; + aperi::CoutP0() << "Date: " << __DATE__ << std::endl; + aperi::CoutP0() << "Time: " << __TIME__ << std::endl + << std::endl; // Check if input filename is provided as a command-line argument if (argc < 2) { @@ -81,6 +84,10 @@ int main(int argc, char* argv[]) { RunApplication(input_filename, p_comm); aperi::CoutP0() << "aperi-mech finished successfully!" << std::endl; + aperi::CoutP0() << "Date: " << __DATE__ << std::endl; + aperi::CoutP0() << "Time: " << __TIME__ << std::endl + << std::endl; + aperi::CoutP0() << "############################################" << std::endl; // Finalize Kokkos and MPI Kokkos::finalize(); From ac98a143f588b8cab9c716bdd9d644ebbcd286e7 Mon Sep 17 00:00:00 2001 From: Jake Koester Date: Sun, 6 Oct 2024 08:01:10 -0600 Subject: [PATCH 02/13] add time tracker for updating field states --- src/Solver.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/Solver.cpp b/src/Solver.cpp index d70b7e91..da78354f 100644 --- a/src/Solver.cpp +++ b/src/Solver.cpp @@ -252,6 +252,8 @@ double ExplicitSolver::Solve() { // Initialize total runtime, average runtime, for benchmarking double total_runtime = 0.0; double average_runtime = 0.0; + double total_update_field_states_runtime = 0.0; + double average_update_field_states_runtime = 0.0; // Print the table header before the loop aperi::CoutP0() << std::endl @@ -279,6 +281,10 @@ double ExplicitSolver::Solve() { // Move state n+1 to state n UpdateFieldStates(); + auto end_update_field_states = std::chrono::high_resolution_clock::now(); + std::chrono::duration update_field_states_runtime = end_update_field_states - start_time; + total_update_field_states_runtime += update_field_states_runtime.count(); + average_update_field_states_runtime = total_update_field_states_runtime / n; double half_time_increment = 0.5 * time_increment; double time_midstep = time + half_time_increment; @@ -335,6 +341,7 @@ double ExplicitSolver::Solve() { } LogEvent(n, time, average_runtime, "End of Simulation"); LogFooter(); + aperi::CoutP0() << " - Average Update Field States Runtime: " << average_update_field_states_runtime << " seconds" << std::endl; return average_runtime; } From c87a66e4cf213275759237ae919d331c435904c1 Mon Sep 17 00:00:00 2001 From: Jake Koester Date: Sun, 6 Oct 2024 12:17:13 -0600 Subject: [PATCH 03/13] add scaling tests to performance tests. leave them disabled so they can be run manually. 
--- ...reApproximationFunctionPerformanceTest.cpp | 210 +++++++----- .../gtests/SolverPerformanceTest.cpp | 324 ++++++++++++------ 2 files changed, 336 insertions(+), 198 deletions(-) diff --git a/test/performance_tests/gtests/CompadreApproximationFunctionPerformanceTest.cpp b/test/performance_tests/gtests/CompadreApproximationFunctionPerformanceTest.cpp index fb88fe9a..9136d7e6 100644 --- a/test/performance_tests/gtests/CompadreApproximationFunctionPerformanceTest.cpp +++ b/test/performance_tests/gtests/CompadreApproximationFunctionPerformanceTest.cpp @@ -7,7 +7,105 @@ #include "CompadreApproximationFunctionTestFixture.h" -TEST_F(CompadreApproximationFunctionTest, PerformanceBenchmark) { +class CompadreApproximationFunctionPerformanceTest : public CompadreApproximationFunctionTest { + public: + void RunStandardPerformanceTest(size_t num_procs, bool using_gpu, const std::string &mode, size_t num_refinements, size_t initial_num_elem_x, size_t initial_num_elem_y, size_t initial_num_elem_z, size_t refinement_factor, std::string json_prefix) { + // Vectors to store the number of nodes and runtimes + std::vector num_nodes; + std::vector> runtimes; + + // Mesh size vectors + std::vector num_elem_x; + std::vector num_elem_y; + std::vector num_elem_z; + + num_elem_x.push_back(initial_num_elem_x); + num_elem_y.push_back(initial_num_elem_y); + num_elem_z.push_back(initial_num_elem_z); + + // Calculate the number of nodes + num_nodes.push_back((num_elem_x.back() + 1) * (num_elem_y.back() + 1) * (num_elem_z.back() + 1)); + + for (size_t i = 1; i < num_refinements; ++i) { + // Next refinement mesh + num_elem_x.push_back(num_elem_x.back() * refinement_factor); + num_elem_y.push_back(num_elem_y.back() * refinement_factor); + num_elem_z.push_back(num_elem_z.back() * refinement_factor); + num_nodes.push_back((num_elem_x.back() + 1) * (num_elem_y.back() + 1) * (num_elem_z.back() + 1)); + } + + // Print the run settings before running + std::cout << "Initial mesh size: " << initial_num_elem_x << "x" << initial_num_elem_y << "x" << initial_num_elem_z << std::endl; + std::cout << "Refinement factor: " << refinement_factor << std::endl; + std::cout << "Number of refinements: " << num_refinements << std::endl; + std::cout << "--------------------------------------------------------------------------------" << std::endl; + std::cout << std::setw(20) << "Mesh size" << std::setw(20) << "Number of nodes" << std::setw(20) << std::endl; + for (size_t i = 0; i < num_refinements; ++i) { + std::cout << std::setw(20) << std::to_string(num_elem_x[i]) + "x" + std::to_string(num_elem_y[i]) + "x" + std::to_string(num_elem_z[i]) + << std::setw(20) << num_nodes[i] << std::endl; + } + std::cout << "--------------------------------------------------------------------------------" << std::endl; + + // Create the json file + std::string test_suite_name = ::testing::UnitTest::GetInstance()->current_test_info()->test_suite_name(); + std::string test_name = ::testing::UnitTest::GetInstance()->current_test_info()->name(); + std::string full_test_name = test_suite_name + "_" + test_name; + + std::string run_specs = "_" + mode + "_procs_" + std::to_string(num_procs) + (using_gpu ? 
"_gpu" : "_cpu"); + std::ofstream json_file(json_prefix + "_gtest_" + full_test_name + run_specs + ".json"); + json_file << "[" << std::endl; + + for (size_t i = 0; i < num_refinements; ++i) { + // Set the kernel factor + m_kernel_factor = 1.1; + + // Create the next refinement + m_num_elements_x = num_elem_x[i]; + m_num_elements_y = num_elem_y[i]; + m_num_elements_z = num_elem_z[i]; + + // Create the mesh and processors + double time_to_create_mesh_and_processors = CreateMeshAndProcessors(m_num_elements_x, m_num_elements_y, m_num_elements_z, ""); // Hex mesh + aperi::CoutP0() << " Runtime, Create mesh and processors: " << time_to_create_mesh_and_processors << " s" << std::endl; + + // Make sure the coordinates are not on a grid + // RandomizeCoordinates(*m_mesh_data, -0.05, 0.05); + + // Test building approximation functions + std::map this_runtimes = TestBuildingApproximationFunctions(); + + // Print the runtimes + for (auto &runtime : this_runtimes) { + aperi::CoutP0() << " Runtime, " << runtime.first << ": " << runtime.second << " s" << std::endl; + } + // Name of the benchmark: pair.first: num_procs processors, cpu/gpu, hostname, num_elem_x x num_elem_y x num_elem_z elements, runtime" + std::string name_end = std::to_string(num_procs) + " processors, " + (using_gpu ? "gpu" : "cpu") + ", " + std::to_string(num_elem_x[i]) + " x " + std::to_string(num_elem_y[i]) + " x " + std::to_string(num_elem_z[i]) + " elements, runtime"; + std::string unit = "seconds"; + int i_runtimes = 0; + for (auto &runtime : this_runtimes) { + // Output the results to a json file + if (i != 0 || i_runtimes != 0) { + json_file << " }," << std::endl; // close the previous benchmark + } + i_runtimes++; + json_file << " {" << std::endl; + json_file << R"( "name": ")" << runtime.first << ": " << name_end << R"(",)" << std::endl; + json_file << R"( "unit": ")" << unit << R"(",)" << std::endl; + json_file << R"( "value": )" << runtime.second << std::endl; + } + + // Setup for the next refinement + ResetCompadreApproximationFunction(); + } + + // Close the json file + json_file << " }" << std::endl; // close the last benchmark + json_file << "]" << std::endl; // close the json file + json_file.close(); + } +}; + +TEST_F(CompadreApproximationFunctionPerformanceTest, BenchmarkFunctionComputations) { bool using_gpu = Kokkos::DefaultExecutionSpace::concurrency() > 1; int num_procs; @@ -34,97 +132,33 @@ TEST_F(CompadreApproximationFunctionTest, PerformanceBenchmark) { num_refinements = 1; // Do less refinements in debug mode as it takes longer } #endif + RunStandardPerformanceTest(num_procs, using_gpu, mode, num_refinements, initial_num_elem_x, initial_num_elem_y, initial_num_elem_z, refinement_factor, "performance"); +} - // Vectors to store the number of nodes and runtimes - std::vector num_nodes; - std::vector> runtimes; - - // Mesh size vectors - std::vector num_elem_x; - std::vector num_elem_y; - std::vector num_elem_z; - - num_elem_x.push_back(initial_num_elem_x); - num_elem_y.push_back(initial_num_elem_y); - num_elem_z.push_back(initial_num_elem_z); - - // Calculate the number of nodes - num_nodes.push_back((num_elem_x.back() + 1) * (num_elem_y.back() + 1) * (num_elem_z.back() + 1)); - - for (size_t i = 1; i < num_refinements; ++i) { - // Next refinement mesh - num_elem_x.push_back(num_elem_x.back() * refinement_factor); - num_elem_y.push_back(num_elem_y.back() * refinement_factor); - num_elem_z.push_back(num_elem_z.back() * refinement_factor); - num_nodes.push_back((num_elem_x.back() + 1) * (num_elem_y.back() + 1) * 
(num_elem_z.back() + 1)); - } +TEST_F(CompadreApproximationFunctionPerformanceTest, DISABLED_BenchmarkFunctionComputationsScaling) { +#ifndef NDEBUG + // Skip this test in debug mode + GTEST_SKIP_("Test only runs in release mode."); +#endif - // Print the run settings before running - std::cout << "Initial mesh size: " << initial_num_elem_x << "x" << initial_num_elem_y << "x" << initial_num_elem_z << std::endl; - std::cout << "Refinement factor: " << refinement_factor << std::endl; - std::cout << "Number of refinements: " << num_refinements << std::endl; - std::cout << "--------------------------------------------------------------------------------" << std::endl; - std::cout << std::setw(20) << "Mesh size" << std::setw(20) << "Number of nodes" << std::setw(20) << std::endl; - for (size_t i = 0; i < num_refinements; ++i) { - std::cout << std::setw(20) << std::to_string(num_elem_x[i]) + "x" + std::to_string(num_elem_y[i]) + "x" + std::to_string(num_elem_z[i]) - << std::setw(20) << num_nodes[i] << std::endl; + int num_procs; + MPI_Comm_size(MPI_COMM_WORLD, &num_procs); + // TODO(jake): This test is not working with more than 1 process + if (num_procs > 1) { + GTEST_SKIP_("Test only runs with 1 or fewer processes."); } - std::cout << "--------------------------------------------------------------------------------" << std::endl; - - // Create the json file - std::string test_suite_name = ::testing::UnitTest::GetInstance()->current_test_info()->test_suite_name(); - std::string test_name = ::testing::UnitTest::GetInstance()->current_test_info()->name(); - std::string full_test_name = test_suite_name + "_" + test_name; - - std::string run_specs = "_" + mode + "_procs_" + std::to_string(num_procs) + (using_gpu ? "_gpu" : "_cpu"); - std::ofstream json_file("performance_gtest_" + full_test_name + run_specs + ".json"); - json_file << "[" << std::endl; - - for (size_t i = 0; i < num_refinements; ++i) { - // Set the kernel factor - m_kernel_factor = 1.1; - - // Create the next refinement - m_num_elements_x = num_elem_x[i]; - m_num_elements_y = num_elem_y[i]; - m_num_elements_z = num_elem_z[i]; - // Create the mesh and processors - double time_to_create_mesh_and_processors = CreateMeshAndProcessors(m_num_elements_x, m_num_elements_y, m_num_elements_z, ""); // Hex mesh - aperi::CoutP0() << " Runtime, Create mesh and processors: " << time_to_create_mesh_and_processors << " s" << std::endl; - - // Make sure the coordinates are not on a grid - // RandomizeCoordinates(*m_mesh_data, -0.05, 0.05); - - // Test building approximation functions - std::map this_runtimes = TestBuildingApproximationFunctions(); + // Number of refinements and refinement factor + size_t num_refinements = 6; + size_t refinement_factor = 2; - // Print the runtimes - for (auto &runtime : this_runtimes) { - aperi::CoutP0() << " Runtime, " << runtime.first << ": " << runtime.second << " s" << std::endl; - } - // Name of the benchmark: pair.first: num_procs processors, cpu/gpu, hostname, num_elem_x x num_elem_y x num_elem_z elements, runtime" - std::string name_end = std::to_string(num_procs) + " processors, " + (using_gpu ? 
"gpu" : "cpu") + ", " + std::to_string(num_elem_x[i]) + " x " + std::to_string(num_elem_y[i]) + " x " + std::to_string(num_elem_z[i]) + " elements, runtime"; - std::string unit = "seconds"; - int i_runtimes = 0; - for (auto &runtime : this_runtimes) { - // Output the results to a json file - if (i != 0 || i_runtimes != 0) { - json_file << " }," << std::endl; // close the previous benchmark - } - i_runtimes++; - json_file << " {" << std::endl; - json_file << R"( "name": ")" << runtime.first << ": " << name_end << R"(",)" << std::endl; - json_file << R"( "unit": ")" << unit << R"(",)" << std::endl; - json_file << R"( "value": )" << runtime.second << std::endl; - } + // Initial mesh size + size_t initial_num_elem_x = 6; + size_t initial_num_elem_y = 6; + size_t initial_num_elem_z = 18; - // Setup for the next refinement - ResetCompadreApproximationFunction(); - } + std::string mode = "release"; - // Close the json file - json_file << " }" << std::endl; // close the last benchmark - json_file << "]" << std::endl; // close the json file - json_file.close(); -} + bool using_gpu = Kokkos::DefaultExecutionSpace::concurrency() > 1; + RunStandardPerformanceTest(num_procs, using_gpu, mode, num_refinements, initial_num_elem_x, initial_num_elem_y, initial_num_elem_z, refinement_factor, "scaling"); +} \ No newline at end of file diff --git a/test/performance_tests/gtests/SolverPerformanceTest.cpp b/test/performance_tests/gtests/SolverPerformanceTest.cpp index 698ab56b..a0590512 100644 --- a/test/performance_tests/gtests/SolverPerformanceTest.cpp +++ b/test/performance_tests/gtests/SolverPerformanceTest.cpp @@ -12,7 +12,7 @@ #include "UnitTestUtils.h" // Create the Taylor impact test -YAML::Node CreateTaylorImpactYaml(double time_increment, double end_time, size_t /*num_elem_x*/ = 10, size_t /*num_elem_y*/ = 10, size_t /*num_elem_z*/ = 30) { +YAML::Node CreateTaylorImpactYaml(bool reproducing_kernel, double time_increment, double end_time, size_t /*num_elem_x*/ = 10, size_t /*num_elem_y*/ = 10, size_t /*num_elem_z*/ = 30) { // Start with the basic explicit test YAML::Node yaml_data = CreateTestYaml(); @@ -44,142 +44,246 @@ YAML::Node CreateTaylorImpactYaml(double time_increment, double end_time, size_t // Effectively turn off output yaml_data["procedures"][0]["explicit_dynamics_procedure"]["output"]["time_increment"] = end_time; + // Change the approximation space to reproducing_kernel + if (reproducing_kernel) { + // Remove the finite element formulation + yaml_data["procedures"][0]["explicit_dynamics_procedure"]["geometry"]["parts"][0]["part"]["formulation"].remove("approximation_space"); + // Add the reproducing_kernel formulation + yaml_data["procedures"][0]["explicit_dynamics_procedure"]["geometry"]["parts"][0]["part"]["formulation"]["approximation_space"]["reproducing_kernel"]["kernel_radius_scale_factor"] = 1.1; + + // Remove the integration scheme + yaml_data["procedures"][0]["explicit_dynamics_procedure"]["geometry"]["parts"][0]["part"]["formulation"].remove("integration_scheme"); + + // Add the strain smoothing integration scheme + yaml_data["procedures"][0]["explicit_dynamics_procedure"]["geometry"]["parts"][0]["part"]["formulation"]["integration_scheme"]["strain_smoothing"]["element_smoothing_cell"]["subdomains"] = 1.0; + } + return yaml_data; } -// Test a large, square cross section, taylor impact test -TEST_F(SolverTest, BenchmarkTaylorImpact) { - StopCapturingOutput(); +class SolverPerformanceTest : public SolverTest { + protected: + void SetUp() override { + // Run SolverTest::SetUp first 
+ SolverTest::SetUp(); + } - bool using_gpu = Kokkos::DefaultExecutionSpace::concurrency() > 1; + void TearDown() override { + // Run SolverTest::TearDown last + SolverTest::TearDown(); + } - // Desired runtime - double runtime = 10.0; + void RunTaylorImpactTest(bool reproducing_kernel, double runtime, size_t num_refinements, std::vector refinement_factors, size_t initial_num_elem_x, size_t initial_num_elem_y, size_t initial_num_elem_z, bool using_gpu, std::string mode, const std::vector &gold_runtimes, const std::string &json_prefix = "performance") { + // The time increment + double time_increment = 0.01; - // The time increment - double time_increment = 0.01; + // Vectors to store the number of nodes and runtimes + std::vector num_nodes; - // Number of refinements and refinement factor - size_t num_refinements = 3; - size_t refinement_factor = 2; + // Mesh size vectors + std::vector num_elem_x; + std::vector num_elem_y; + std::vector num_elem_z; - // Initial mesh size - size_t initial_num_elem_x = 7; - size_t initial_num_elem_y = 7; - size_t initial_num_elem_z = 21; + num_elem_x.push_back(initial_num_elem_x); + num_elem_y.push_back(initial_num_elem_y); + num_elem_z.push_back(initial_num_elem_z); - std::string mode = "release"; - // Golds just used for approximating the number of steps to give the desired runtime - std::vector gold_runtimes = {1.24661e-03, 9.811962e-03, 7.899994e-02}; // release, cpu -#ifndef NDEBUG - mode = "debug"; - // Allow for a longer runtime in debug mode on cpu, do less refinements - if (!using_gpu) { - runtime = 50.0; - num_refinements = 2; - gold_runtimes = {3.25e-01, 2.6}; // debug, cpu - } else { - gold_runtimes = {1.195e-03, 1.32e-03, 3.3e-03}; // debug, gpu - } -#endif - if (using_gpu && mode == "release") { - gold_runtimes = {8.8e-04, 1.0e-03, 3.0e-03}; // release, gpu - } + // Calculate the number of nodes + num_nodes.push_back((num_elem_x.back() + 1) * (num_elem_y.back() + 1) * (num_elem_z.back() + 1)); - // Vectors to store the number of nodes and runtimes - std::vector num_nodes; + // Calculate the number of steps and end times + std::vector num_steps; + std::vector end_times; - // Mesh size vectors - std::vector num_elem_x; - std::vector num_elem_y; - std::vector num_elem_z; + // Test node-steps + num_steps.push_back(runtime / gold_runtimes[0]); + end_times.push_back(time_increment * num_steps.back()); - num_elem_x.push_back(initial_num_elem_x); - num_elem_y.push_back(initial_num_elem_y); - num_elem_z.push_back(initial_num_elem_z); + for (size_t i = 1; i < num_refinements; ++i) { + // Next refinement mesh + size_t num_elem_x_next = num_elem_x.back() * refinement_factors[0]; + size_t num_elem_y_next = num_elem_y.back() * refinement_factors[1]; + size_t num_elem_z_next = num_elem_z.back() * refinement_factors[2]; + + // If the number of elements is too large (greater than 36 million), stop the refinement + if ((num_elem_x_next) * (num_elem_y_next) * (num_elem_z_next) > 36e6) { + aperi::CoutP0() << "Number of elements is too large. Stopping the refinement. 
Number of elements: " << (num_elem_x_next) * (num_elem_y_next) * (num_elem_z_next) << std::endl; + num_refinements = i; + break; + } + + num_elem_x.push_back(num_elem_x_next); + num_elem_y.push_back(num_elem_y_next); + num_elem_z.push_back(num_elem_z_next); + num_nodes.push_back((num_elem_x_next + 1) * (num_elem_y_next + 1) * (num_elem_z_next + 1)); + num_steps.push_back(runtime / gold_runtimes[i]); + end_times.push_back(time_increment * num_steps.back()); + } - // Calculate the number of nodes - num_nodes.push_back((num_elem_x.back() + 1) * (num_elem_y.back() + 1) * (num_elem_z.back() + 1)); + // Print the run settings before running + aperi::CoutP0() << "Desired runtime(s): " << runtime << std::endl; + aperi::CoutP0() << "Time increment: " << time_increment << std::endl; + aperi::CoutP0() << "Initial mesh size: " << initial_num_elem_x << "x" << initial_num_elem_y << "x" << initial_num_elem_z << std::endl; + aperi::CoutP0() << "Refinement factors: " << refinement_factors[0] << "x" << refinement_factors[1] << "x" << refinement_factors[2] << std::endl; + aperi::CoutP0() << "Number of refinements: " << num_refinements << std::endl; + aperi::CoutP0() << "--------------------------------------------------------------------------------" << std::endl; + aperi::CoutP0() << std::setw(20) << "Mesh size" << std::setw(20) << "Number of nodes" << std::setw(20) << "Number of steps" << std::setw(20) << "End time" << std::endl; + for (size_t i = 0; i < num_refinements; ++i) { + aperi::CoutP0() << std::setw(20) << std::to_string(num_elem_x[i]) + "x" + std::to_string(num_elem_y[i]) + "x" + std::to_string(num_elem_z[i]) + << std::setw(20) << num_nodes[i] + << std::setw(20) << num_steps[i] + << std::setw(20) << end_times[i] << std::endl; + } + aperi::CoutP0() << "--------------------------------------------------------------------------------" << std::endl; - // Calculate the number of steps and end times - std::vector num_steps; - std::vector end_times; + for (size_t i = 0; i < num_refinements; ++i) { + ASSERT_GT(static_cast(num_steps.back()), 3) << "Number of steps is too small. Adjust the parameters."; + } - // Test node-steps - num_steps.push_back(runtime / gold_runtimes[0]); - end_times.push_back(time_increment * num_steps.back()); + // Create a json file with the benchmark results + std::string test_suite_name = ::testing::UnitTest::GetInstance()->current_test_info()->test_suite_name(); + std::string test_name = ::testing::UnitTest::GetInstance()->current_test_info()->name(); + std::string full_test_name = test_suite_name + "_" + test_name; + + std::string run_specs = "_" + mode + "_procs_" + std::to_string(m_num_procs) + (using_gpu ? 
"_gpu" : "_cpu"); + std::ofstream json_file(json_prefix + "_gtest_" + full_test_name + run_specs + ".json"); + json_file << "[" << std::endl; + + for (size_t i = 0; i < num_refinements; ++i) { + // Create the next refinement + m_yaml_data = CreateTaylorImpactYaml(reproducing_kernel, time_increment, end_times[i], num_elem_x[i], num_elem_y[i], num_elem_z[i]); + CreateInputFile(); + CreateTestMesh(std::to_string(num_elem_x[i]) + "x" + std::to_string(num_elem_y[i]) + "x" + std::to_string(num_elem_z[i]) + "|sideset:z|tets"); + + // Run the solver + double average_increment_runtime = RunSolver(); + + // Print the results + aperi::CoutP0() << " Runtime (s) / increment: " << std::scientific << average_increment_runtime << std::endl; + + // Output the results to a json file + if (i != 0) { + json_file << " },"; // close the previous benchmark + } + json_file << " {" << std::endl; + // Name of the benchmark: Taylor Impact: m_num_procs processors, cpu/gpu, num_elem_x x num_elem_y x num_elem_z elements, runtime per increment" + std::string name = "Taylor Impact: " + std::to_string(m_num_procs) + " processors, " + (using_gpu ? "gpu" : "cpu") + ", " + std::to_string(num_elem_x[i]) + " x " + std::to_string(num_elem_y[i]) + " x " + std::to_string(num_elem_z[i]) + " elements, runtime per increment"; + json_file << R"( "name": ")" << name << R"(",)" << std::endl; + // Unit of the benchmark + std::string unit = "milliseconds"; + json_file << R"( "unit": ")" << unit << R"(",)" << std::endl; + // Value of the benchmark + double value = average_increment_runtime * 1000.0; + json_file << R"( "value": )" << value << std::endl; + + // Setup for the next refinement + ResetSolverTest(); + } - for (size_t i = 1; i < num_refinements; ++i) { - // Next refinement mesh - num_elem_x.push_back(num_elem_x.back() * refinement_factor); - num_elem_y.push_back(num_elem_y.back() * refinement_factor); - num_elem_z.push_back(num_elem_z.back() * refinement_factor); - num_nodes.push_back((num_elem_x.back() + 1) * (num_elem_y.back() + 1) * (num_elem_z.back() + 1)); - num_steps.push_back(runtime / gold_runtimes[i]); - end_times.push_back(time_increment * num_steps.back()); + json_file << " }" << std::endl; // close the last benchmark + json_file << "]" << std::endl; // close the json file + json_file.close(); } - // Print the run settings before running - aperi::CoutP0() << "Desired runtime(s): " << runtime << std::endl; - aperi::CoutP0() << "Time increment: " << time_increment << std::endl; - aperi::CoutP0() << "Initial mesh size: " << initial_num_elem_x << "x" << initial_num_elem_y << "x" << initial_num_elem_z << std::endl; - aperi::CoutP0() << "Refinement factor: " << refinement_factor << std::endl; - aperi::CoutP0() << "Number of refinements: " << num_refinements << std::endl; - aperi::CoutP0() << "--------------------------------------------------------------------------------" << std::endl; - aperi::CoutP0() << std::setw(20) << "Mesh size" << std::setw(20) << "Number of nodes" << std::setw(20) << "Number of steps" << std::setw(20) << "End time" << std::endl; - for (size_t i = 0; i < num_refinements; ++i) { - aperi::CoutP0() << std::setw(20) << std::to_string(num_elem_x[i]) + "x" + std::to_string(num_elem_y[i]) + "x" + std::to_string(num_elem_z[i]) - << std::setw(20) << num_nodes[i] - << std::setw(20) << num_steps[i] - << std::setw(20) << end_times[i] << std::endl; - } - aperi::CoutP0() << "--------------------------------------------------------------------------------" << std::endl; + void RunStandardPerformanceTest(bool 
reproducing_kernel = false) { + StopCapturingOutput(); + + bool using_gpu = Kokkos::DefaultExecutionSpace::concurrency() > 1; - for (size_t i = 0; i < num_refinements; ++i) { - ASSERT_GT(static_cast(num_steps.back()), 3) << "Number of steps is too small. Adjust the parameters."; + // Desired runtime + double runtime = 10.0; + + // Number of refinements and refinement factor + size_t num_refinements = 3; + std::vector refinement_factors = {2, 2, 2}; + + // Initial mesh size + size_t initial_num_elem_x = 7; + size_t initial_num_elem_y = 7; + size_t initial_num_elem_z = 21; + + std::string mode = "release"; + // Golds just used for approximating the number of steps to give the desired runtime + std::vector gold_runtimes = {1.24661e-03, 9.811962e-03, 7.899994e-02}; // release, cpu +#ifndef NDEBUG + mode = "debug"; + // Allow for a longer runtime in debug mode on cpu, do less refinements + if (!using_gpu) { + runtime = 50.0; + num_refinements = 2; + gold_runtimes = {3.25e-01, 2.6}; // debug, cpu + } else { + gold_runtimes = {1.195e-03, 1.32e-03, 3.3e-03}; // debug, gpu + } +#endif + if (using_gpu && mode == "release") { + gold_runtimes = {8.8e-04, 1.0e-03, 3.0e-03}; // release, gpu + } + + RunTaylorImpactTest(reproducing_kernel, runtime, num_refinements, refinement_factors, initial_num_elem_x, initial_num_elem_y, initial_num_elem_z, using_gpu, mode, gold_runtimes, "performance"); } - // Create a json file with the benchmark results - std::string test_suite_name = ::testing::UnitTest::GetInstance()->current_test_info()->test_suite_name(); - std::string test_name = ::testing::UnitTest::GetInstance()->current_test_info()->name(); - std::string full_test_name = test_suite_name + "_" + test_name; + void RunScalingTest(bool reproducing_kernel = false) { + StopCapturingOutput(); +#ifndef NDEBUG + // Skip this test in debug mode + aperture::CoutP0() << "Skipping scaling test in debug mode. If this is needed then the timing estimates should be adjusted." << std::endl; + return; +#endif + + bool using_gpu = Kokkos::DefaultExecutionSpace::concurrency() > 1; + + // Desired runtime + double runtime = 20.0; + + // Number of refinements and refinement factor + size_t num_refinements = 10; + std::vector refinement_factors = {2, 1, 1}; - std::string run_specs = "_" + mode + "_procs_" + std::to_string(m_num_procs) + (using_gpu ? "_gpu" : "_cpu"); - std::ofstream json_file("performance_gtest_" + full_test_name + run_specs + ".json"); - json_file << "[" << std::endl; + // Initial mesh size + size_t initial_num_elem_x = std::pow(2, m_num_procs - 1); + size_t initial_num_elem_y = 32; + size_t initial_num_elem_z = 256; - for (size_t i = 0; i < num_refinements; ++i) { - // Create the next refinement - m_yaml_data = CreateTaylorImpactYaml(time_increment, end_times[i], num_elem_x[i], num_elem_y[i], num_elem_z[i]); - CreateInputFile(); - CreateTestMesh(std::to_string(num_elem_x[i]) + "x" + std::to_string(num_elem_y[i]) + "x" + std::to_string(num_elem_z[i]) + "|sideset:z|tets"); + std::string mode = "release"; - // Run the solver - double average_increment_runtime = RunSolver(); + double base_time_per_element_cpu = 0.7e-06; + double base_time_per_element_gpu = 5.2e-09; - // Print the results - aperi::CoutP0() << " Runtime (s) / increment: " << std::scientific << average_increment_runtime << std::endl; + size_t starting_num_elements_per_proc = initial_num_elem_x * initial_num_elem_y * initial_num_elem_z / m_num_procs; + double base_time_per_element = using_gpu ? 
base_time_per_element_gpu : base_time_per_element_cpu; - // Output the results to a json file - if (i != 0) { - json_file << " },"; // close the previous benchmark + std::vector gold_runtimes; + for (size_t i = 0; i < num_refinements; ++i) { + size_t num_elements = starting_num_elements_per_proc * std::pow(refinement_factors[0], i) * std::pow(refinement_factors[1], i) * std::pow(refinement_factors[2], i); + gold_runtimes.push_back(base_time_per_element * num_elements); } - json_file << " {" << std::endl; - // Name of the benchmark: Taylor Impact: m_num_procs processors, cpu/gpu, num_elem_x x num_elem_y x num_elem_z elements, runtime per increment" - std::string name = "Taylor Impact: " + std::to_string(m_num_procs) + " processors, " + (using_gpu ? "gpu" : "cpu") + ", " + std::to_string(num_elem_x[i]) + " x " + std::to_string(num_elem_y[i]) + " x " + std::to_string(num_elem_z[i]) + " elements, runtime per increment"; - json_file << R"( "name": ")" << name << R"(",)" << std::endl; - // Unit of the benchmark - std::string unit = "milliseconds"; - json_file << R"( "unit": ")" << unit << R"(",)" << std::endl; - // Value of the benchmark - double value = average_increment_runtime * 1000.0; - json_file << R"( "value": )" << value << std::endl; - - // Setup for the next refinement - ResetSolverTest(); + + RunTaylorImpactTest(reproducing_kernel, runtime, num_refinements, refinement_factors, initial_num_elem_x, initial_num_elem_y, initial_num_elem_z, using_gpu, mode, gold_runtimes, "scaling"); } +}; + +// Test a large, square cross section, taylor impact test, FEM +TEST_F(SolverPerformanceTest, BenchmarkTaylorImpact) { + bool reproducing_kernel = false; + RunStandardPerformanceTest(reproducing_kernel); +} + +// Test a large, square cross section, taylor impact test, reproducing kernel +TEST_F(SolverPerformanceTest, BenchmarkTaylorImpactReproducingKernel) { + bool reproducing_kernel = true; + RunStandardPerformanceTest(reproducing_kernel); +} + +// Test a large, square cross section, taylor impact test, FEM scaling +TEST_F(SolverPerformanceTest, DISABLED_BenchmarkTaylorImpactScaling) { + bool reproducing_kernel = false; + RunScalingTest(reproducing_kernel); +} - json_file << " }" << std::endl; // close the last benchmark - json_file << "]" << std::endl; // close the json file - json_file.close(); +// Test a large, square cross section, taylor impact test, reproducing kernel scaling +TEST_F(SolverPerformanceTest, DISABLED_BenchmarkTaylorImpactReproducingKernelScaling) { + bool reproducing_kernel = true; + RunScalingTest(reproducing_kernel); } From 5b177acccf5f68e4703a58c31bf06e8da8e9c3bc Mon Sep 17 00:00:00 2001 From: Jake Koester Date: Sun, 6 Oct 2024 14:16:17 -0600 Subject: [PATCH 04/13] up integer size in IoMesh to 8 to allow bigger meshes --- include/IoMesh.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/IoMesh.h b/include/IoMesh.h index eb52b586..665405e0 100644 --- a/include/IoMesh.h +++ b/include/IoMesh.h @@ -20,7 +20,7 @@ struct IoMeshParameters { int compression_level = 0; // compression level [1..9] to use bool compression_shuffle = false; // use shuffle filter prior to compressing data: true|false bool lower_case_variable_names = true; // convert variable names to lowercase and replace spaces in names with underscore - int integer_size = 4; // use 4 or 8-byte integers for input and output + int integer_size = 8; // use 4 or 8-byte integers for input and output int initial_bucket_capacity = 0; int maximum_bucket_capacity = 0; }; From 
fbaf5f0e6c555d724fdc5a17c184c8317a915069 Mon Sep 17 00:00:00 2001 From: Jake Koester Date: Sun, 6 Oct 2024 22:10:24 +0000 Subject: [PATCH 05/13] adjust scaling test sizes. were getting too large --- test/performance_tests/gtests/SolverPerformanceTest.cpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/test/performance_tests/gtests/SolverPerformanceTest.cpp b/test/performance_tests/gtests/SolverPerformanceTest.cpp index a0590512..46dc0b9d 100644 --- a/test/performance_tests/gtests/SolverPerformanceTest.cpp +++ b/test/performance_tests/gtests/SolverPerformanceTest.cpp @@ -107,8 +107,9 @@ class SolverPerformanceTest : public SolverTest { size_t num_elem_z_next = num_elem_z.back() * refinement_factors[2]; // If the number of elements is too large (greater than 36 million), stop the refinement - if ((num_elem_x_next) * (num_elem_y_next) * (num_elem_z_next) > 36e6) { - aperi::CoutP0() << "Number of elements is too large. Stopping the refinement. Number of elements: " << (num_elem_x_next) * (num_elem_y_next) * (num_elem_z_next) << std::endl; + size_t num_elem_next = (num_elem_x_next) * (num_elem_y_next) * (num_elem_z_next) * 6; + if (num_elem_next > 36e6) { + aperi::CoutP0() << "Number of elements is too large. Stopping the refinement. Number of elements: " << num_elem_next << std::endl; num_refinements = i; break; } @@ -238,11 +239,11 @@ class SolverPerformanceTest : public SolverTest { double runtime = 20.0; // Number of refinements and refinement factor - size_t num_refinements = 10; + size_t num_refinements = using_gpu ? 13 : 10; std::vector refinement_factors = {2, 1, 1}; // Initial mesh size - size_t initial_num_elem_x = std::pow(2, m_num_procs - 1); + size_t initial_num_elem_x = 1; size_t initial_num_elem_y = 32; size_t initial_num_elem_z = 256; From 4771e068a04a641ffefff8b03ed390efdc0dac84 Mon Sep 17 00:00:00 2001 From: Jake Koester Date: Sun, 6 Oct 2024 22:10:54 +0000 Subject: [PATCH 06/13] add docker files for h100 gpu --- Dockerfile_Nvidia_H100_GPU | 194 +++++++++++++++++++++++++++++ docker-compose_nvidia_h100_gpu.yml | 29 +++++ 2 files changed, 223 insertions(+) create mode 100644 Dockerfile_Nvidia_H100_GPU create mode 100644 docker-compose_nvidia_h100_gpu.yml diff --git a/Dockerfile_Nvidia_H100_GPU b/Dockerfile_Nvidia_H100_GPU new file mode 100644 index 00000000..76e8360f --- /dev/null +++ b/Dockerfile_Nvidia_H100_GPU @@ -0,0 +1,194 @@ +# Dockerfile for building a GPU-enabled environment for the aperi-mech project +# CUDA Architecture: 90 (Hopper H100) +# This Dockerfile sets up a python and spack environment with necessary packages +# After building the image, the user can run the container and start working on the aperi-mech project. +# Building the project: +# 1. This assume the user has the aperi-mech project cloned in the same directory as the Dockerfile +# 2. Install prerequisites: +# - Docker +# - Nvidia drivers: +# sudo apt-get update +# sudo apt-get install -y ubuntu-drivers-common +# sudo ubuntu-drivers autoinstall +# nvidia-smi # Verify the driver installation, probably need to restart the system after the last command +# - Nvidia Container Toolkit: +# # Add the package repositories +# distribution=$(. 
/etc/os-release;echo $ID$VERSION_ID) +# curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add - +# curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list +# +# # Update the package lists +# sudo apt-get update +# +# # Install the NVIDIA Container Toolkit +# sudo apt-get install -y nvidia-docker2 +# +# # Restart the Docker daemon to complete the installation +# sudo systemctl restart docker +# +# # Add the user to the docker group, log out and log back in and verify that the user is in the docker group +# sudo usermod -aG docker $USER +# sudo reboot +# groups # check that the user is in the docker group +# +# 3. Build the docker image using the following command: +# docker build -t aperi-mech-gpu:latest -f Dockerfile_Nvidia_H100_GPU . 2>&1 | tee build.log +# 4. Run the docker container using the following command (uses the docker-compose.yml file in the aperi-mech project): +# docker-compose -f docker-compose_nvidia_h100_gpu.yml run --service-ports aperi-mech-gpu-development /bin/bash +# # May need to install docker-compose using the following command: +# sudo apt install docker-compose +# Note: It is important that the drivers are the same in the image and the host system. If the there are problems with the drivers, the container will not be able to access the GPU. +# It is quicker to reinstall the drivers on the host system than to rebuild the image. Try the following commands: +# sudo apt-get update +# sudo apt-get upgrade +# sudo reboot +# sudo apt-get install linux-headers-$(uname -r) +# sudo apt-get purge nvidia* +# sudo apt-get install nvidia-driver-XXX # (XXX is the version number, make sure the full version number is the same as the one in the image, checked via nvidia-smi) +# sudo reboot +# # Reinstall the Nvidia Container Toolkit per the instructions above +# 5. 
+# 5. Start working on the aperi-mech project:
+#    - Configure the project:
+#        ./do_configure --gpu
+#    - Build the project:
+#        cd build/Release_gpu
+#        make -j 4
+#    - Run the project unit tests:
+#        make run_all_unit_tests
+#    - Run the project regression tests:
+#        TODO(jake) implement: make run_all_regression_tests
+#    - Run the project performance tests:
+#        TODO(jake) implement: make run_all_performance_tests
+
+# Base image with CUDA support
+FROM nvidia/cuda:12.6.1-cudnn-runtime-ubuntu22.04
+
+# Avoid prompts from apt
+ENV DEBIAN_FRONTEND=noninteractive
+
+# Set CUDA architecture; 90 is for the Hopper H100
+ENV CUDA_ARCH=90
+
+#################### System Packages from apt ####################
+# Install necessary packages
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    build-essential \
+    ca-certificates \
+    coreutils \
+    curl \
+    environment-modules \
+    file \
+    gfortran \
+    git \
+    git-lfs \
+    gpg \
+    lcov \
+    libcurl4-openssl-dev \
+    libgl1 \
+    libglu1-mesa \
+    libssl-dev \
+    lsb-release \
+    openssl \
+    python3 \
+    python3-distutils \
+    python3-venv \
+    python3-pip \
+    sudo \
+    unzip \
+    vim \
+    xorg \
+    zip \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install NVIDIA utilities (nvidia-smi)
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    nvidia-utils-550
+
+#################### User Setup ####################
+# Create a non-root user
+RUN useradd -m aperi-mech_docker
+
+# Switch back to root user
+USER root
+
+# Configure passwordless sudo for the user
+RUN echo "aperi-mech_docker ALL=(ALL) NOPASSWD:ALL" >> /etc/sudoers
+
+# Change to the new user
+USER aperi-mech_docker
+
+# Make the working directory
+WORKDIR /home/aperi-mech_docker
+
+# Set the HOME environment variable
+ENV HOME=/home/aperi-mech_docker
+
+#################### Python Packages ####################
+# Install python packages using pip
+RUN pip3 install --no-input --no-cache-dir \
+    pytest==8.3.2 \
+    testbook==0.4.2 \
+    jupyter==1.0.0 \
+    jupyterlab==4.2.4 \
+    numpy==2.0.1 \
+    scipy==1.14.0 \
+    matplotlib==3.9.1 \
+    ipykernel==6.29.5 \
+    meshio==5.3.5 \
+    gmsh==4.13.1 \
+    netCDF4==1.7.1.post1 \
+    && rm -rf ~/.cache/pip
+
+# Add environment to Jupyter
+RUN python3 -m ipykernel install --user --name aperi-mech --display-name "aperi-mech"
+
+# Add .local/bin to the PATH
+ENV PATH="${HOME}/.local/bin:$PATH"
+
+#################### Spack Installation and Setup ####################
+# Clone Spack repo
+RUN git clone -c feature.manyFiles=true https://github.com/spack/spack.git ${HOME}/spack
+
+# Set up Spack environment
+ENV SPACK_ROOT=${HOME}/spack
+ENV PATH="$SPACK_ROOT/bin:$PATH"
+RUN . $SPACK_ROOT/share/spack/setup-env.sh
+
+# Find compilers and externals for Spack
+RUN spack compiler find && \
+    spack external find
+
+# Create a new Spack environment, aperi-mech
+RUN spack env create aperi-mech
+
+# Add packages to the Spack environment, aperi-mech
+RUN . $SPACK_ROOT/share/spack/setup-env.sh && \
+    spack -e aperi-mech add compadre@master ~tests && \
+    spack -e aperi-mech add kokkos-kernels@4.3.01 +cuda ~shared cuda_arch=${CUDA_ARCH} && \
+    spack -e aperi-mech add kokkos@4.3.01 +cuda +cuda_lambda +cuda_relocatable_device_code ~cuda_uvm ~shared +wrapper cxxstd=17 cuda_arch=${CUDA_ARCH} && \
+    spack -e aperi-mech add trilinos@16.0.0 ~amesos ~amesos2 ~anasazi ~aztec ~belos ~epetra ~epetraext ~ifpack ~ifpack2 ~ml ~muelu ~sacado ~shared +cuda +cuda_rdc +exodus +gtest +hdf5 +stk +zoltan +zoltan2 cxxstd=17 cuda_arch=${CUDA_ARCH} && \
+    spack -e aperi-mech add googletest@1.14.0 && \
+    spack -e aperi-mech add yaml-cpp@0.7.0 && \
+    spack -e aperi-mech add eigen@3.4.0
+
+# Install Packages, aperi-mech
+RUN . $SPACK_ROOT/share/spack/setup-env.sh && \
+    spack -e aperi-mech install --fresh
+
+# Create a new Spack environment for seacas. We want seacas without MPI; the MPI variant causes conflicts with trilinos
+RUN spack env create seacas
+
+# Add packages to the Spack environment, seacas
+RUN . $SPACK_ROOT/share/spack/setup-env.sh && \
+    spack -e seacas add openmpi && \
+    spack -e seacas add seacas ~mpi ~tests ~x11
+
+# Install Packages, seacas
+RUN . $SPACK_ROOT/share/spack/setup-env.sh && \
+    spack -e seacas install --fresh
+
+# Add the spack source command to the bashrc
+RUN echo "source ${SPACK_ROOT}/share/spack/setup-env.sh" >> ${HOME}/.bashrc
+
+# HEALTHCHECK to verify Spack and Python availability
+HEALTHCHECK --interval=1m --timeout=10s --start-period=5s --retries=3 CMD /bin/bash -c "source ${SPACK_ROOT}/share/spack/setup-env.sh && python3 --version || exit 1"
\ No newline at end of file
diff --git a/docker-compose_nvidia_h100_gpu.yml b/docker-compose_nvidia_h100_gpu.yml
new file mode 100644
index 00000000..640d6708
--- /dev/null
+++ b/docker-compose_nvidia_h100_gpu.yml
@@ -0,0 +1,29 @@
+# The main purpose of this container is to run a development environment for
+# the Aperi-Mech project. It is used in CI/CD pipelines and for local development.
+# To start the container, run the following command:
+#     docker-compose -f docker-compose_nvidia_h100_gpu.yml run --service-ports aperi-mech-gpu-development /bin/bash
+# This will start the container and open a bash shell.
+
+version: "3.8"
+
+services:
+  aperi-mech-gpu-development:
+    image: aperi-mech-gpu:latest
+    network_mode: host
+    build:
+      context: .
+      dockerfile: Dockerfile_Nvidia_H100_GPU
+    volumes:
+      - .:/home/aperi-mech_docker/aperi-mech
+    working_dir: /home/aperi-mech_docker/aperi-mech
+    runtime: nvidia
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: all
+              capabilities: [gpu]
+    environment:
+      - NVIDIA_VISIBLE_DEVICES=all
+      - NVIDIA_DRIVER_CAPABILITIES=compute,utility

From ddcd09d9cbd15171a29447643ee0797d798c8642 Mon Sep 17 00:00:00 2001
From: Jake Koester
Date: Mon, 7 Oct 2024 12:16:14 +0000
Subject: [PATCH 07/13] fix incorrect namespace

---
 test/performance_tests/gtests/SolverPerformanceTest.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/performance_tests/gtests/SolverPerformanceTest.cpp b/test/performance_tests/gtests/SolverPerformanceTest.cpp
index 46dc0b9d..0927bd58 100644
--- a/test/performance_tests/gtests/SolverPerformanceTest.cpp
+++ b/test/performance_tests/gtests/SolverPerformanceTest.cpp
@@ -229,7 +229,7 @@ class SolverPerformanceTest : public SolverTest {
         StopCapturingOutput();
 #ifndef NDEBUG  // Skip this test in debug mode
-        aperture::CoutP0() << "Skipping scaling test in debug mode. If this is needed then the timing estimates should be adjusted." << std::endl;
+        aperi::CoutP0() << "Skipping scaling test in debug mode. If this is needed then the timing estimates should be adjusted." << std::endl;
         return;
 #endif

From 72ee981633fcfb258587a9d52aca9ff6ea475ef4 Mon Sep 17 00:00:00 2001
From: Jake Koester
Date: Tue, 8 Oct 2024 17:41:00 +0000
Subject: [PATCH 08/13] improve consistency in style between SolverTestFixture and Solver

---
 test/unit_tests/SolverTestFixture.h | 38 ++++++++++++++++++++++++++++-
 1 file changed, 37 insertions(+), 1 deletion(-)

diff --git a/test/unit_tests/SolverTestFixture.h b/test/unit_tests/SolverTestFixture.h
index 7a67f76b..36845525 100644
--- a/test/unit_tests/SolverTestFixture.h
+++ b/test/unit_tests/SolverTestFixture.h
@@ -1,6 +1,7 @@
 #pragma once

 #include
+#include <chrono>

 #include
 #include
@@ -13,6 +14,7 @@
 #include "InternalForceContribution.h"
 #include "IoInputFile.h"
 #include "IoMesh.h"
+#include "LogUtils.h"
 #include "Material.h"
 #include "MeshLabeler.h"
 #include "Preprocessor.h"
@@ -20,6 +22,7 @@
 #include "Solver.h"
 #include "TimeStepper.h"
 #include "UnitTestUtils.h"
+#include "YamlUtils.h"

 // Solver test fixture
 class SolverTest : public ApplicationTest {
@@ -30,6 +33,11 @@ class SolverTest : public ApplicationTest {
     }

     double RunSolver() {
+        aperi::CoutP0() << "############################################" << std::endl;
+        aperi::CoutP0() << "Starting Application" << std::endl;
+        aperi::CoutP0() << " - Reading Input File and Mesh" << std::endl;
+        auto start_mesh_read = std::chrono::high_resolution_clock::now();
+
         // TODO(jake): hard coding to 1 procedure for now. Fix this when we have multiple procedures.
         int procedure_id = 0;
         // Create an IO input file object and read the input file

         // Read the mesh
         m_io_mesh->ReadMesh(m_io_input_file->GetMeshFile(procedure_id), part_names);
+        auto end_mesh_read = std::chrono::high_resolution_clock::now();
Time: " << std::chrono::duration_cast(end_mesh_read - start_mesh_read).count() << " ms" << std::endl; + aperi::CoutP0() << " - Setting up for the Solver" << std::endl; + auto start_solver_setup = std::chrono::high_resolution_clock::now(); bool uses_generalized_fields = false; // Loop over parts, create materials, and add parts to force contributions + aperi::CoutP0() << " - Adding parts to force contributions: " << std::endl; for (const auto& part : parts) { // Create InternalForceContributionParameters + aperi::CoutP0() << " " << part["set"].as() << std::endl; aperi::InternalForceContributionParameters internal_force_contribution_parameters(part, m_io_input_file, m_io_mesh->GetMeshData()); m_internal_force_contributions.push_back(aperi::CreateInternalForceContribution(internal_force_contribution_parameters)); uses_generalized_fields = internal_force_contribution_parameters.approximation_space_parameters->UsesGeneralizedFields() || uses_generalized_fields; @@ -78,23 +92,38 @@ class SolverTest : public ApplicationTest { field_data.insert(field_data.end(), mesh_labeler_field_data.begin(), mesh_labeler_field_data.end()); // Add fields to the mesh and complete initialization + aperi::CoutP0() << " - Adding fields to the mesh and completing initialization" << std::endl; + auto start_complete_initialization = std::chrono::high_resolution_clock::now(); m_io_mesh->AddFields(field_data); m_io_mesh->CompleteInitialization(); + auto end_complete_initialization = std::chrono::high_resolution_clock::now(); + aperi::CoutP0() << " Finished adding fields to the mesh and completing initialization. Time: " << std::chrono::duration_cast(end_complete_initialization - start_complete_initialization).count() << " ms" << std::endl; // Label the mesh + aperi::CoutP0() << " - Labeling the mesh" << std::endl; + auto start_labeling = std::chrono::high_resolution_clock::now(); for (const auto& part : parts) { aperi::MeshLabelerParameters mesh_labeler_parameters(part, m_io_mesh->GetMeshData()); mesh_labeler->LabelPart(mesh_labeler_parameters); } + auto end_labeling = std::chrono::high_resolution_clock::now(); + aperi::CoutP0() << " Finished labeling the mesh. Time: " << std::chrono::duration_cast(end_labeling - start_labeling).count() << " ms" << std::endl; // Create the field results file + aperi::CoutP0() << " - Creating the field results file" << std::endl; + auto start_field_results_file = std::chrono::high_resolution_clock::now(); m_io_mesh->CreateFieldResultsFile(m_io_input_file->GetOutputFile(procedure_id), field_data); + auto end_field_results_file = std::chrono::high_resolution_clock::now(); + aperi::CoutP0() << " Finished creating the field results file. 
Time: " << std::chrono::duration_cast(end_field_results_file - start_field_results_file).count() << " ms" << std::endl; // Get loads std::vector loads = m_io_input_file->GetLoads(procedure_id); // Loop over loads and add them to force contributions + aperi::CoutP0() << " - Adding loads to force contributions: " << std::endl; for (auto load : loads) { + auto name = load.begin()->first.as(); + aperi::CoutP0() << " " << name << std::endl; m_external_force_contributions.push_back(aperi::CreateExternalForceContribution(load, m_io_mesh->GetMeshData())); } @@ -106,9 +135,10 @@ class SolverTest : public ApplicationTest { std::vector boundary_conditions = m_io_input_file->GetBoundaryConditions(procedure_id); // Loop over boundary conditions and add them to the vector of boundary conditions + aperi::CoutP0() << " - Adding boundary conditions: " << std::endl; for (auto boundary_condition : boundary_conditions) { std::string name = boundary_condition.begin()->first.as(); - std::cout << "Adding boundary condition " << name << " to boundary conditions" << std::endl; + aperi::CoutP0() << " " << name << std::endl; m_boundary_conditions.push_back(aperi::CreateBoundaryCondition(boundary_condition, m_io_mesh->GetMeshData())); } @@ -123,9 +153,15 @@ class SolverTest : public ApplicationTest { // Create solver m_solver = aperi::CreateSolver(m_io_mesh, m_internal_force_contributions, m_external_force_contributions, m_boundary_conditions, time_stepper, output_scheduler); + auto end_solver_setup = std::chrono::high_resolution_clock::now(); + aperi::CoutP0() << " Finished Setting up for the Solver. Time: " << std::chrono::duration_cast(end_solver_setup - start_solver_setup).count() << " ms" << std::endl; // Run solver + aperi::CoutP0() << " - Starting Solver" << std::endl; + auto start_solver = std::chrono::high_resolution_clock::now(); double time = m_solver->Solve(); + auto end_solver = std::chrono::high_resolution_clock::now(); + aperi::CoutP0() << " Finished Solver. Time: " << std::chrono::duration_cast(end_solver - start_solver).count() << " ms" << std::endl; // Finalize the IO mesh MPI_Barrier(m_comm); From 116d83e7b5cf6ef7299a4a47a2fcf50711340ab6 Mon Sep 17 00:00:00 2001 From: Jake Koester Date: Tue, 8 Oct 2024 17:41:33 +0000 Subject: [PATCH 09/13] Fix strange compiler-related issue with a functor --- include/ElementReproducingKernel.h | 33 +++++++++++++++--------------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/include/ElementReproducingKernel.h b/include/ElementReproducingKernel.h index b6ec3337..1fb9693e 100644 --- a/include/ElementReproducingKernel.h +++ b/include/ElementReproducingKernel.h @@ -7,6 +7,7 @@ #include #include +#include "Constants.h" #include "ComputeInternalForceFunctors.h" #include "ElementBase.h" #include "ElementProcessor.h" @@ -65,30 +66,28 @@ class ElementReproducingKernel : public ElementBase { search_processor.PrintNumNeighborsStats(); } + // TODO(jake): Get rid of this wrapper class. It is only here because of some strange compiling issues that lead to a segfault. + // Using a wrapper class seems to fix the issue. + // Using ShapeFunctionsFunctorReproducingKernel directly in the compute_and_store_function_values function causes a segfault on the GPU in Release mode, + // but works fine in Debug mode or on the CPU. Spent a lot of time trying to figure out why, but couldn't find the issue. 
From 567bd7171dac50d94493d63b2c0e812fd24299ba Mon Sep 17 00:00:00 2001
From: Jake Koester
Date: Tue, 8 Oct 2024 18:05:44 +0000
Subject: [PATCH 10/13] fix date printing

---
 src/main.cpp | 23 ++++++++++++++++-------
 1 file changed, 16 insertions(+), 7 deletions(-)

diff --git a/src/main.cpp b/src/main.cpp
index bd837f51..177b0b23 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -1,8 +1,10 @@
 #include
 #include
+#include <chrono>
+#include <ctime>
+#include <iomanip>
 #include

-// #include
 #include "Application.h"
 #include "LogUtils.h"
@@ -65,10 +67,12 @@ int main(int argc, char* argv[]) {

     // Print header and number of processes
     PrintHeader();
+    auto start_time = std::chrono::system_clock::now();
+    std::time_t start_time_t = std::chrono::system_clock::to_time_t(start_time);
+    std::tm start_tm = *std::localtime(&start_time_t);
+
     aperi::CoutP0() << "Running on " << size << " processes." << std::endl;
-    aperi::CoutP0() << "Date: " << __DATE__ << std::endl;
-    aperi::CoutP0() << "Time: " << __TIME__ << std::endl
-                    << std::endl;
+    aperi::CoutP0() << "Started at: " << std::put_time(&start_tm, "%Y-%m-%d %H:%M:%S") << std::endl;

     // Check if input filename is provided as a command-line argument
     if (argc < 2) {
@@ -83,10 +87,15 @@ int main(int argc, char* argv[]) {
     // Run the application
     RunApplication(input_filename, p_comm);

+    // Print footer
+    auto end_time = std::chrono::system_clock::now();
+    std::time_t end_time_t = std::chrono::system_clock::to_time_t(end_time);
+    std::tm end_tm = *std::localtime(&end_time_t);
+    std::chrono::duration<double> total_time = end_time - start_time;
+
     aperi::CoutP0() << "aperi-mech finished successfully!" << std::endl;
-    aperi::CoutP0() << "Date: " << __DATE__ << std::endl;
-    aperi::CoutP0() << "Time: " << __TIME__ << std::endl
-                    << std::endl;
+    aperi::CoutP0() << "Finished at: " << std::put_time(&end_tm, "%Y-%m-%d %H:%M:%S") << std::endl;
+    aperi::CoutP0() << "Total time: " << std::scientific << std::setprecision(2) << total_time.count() << " seconds" << std::endl;
     aperi::CoutP0() << "############################################" << std::endl;

     // Finalize Kokkos and MPI
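The underlying bug: __DATE__ and __TIME__ expand at compile time, so the old header and footer reported when the binary was built, not when the run started or finished. A minimal standalone contrast of the two:

```cpp
#include <chrono>
#include <ctime>
#include <iomanip>
#include <iostream>

int main() {
    // Compile-time stamp: fixed when the translation unit was compiled.
    std::cout << "Built:   " << __DATE__ << " " << __TIME__ << std::endl;

    // Run-time stamp: evaluated on every execution, as the patch now does.
    std::time_t now = std::chrono::system_clock::to_time_t(std::chrono::system_clock::now());
    std::tm tm = *std::localtime(&now);
    std::cout << "Started: " << std::put_time(&tm, "%Y-%m-%d %H:%M:%S") << std::endl;
}
```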
From 1878ccf04da509ffb53aa6159eda166e811c34d6 Mon Sep 17 00:00:00 2001
From: Jake Koester
Date: Tue, 8 Oct 2024 18:26:21 +0000
Subject: [PATCH 11/13] fix json performance writing so it will work in parallel

---
 .../gtests/SolverPerformanceTest.cpp | 42 +++++++++++--------
 1 file changed, 24 insertions(+), 18 deletions(-)

diff --git a/test/performance_tests/gtests/SolverPerformanceTest.cpp b/test/performance_tests/gtests/SolverPerformanceTest.cpp
index 0927bd58..6faeec5a 100644
--- a/test/performance_tests/gtests/SolverPerformanceTest.cpp
+++ b/test/performance_tests/gtests/SolverPerformanceTest.cpp
@@ -66,6 +66,7 @@ class SolverPerformanceTest : public SolverTest {
     void SetUp() override {
         // Run SolverTest::SetUp first
         SolverTest::SetUp();
+        MPI_Comm_rank(MPI_COMM_WORLD, &m_rank);
     }

     void TearDown() override {
@@ -143,13 +144,7 @@ class SolverPerformanceTest : public SolverTest {
         }

         // Create a json file with the benchmark results
-        std::string test_suite_name = ::testing::UnitTest::GetInstance()->current_test_info()->test_suite_name();
-        std::string test_name = ::testing::UnitTest::GetInstance()->current_test_info()->name();
-        std::string full_test_name = test_suite_name + "_" + test_name;
-
-        std::string run_specs = "_" + mode + "_procs_" + std::to_string(m_num_procs) + (using_gpu ? "_gpu" : "_cpu");
-        std::ofstream json_file(json_prefix + "_gtest_" + full_test_name + run_specs + ".json");
-        json_file << "[" << std::endl;
+        std::string json_string = "[\n";

         for (size_t i = 0; i < num_refinements; ++i) {
             // Create the next refinement
@@ -163,28 +158,37 @@ class SolverPerformanceTest : public SolverTest {
             // Print the results
             aperi::CoutP0() << "   Runtime (s) / increment: " << std::scientific << average_increment_runtime << std::endl;

-            // Output the results to a json file
-            if (i != 0) {
-                json_file << "    },";  // close the previous benchmark
-            }
-            json_file << "    {" << std::endl;
             // Name of the benchmark: Taylor Impact: m_num_procs processors, cpu/gpu, num_elem_x x num_elem_y x num_elem_z elements, runtime per increment
             std::string name = "Taylor Impact: " + std::to_string(m_num_procs) + " processors, " + (using_gpu ? "gpu" : "cpu") + ", " + std::to_string(num_elem_x[i]) + " x " + std::to_string(num_elem_y[i]) + " x " + std::to_string(num_elem_z[i]) + " elements, runtime per increment";
-            json_file << R"(        "name": ")" << name << R"(",)" << std::endl;

             // Unit of the benchmark
             std::string unit = "milliseconds";
-            json_file << R"(        "unit": ")" << unit << R"(",)" << std::endl;

             // Value of the benchmark
             double value = average_increment_runtime * 1000.0;
-            json_file << R"(        "value": )" << value << std::endl;
+
+            // Output the results to a json file
+            if (i != 0) {
+                json_string += "    },";
+            }
+            json_string += "    {\n";
+            json_string += R"(        "name": ")" + name + R"(",)" + "\n";
+            json_string += R"(        "unit": ")" + unit + R"(",)" + "\n";
+            json_string += R"(        "value": )" + std::to_string(value) + "\n";

             // Setup for the next refinement
             ResetSolverTest();
         }
-        json_file << "    }" << std::endl;  // close the last benchmark
-        json_file << "]" << std::endl;      // close the json file
-        json_file.close();
+        json_string += "    }\n]";
+        if (m_rank == 0) {
+            std::string test_suite_name = ::testing::UnitTest::GetInstance()->current_test_info()->test_suite_name();
+            std::string test_name = ::testing::UnitTest::GetInstance()->current_test_info()->name();
+            std::string full_test_name = test_suite_name + "_" + test_name;
+            std::string run_specs = "_" + mode + "_procs_" + std::to_string(m_num_procs) + (using_gpu ? "_gpu" : "_cpu");

+            std::ofstream json_file(json_prefix + "_gtest_" + full_test_name + run_specs + ".json");
+            json_file << json_string;
+            json_file.close();
+        }
     }

     void RunStandardPerformanceTest(bool reproducing_kernel = false) {
@@ -263,6 +267,8 @@ class SolverPerformanceTest : public SolverTest {
         RunTaylorImpactTest(reproducing_kernel, runtime, num_refinements, refinement_factors, initial_num_elem_x, initial_num_elem_y, initial_num_elem_z, using_gpu, mode, gold_runtimes, "scaling");
     }
+
+    int m_rank;
 };

 // Test a large, square cross section, taylor impact test, FEM
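The essence of the fix is to assemble the report on all ranks but perform the file I/O on rank 0 only, so multiple MPI processes never write the same file concurrently. A minimal sketch of that idiom (names illustrative):

```cpp
#include <mpi.h>

#include <fstream>
#include <string>

// Write a string to a file from rank 0 only. Every rank may build the string,
// but exactly one process touches the filesystem.
void WriteOnRankZero(const std::string& contents, const std::string& filename) {
    int rank = 0;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    if (rank == 0) {
        std::ofstream file(filename);
        file << contents;
    }
}
```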
"gpu" : "cpu") + ", " + std::to_string(num_elem_x[i]) + " x " + std::to_string(num_elem_y[i]) + " x " + std::to_string(num_elem_z[i]) + " elements, runtime per increment"; - json_file << R"( "name": ")" << name << R"(",)" << std::endl; // Unit of the benchmark std::string unit = "milliseconds"; - json_file << R"( "unit": ")" << unit << R"(",)" << std::endl; // Value of the benchmark double value = average_increment_runtime * 1000.0; - json_file << R"( "value": )" << value << std::endl; + + // Output the results to a json file + if (i != 0) { + json_string += " },"; + } + json_string += " {\n"; + json_string += R"( "name": ")" + name + R"(",)" + "\n"; + json_string += R"( "unit": ")" + unit + R"(",)" + "\n"; + json_string += R"( "value": )" + std::to_string(value) + "\n"; // Setup for the next refinement ResetSolverTest(); } - json_file << " }" << std::endl; // close the last benchmark - json_file << "]" << std::endl; // close the json file - json_file.close(); + json_string += " }\n]"; + if (m_rank == 0) { + std::string test_suite_name = ::testing::UnitTest::GetInstance()->current_test_info()->test_suite_name(); + std::string test_name = ::testing::UnitTest::GetInstance()->current_test_info()->name(); + std::string full_test_name = test_suite_name + "_" + test_name; + std::string run_specs = "_" + mode + "_procs_" + std::to_string(m_num_procs) + (using_gpu ? "_gpu" : "_cpu"); + + std::ofstream json_file(json_prefix + "_gtest_" + full_test_name + run_specs + ".json"); + json_file << json_string; + json_file.close(); + } } void RunStandardPerformanceTest(bool reproducing_kernel = false) { @@ -263,6 +267,8 @@ class SolverPerformanceTest : public SolverTest { RunTaylorImpactTest(reproducing_kernel, runtime, num_refinements, refinement_factors, initial_num_elem_x, initial_num_elem_y, initial_num_elem_z, using_gpu, mode, gold_runtimes, "scaling"); } + + int m_rank; }; // Test a large, square cross section, taylor impact test, FEM From b976e17fa91c2c9ee219d2a29aabd832c5f6eb6e Mon Sep 17 00:00:00 2001 From: Jake Koester Date: Tue, 8 Oct 2024 13:47:41 -0600 Subject: [PATCH 12/13] bump up memory golds due to switch to 8 bit integers --- test/regression_tests/taylor_bar/fem/test.yaml | 2 +- test/regression_tests/taylor_bar/fem_strain_smoothing/test.yaml | 2 +- test/regression_tests/taylor_bar/rkpm/test.yaml | 2 +- test/regression_tests/taylor_bar/rkpm_hex/test.yaml | 2 +- test/regression_tests/taylor_bar/rkpm_one_pass/test.yaml | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/test/regression_tests/taylor_bar/fem/test.yaml b/test/regression_tests/taylor_bar/fem/test.yaml index a065bb97..b1eb42bd 100644 --- a/test/regression_tests/taylor_bar/fem/test.yaml +++ b/test/regression_tests/taylor_bar/fem/test.yaml @@ -15,7 +15,7 @@ tests: - <<: *defaults hardware: cpu peak_memory_check: - value: 46.0 # Gold peak memory usage in MB + value: 56.0 # Gold peak memory usage in MB percent_tolerance: 10 - <<: *defaults hardware: gpu diff --git a/test/regression_tests/taylor_bar/fem_strain_smoothing/test.yaml b/test/regression_tests/taylor_bar/fem_strain_smoothing/test.yaml index ec654f12..8bc5849a 100644 --- a/test/regression_tests/taylor_bar/fem_strain_smoothing/test.yaml +++ b/test/regression_tests/taylor_bar/fem_strain_smoothing/test.yaml @@ -15,7 +15,7 @@ tests: - <<: *defaults hardware: cpu peak_memory_check: - value: 46.0 # Gold peak memory usage in MB + value: 56.0 # Gold peak memory usage in MB percent_tolerance: 10 - <<: *defaults hardware: gpu diff --git 
diff --git a/test/regression_tests/taylor_bar/rkpm/test.yaml b/test/regression_tests/taylor_bar/rkpm/test.yaml
index ec654f12..8bc5849a 100644
--- a/test/regression_tests/taylor_bar/rkpm/test.yaml
+++ b/test/regression_tests/taylor_bar/rkpm/test.yaml
@@ -15,7 +15,7 @@ tests:
   - <<: *defaults
     hardware: cpu
     peak_memory_check:
-      value: 46.0  # Gold peak memory usage in MB
+      value: 56.0  # Gold peak memory usage in MB
       percent_tolerance: 10
   - <<: *defaults
     hardware: gpu
diff --git a/test/regression_tests/taylor_bar/rkpm_hex/test.yaml b/test/regression_tests/taylor_bar/rkpm_hex/test.yaml
index ec654f12..8bc5849a 100644
--- a/test/regression_tests/taylor_bar/rkpm_hex/test.yaml
+++ b/test/regression_tests/taylor_bar/rkpm_hex/test.yaml
@@ -15,7 +15,7 @@ tests:
   - <<: *defaults
     hardware: cpu
     peak_memory_check:
-      value: 46.0  # Gold peak memory usage in MB
+      value: 56.0  # Gold peak memory usage in MB
       percent_tolerance: 10
   - <<: *defaults
     hardware: gpu
diff --git a/test/regression_tests/taylor_bar/rkpm_one_pass/test.yaml b/test/regression_tests/taylor_bar/rkpm_one_pass/test.yaml
index 9f8451e6..148fef97 100644
--- a/test/regression_tests/taylor_bar/rkpm_one_pass/test.yaml
+++ b/test/regression_tests/taylor_bar/rkpm_one_pass/test.yaml
@@ -15,7 +15,7 @@ tests:
   - <<: *defaults
     hardware: cpu
     peak_memory_check:
-      value: 46.0  # Gold peak memory usage in MB
+      value: 56.0  # Gold peak memory usage in MB
       percent_tolerance: 10
   - <<: *defaults
     hardware: gpu

From 5facbcd8ee23244959dafbf163fbdb6840fbdd31 Mon Sep 17 00:00:00 2001
From: Jake Koester
Date: Tue, 8 Oct 2024 15:27:19 -0600
Subject: [PATCH 13/13] update memory golds in protego

---
 protego-mech | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/protego-mech b/protego-mech
index e07d0f18..261e051c 160000
--- a/protego-mech
+++ b/protego-mech
@@ -1 +1 @@
-Subproject commit e07d0f18f3504089eeb5313df1e23d3e00aa3dbb
+Subproject commit 261e051cc7f0c0c98bfff4224154eb300dadcd62