Misc logging, profiling, and fixes #38

Merged: 13 commits, Oct 8, 2024
194 changes: 194 additions & 0 deletions Dockerfile_Nvidia_H100_GPU
@@ -0,0 +1,194 @@
# Dockerfile for building a GPU-enabled environment for the aperi-mech project
# CUDA Architecture: 90 (Hopper H100)
# This Dockerfile sets up a Python and Spack environment with the necessary packages
# After building the image, the user can run the container and start working on the aperi-mech project.
# Building the project:
# 1. This assumes the user has the aperi-mech project cloned into the same directory as the Dockerfile
# 2. Install prerequisites:
# - Docker
# - Nvidia drivers:
# sudo apt-get update
# sudo apt-get install -y ubuntu-drivers-common
# sudo ubuntu-drivers autoinstall
# nvidia-smi # Verify the driver installation, probably need to restart the system after the last command
# - Nvidia Container Toolkit:
# # Add the package repositories
# distribution=$(. /etc/os-release;echo $ID$VERSION_ID)
# curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add -
# curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list
#
# # Update the package lists
# sudo apt-get update
#
# # Install the NVIDIA Container Toolkit
# sudo apt-get install -y nvidia-docker2
#
# # Restart the Docker daemon to complete the installation
# sudo systemctl restart docker
#
# # Add the user to the docker group, log out and log back in and verify that the user is in the docker group
# sudo usermod -aG docker $USER
# sudo reboot
# groups # check that the user is in the docker group
#
# 3. Build the docker image using the following command:
# docker build -t aperi-mech-gpu:latest -f Dockerfile_Nvidia_H100_GPU . 2>&1 | tee build.log
# 4. Run the docker container using the following command (uses the docker-compose_nvidia_h100_gpu.yml file in the aperi-mech project):
# docker-compose -f docker-compose_nvidia_h100_gpu.yml run --service-ports aperi-mech-gpu-development /bin/bash
# # May need to install docker-compose using the following command:
# sudo apt install docker-compose
# Note: It is important that the driver versions are the same in the image and on the host system. If there are problems with the drivers, the container will not be able to access the GPU.
# It is quicker to reinstall the drivers on the host system than to rebuild the image. Try the following commands:
# sudo apt-get update
# sudo apt-get upgrade
# sudo reboot
# sudo apt-get install linux-headers-$(uname -r)
# sudo apt-get purge nvidia*
# sudo apt-get install nvidia-driver-XXX # (XXX is the version number, make sure the full version number is the same as the one in the image, checked via nvidia-smi)
# sudo reboot
# # Reinstall the Nvidia Container Toolkit per the instructions above
# 5. Start working on the aperi-mech project
# - Configure the project
# ./do_configure --gpu
# - Build the project:
# cd build/Release_gpu
# make -j 4
# - Run the project unit tests:
# make run_all_unit_tests
# - Run the project regression tests:
# TODO(jake) implement: make run_all_regression_tests
# - Run the project performance tests:
# TODO(jake) implement: make run_all_performance_tests

# Base image with CUDA support
FROM nvidia/cuda:12.6.1-cudnn-runtime-ubuntu22.04

# Avoid prompts from apt
ENV DEBIAN_FRONTEND=noninteractive

# Set CUDA architecture, 90 is for the Hopper H100
ENV CUDA_ARCH=90

#################### System Packages from apt ####################
# Install necessary packages
RUN apt-get update && apt-get install -y --no-install-recommends \
build-essential \
ca-certificates \
coreutils \
curl \
environment-modules \
file \
gfortran \
git \
git-lfs \
gpg \
lcov \
libcurl4-openssl-dev \
libgl1 \
libglu1-mesa \
libssl-dev \
lsb-release \
openssl \
python3 \
python3-distutils \
python3-venv \
python3-pip \
sudo \
unzip \
vim \
xorg \
zip \
&& rm -rf /var/lib/apt/lists/*

# Install NVIDIA utilities (nvidia-smi)
RUN apt-get update && apt-get install -y --no-install-recommends \
nvidia-utils-550

#################### User Setup ####################
# Create a non-root user
RUN useradd -m aperi-mech_docker

# Switch back to root user
USER root

# Configure passwordless sudo for the user
RUN echo "aperi-mech_docker ALL=(ALL) NOPASSWD:ALL" >> /etc/sudoers

# Change to the new user
USER aperi-mech_docker

# Make the working directory
WORKDIR /home/aperi-mech_docker

# Set the HOME environment variable
ENV HOME=/home/aperi-mech_docker

#################### Python Packages ####################
# Install python packages using pip
RUN pip3 install --no-input --no-cache-dir \
pytest==8.3.2 \
testbook==0.4.2 \
jupyter==1.0.0 \
jupyterlab==4.2.4 \
numpy==2.0.1 \
scipy==1.14.0 \
matplotlib==3.9.1 \
ipykernel==6.29.5 \
meshio==5.3.5 \
gmsh==4.13.1 \
netCDF4==1.7.1.post1 \
&& rm -rf ~/.cache/pip

# Add environment to Jupyter
RUN python3 -m ipykernel install --user --name aperi-mech --display-name "aperi-mech"

# Add .local/bin to the PATH
ENV PATH="${HOME}/.local/bin:$PATH"

#################### Spack Installation and Setup ####################
# Clone Spack repo
RUN git clone -c feature.manyFiles=true https://github.com/spack/spack.git ${HOME}/spack

# Set up Spack environment
ENV SPACK_ROOT=${HOME}/spack
ENV PATH="$SPACK_ROOT/bin:$PATH"
RUN . $SPACK_ROOT/share/spack/setup-env.sh

# Find compilers and externals for Spack
RUN spack compiler find && \
spack external find

# Create a new Spack environment, cpu build
RUN spack env create aperi-mech

# Add packages to the Spack environment, aperi-mech
RUN . $SPACK_ROOT/share/spack/setup-env.sh && \
spack -e aperi-mech add compadre@master ~tests && \
spack -e aperi-mech add kokkos-kernels@4.3.01 +cuda ~shared cuda_arch=${CUDA_ARCH} && \
spack -e aperi-mech add kokkos@4.3.01 +cuda +cuda_lambda +cuda_relocatable_device_code ~cuda_uvm ~shared +wrapper cxxstd=17 cuda_arch=${CUDA_ARCH} && \
spack -e aperi-mech add trilinos@16.0.0 ~amesos ~amesos2 ~anasazi ~aztec ~belos ~epetra ~epetraext ~ifpack ~ifpack2 ~ml ~muelu ~sacado ~shared +cuda +cuda_rdc +exodus +gtest +hdf5 +stk +zoltan +zoltan2 cxxstd=17 cuda_arch=${CUDA_ARCH} && \
spack -e aperi-mech add googletest@1.14.0 && \
spack -e aperi-mech add yaml-cpp@0.7.0 && \
spack -e aperi-mech add eigen@3.4.0

# Install Packages, aperi-mech
RUN . $SPACK_ROOT/share/spack/setup-env.sh && \
spack -e aperi-mech install --fresh

# Create a new Spack environment for seacas. seacas is built without MPI because the MPI variant conflicts with trilinos
RUN spack env create seacas

# Add packages to the Spack environment, seacas
RUN . $SPACK_ROOT/share/spack/setup-env.sh && \
spack -e seacas add openmpi && \
spack -e seacas add seacas ~mpi ~tests ~x11

# Install Packages, seacas
RUN . $SPACK_ROOT/share/spack/setup-env.sh && \
spack -e seacas install --fresh

# Add the spack source command to the bashrc
RUN echo "source ${SPACK_ROOT}/share/spack/setup-env.sh" >> ${HOME}/.bashrc

# HEALTHCHECK to verify Spack and Python availability
HEALTHCHECK --interval=1m --timeout=10s --start-period=5s --retries=3 CMD /bin/bash -c "source ${SPACK_ROOT}/share/spack/setup-env.sh && python3 --version || exit 1"
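
A quick sanity check before building, assuming the NVIDIA drivers and Container Toolkit are installed as described in the Dockerfile header above (this command is illustrative and not part of the PR): pull the same CUDA base image used by this Dockerfile and confirm the GPU is visible through the container runtime.

    docker run --rm --gpus all nvidia/cuda:12.6.1-cudnn-runtime-ubuntu22.04 nvidia-smi

If nvidia-smi fails here with a driver or runtime error, fixing the host installation per the notes in the header is usually faster than rebuilding the image.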
29 changes: 29 additions & 0 deletions docker-compose_nvidia_h100_gpu.yml
@@ -0,0 +1,29 @@
# The main purpose of this container is to run a development environment for
# the Aperi-Mech project. It is used in CI/CD pipelines and for local development.
# To start the container, run the following command:
#   docker-compose -f docker-compose_nvidia_h100_gpu.yml run --service-ports aperi-mech-gpu-development /bin/bash
# This will start the container and open a bash shell.

version: "3.8"

services:
aperi-mech-gpu-development:
image: aperi-mech-gpu:latest
network_mode: host
build:
context: .
dockerfile: Dockerfile_Nvidia_H100_GPU
volumes:
- .:/home/aperi-mech_docker/aperi-mech
working_dir: /home/aperi-mech_docker/aperi-mech
runtime: nvidia
deploy:
resources:
reservations:
devices:
- driver: nvidia
count: all
capabilities: [gpu]
environment:
- NVIDIA_VISIBLE_DEVICES=all
- NVIDIA_DRIVER_CAPABILITIES=compute,utility
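
A hedged usage sketch for the compose file above (the service name comes from the file; running nvidia-smi as a one-off command is an assumed check, not something this PR adds): build the image and confirm the service can reach the GPU before opening a development shell.

    docker-compose -f docker-compose_nvidia_h100_gpu.yml build
    docker-compose -f docker-compose_nvidia_h100_gpu.yml run --rm aperi-mech-gpu-development nvidia-smi
    docker-compose -f docker-compose_nvidia_h100_gpu.yml run --service-ports aperi-mech-gpu-development /bin/bash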
12 changes: 9 additions & 3 deletions include/ElementProcessor.h
@@ -2,6 +2,7 @@

#include <Eigen/Dense>
#include <array>
+#include <chrono>
#include <memory>
#include <stk_mesh/base/BulkData.hpp>
#include <stk_mesh/base/Field.hpp>
@@ -565,7 +566,8 @@ class StrainSmoothingProcessor {
- The node function derivatives need to be computed.
*/

aperi::CoutP0() << "Building smoothed cell data." << std::endl;
aperi::CoutP0() << " - Building Smoothed Cell Data." << std::endl;
auto start_time = std::chrono::high_resolution_clock::now();

// Create the cells selector
std::vector<std::string> cells_sets;
@@ -799,17 +801,21 @@
}
}
average_num_nodes /= static_cast<double>(num_cells);
aperi::CoutP0() << "Average number of points defining a cell: " << average_num_nodes << std::endl;
aperi::CoutP0() << " - Average number of points defining a cell: " << average_num_nodes << std::endl;
if (one_pass_method) {
average_num_neighbors /= static_cast<double>(num_cells);
aperi::CoutP0() << "Average number of neighbors for a cell: " << average_num_neighbors << std::endl;
aperi::CoutP0() << " - Average number of neighbors for a cell: " << average_num_neighbors << std::endl;
}
bool set_start_from_lengths = false; // The start array is already set above. This can be done as we are on host and looping through sequentially.
smoothed_cell_data->CompleteAddingCellNodeIndicesOnHost(set_start_from_lengths);
smoothed_cell_data->CopyCellNodeViewsToDevice();

assert(CheckPartitionOfNullity(smoothed_cell_data));

+auto end_time = std::chrono::high_resolution_clock::now();
+auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time);
+aperi::CoutP0() << " Finished building Smoothed Cell Data. Time: " << duration.count() << " ms." << std::endl;

return smoothed_cell_data;
}

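
The profiling added in this file follows one pattern: take a std::chrono::high_resolution_clock timestamp before the work, another after it, and report the difference in milliseconds through aperi::CoutP0. A self-contained sketch of the same pattern (std::cout stands in for aperi::CoutP0, and the scoped-timer packaging is an illustration, not code from this PR):

#include <chrono>
#include <iostream>
#include <string>

// Illustrative scoped timer: records the start time on construction and prints
// the elapsed wall time in milliseconds when it goes out of scope.
class ScopedTimer {
   public:
    explicit ScopedTimer(std::string label)
        : m_label(std::move(label)), m_start(std::chrono::high_resolution_clock::now()) {}
    ~ScopedTimer() {
        auto end = std::chrono::high_resolution_clock::now();
        auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end - m_start);
        std::cout << "     " << m_label << ". Time: " << duration.count() << " ms." << std::endl;
    }

   private:
    std::string m_label;
    std::chrono::high_resolution_clock::time_point m_start;
};

int main() {
    ScopedTimer timer("Finished building Smoothed Cell Data");
    // ... the work being timed goes here ...
    return 0;
}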
38 changes: 21 additions & 17 deletions include/ElementReproducingKernel.h
@@ -1,11 +1,13 @@
#pragma once

#include <Eigen/Dense>
+#include <chrono>
#include <memory>
#include <stdexcept>
#include <string>
#include <vector>

#include "Constants.h"
#include "ComputeInternalForceFunctors.h"
#include "ElementBase.h"
#include "ElementProcessor.h"
@@ -64,28 +66,30 @@ class ElementReproducingKernel : public ElementBase {
search_processor.PrintNumNeighborsStats();
}

+// TODO(jake): Get rid of this wrapper class. It is only here because of some strange compiling issues that lead to a segfault.
+// Using a wrapper class seems to fix the issue.
+// Using ShapeFunctionsFunctorReproducingKernel directly in the compute_and_store_function_values function causes a segfault on the GPU in Release mode,
+// but works fine in Debug mode or on the CPU. Spent a lot of time trying to figure out why, but couldn't find the issue.
+template <size_t MaxNumNeighbors>
+struct FunctionFunctorWrapper {
+KOKKOS_INLINE_FUNCTION Eigen::Matrix<double, MaxNumNeighbors, 1> Values(const Eigen::Matrix<double, MaxNumNeighbors, 1> &kernel_values, const Eigen::Matrix<double, MaxNumNeighbors, 3> &shifted_neighbor_coordinates, size_t actual_num_neighbors) const {
+return compute_node_functions_functor.Values(kernel_values, shifted_neighbor_coordinates, actual_num_neighbors);
+}
+aperi::ShapeFunctionsFunctorReproducingKernel<MaxNumNeighbors> compute_node_functions_functor;
+};

void ComputeAndStoreFunctionValues() {
-// Functor for computing shape function values at nodes
-size_t compute_node_functions_functor_size = sizeof(ShapeFunctionsFunctorReproducingKernel<MAX_NODE_NUM_NEIGHBORS>);
-auto compute_node_functions_functor = (ShapeFunctionsFunctorReproducingKernel<MAX_NODE_NUM_NEIGHBORS> *)Kokkos::kokkos_malloc(compute_node_functions_functor_size);
-assert(compute_node_functions_functor != nullptr);
+aperi::CoutP0() << " - Computing and storing function values" << std::endl;
+auto start_function_values = std::chrono::high_resolution_clock::now();

-// Initialize the functor
-Kokkos::parallel_for(
-"CreateReproducingKernelFunctors", 1, KOKKOS_LAMBDA(const int &) {
-new ((ShapeFunctionsFunctorReproducingKernel<MAX_NODE_NUM_NEIGHBORS> *)compute_node_functions_functor) ShapeFunctionsFunctorReproducingKernel<MAX_NODE_NUM_NEIGHBORS>();
-});
+// Create an instance of the functor
+FunctionFunctorWrapper <MAX_NODE_NUM_NEIGHBORS> compute_node_functions_functor;

aperi::FunctionValueStorageProcessor function_value_storage_processor(m_mesh_data, m_part_names);
-function_value_storage_processor.compute_and_store_function_values<MAX_NODE_NUM_NEIGHBORS>(*compute_node_functions_functor);

-// Destroy the functor
-Kokkos::parallel_for(
-"DestroyReproducingKernelFunctors", 1, KOKKOS_LAMBDA(const int &) {
-compute_node_functions_functor->~ShapeFunctionsFunctorReproducingKernel<MAX_NODE_NUM_NEIGHBORS>();
-});
+function_value_storage_processor.compute_and_store_function_values<MAX_NODE_NUM_NEIGHBORS>(compute_node_functions_functor);

-Kokkos::kokkos_free(compute_node_functions_functor);
+auto end_function_values = std::chrono::high_resolution_clock::now();
+aperi::CoutP0() << " Finished Computing and Storing Function Values. Time: " << std::chrono::duration_cast<std::chrono::milliseconds>(end_function_values - start_function_values).count() << " ms" << std::endl;
}

/**
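
The fix in this file swaps a device-side allocation pattern (Kokkos::kokkos_malloc, placement-new inside a parallel_for, explicit destruction, kokkos_free) for a plain wrapper struct created on the host and passed by value, so Kokkos copies it into the kernel when it launches. A minimal sketch of that pattern with made-up functor names (not the aperi-mech types), assuming a working Kokkos installation:

#include <Kokkos_Core.hpp>
#include <cstdio>

// Stand-in for the inner functor: any trivially copyable struct with
// device-callable methods.
struct InnerFunctor {
    KOKKOS_INLINE_FUNCTION double Value(double x) const { return 2.0 * x; }
};

// Stand-in for the wrapper: forwards to the inner functor. Because it lives on
// the host stack and is captured by value, no device-side allocation or
// placement-new is needed.
struct FunctorWrapper {
    KOKKOS_INLINE_FUNCTION double Value(double x) const { return m_inner.Value(x); }
    InnerFunctor m_inner;
};

int main(int argc, char* argv[]) {
    Kokkos::initialize(argc, argv);
    {
        FunctorWrapper wrapper;  // host-side instance, copied to the device at launch
        double sum = 0.0;
        Kokkos::parallel_reduce(
            "use_wrapper", 8,
            KOKKOS_LAMBDA(const int i, double& local_sum) {
                local_sum += wrapper.Value(static_cast<double>(i));
            },
            sum);
        std::printf("sum = %f\n", sum);  // expect 2 * (0 + 1 + ... + 7) = 56
    }
    Kokkos::finalize();
    return 0;
}

Passing the functor by value also removes the manual lifetime management (placement-new and explicit destructor call) that the old code needed.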
2 changes: 1 addition & 1 deletion include/IoMesh.h
@@ -20,7 +20,7 @@ struct IoMeshParameters {
int compression_level = 0; // compression level [1..9] to use
bool compression_shuffle = false; // use shuffle filter prior to compressing data: true|false
bool lower_case_variable_names = true; // convert variable names to lowercase and replace spaces in names with underscore
-int integer_size = 4; // use 4 or 8-byte integers for input and output
+int integer_size = 8; // use 4 or 8-byte integers for input and output
int initial_bucket_capacity = 0;
int maximum_bucket_capacity = 0;
};