Commit

Merge branch 'develop' into pr-from-fork/900
davidbeckingsale committed Sep 27, 2024
2 parents 84896ce + 17f31fd commit cd4f817
Showing 13 changed files with 120 additions and 53 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -7,3 +7,4 @@ CMakeFiles
 .devcontainer
 *spack.lock*
 .spack_env/*/.spack-env
+*.orig
4 changes: 2 additions & 2 deletions .gitlab/jobs/corona.yml
@@ -30,8 +30,8 @@
 # This job intentionally tests our umpire package.py because although this job does not
 # explicitly have the ~tools, the package.py should still disable tools from being built.
 ###
-rocmcc_5_7_0_hip_openmp_device_alloc:
+rocmcc_5_7_1_hip_openmp_device_alloc:
   variables:
-    SPEC: "~shared +fortran +openmp +rocm +device_alloc tests=basic amdgpu_target=gfx906 %rocmcc@=5.7.0 ^hip@5.7.0"
+    SPEC: "~shared +fortran +openmp +rocm +device_alloc tests=basic amdgpu_target=gfx906 %rocmcc@=5.7.1 ^hip@5.7.1"
   extends: .job_on_corona

27 changes: 9 additions & 18 deletions .gitlab/jobs/lassen.yml
@@ -17,14 +17,7 @@
 # We keep ${PROJECT_<MACHINE>_VARIANTS} and ${PROJECT_<MACHINE>_DEPS} So that
 # the comparison with the original job is easier.

-# Overriden to increase allocation
-xl_2022_08_19_gcc_8_3_1_cuda_11_2_0:
-  variables:
-    SPEC: "${PROJECT_LASSEN_VARIANTS} +cuda %xl@=16.1.1.12.gcc.8.3.1 ^cuda@11.2.0+allow-unsupported-compilers ${PROJECT_LASSEN_DEPS}"
-    MODULE_LIST: "cuda/11.2.0"
-    LASSEN_JOB_ALLOC: "1 -W 20 -q pci"
-  extends: .job_on_lassen
-
+# No overridden jobs so far

 ############
 # Extra jobs
@@ -72,12 +65,6 @@ gcc_8_3_1_dev_benchmarks:
     SPEC: "~shared +dev_benchmarks +tools build_type=Release %gcc@=8.3.1"
   extends: .job_on_lassen

-xl_2022_08_19_default_omp_target:
-  variables:
-    SPEC: "~shared +tools +openmp +openmp_target tests=basic %xl@=16.1.1.12"
-  allow_failure: true
-  extends: .job_on_lassen
-
 gcc_8_3_1_numa:
   variables:
     SPEC: "~shared +fortran +numa +tools tests=basic %gcc@=8.3.1"
@@ -96,15 +83,19 @@ gcc_8_3_1_tpls:
     SPEC: "~shared +fortran +tools tests=basic %gcc@=8.3.1"
   extends: .job_on_lassen

+gcc_11_2_1_tpls:
+  variables:
+    SPEC: "~shared +fortran +tools tests=basic %gcc@=11.2.1"
+  extends: .job_on_lassen
+
 ibm_clang_14_0_5_gcc_8_3_1_cuda_11_7_0_tpls:
   variables:
     SPEC: "~shared +fortran +cuda +tools tests=basic %clang@=14.0.5.ibm.gcc.8.3.1 ^cuda@11.7.0+allow-unsupported-compilers"
     MODULE_LIST: "cuda/11.7.0"
   extends: .job_on_lassen

-xl_2022_08_19_gcc_8_3_1_cuda_11_2_tpls:
+ibm_clang_16_0_6_gcc_11_2_1_cuda_11_8_tpls:
   variables:
-    SPEC: "~shared +fortran +cuda +tools tests=basic %xl@=16.1.1.12.gcc.8.3.1 ^cuda@11.7.0+allow-unsupported-compilers"
-    MODULE_LIST: "cuda/11.7.0"
-    LASSEN_JOB_ALLOC: "1 -W 20 -q pci"
+    SPEC: "~shared +fortran +cuda +tools tests=basic %clang@=16.0.6.ibm.cuda.11.8.0.gcc.11.2.1 ^cuda@11.8.0+allow-unsupported-compilers"
+    MODULE_LIST: "cuda/11.8.0"
   extends: .job_on_lassen
7 changes: 0 additions & 7 deletions .gitlab/jobs/poodle.yml
@@ -17,13 +17,6 @@
 # We keep ${PROJECT_<MACHINE>_VARIANTS} and ${PROJECT_<MACHINE>_DEPS} So that
 # the comparison with the original job is easier.

-# Allow failure due to compiler internal error building wrapfumpire.f
-intel_2022_1_0:
-  variables:
-    SPEC: "${PROJECT_RUBY_VARIANTS} %intel@=2022.1.0 ${PROJECT_RUBY_DEPS}"
-  extends: .job_on_poodle
-  allow_failure: true
-
 ############
 # Extra jobs
 ############
7 changes: 0 additions & 7 deletions .gitlab/jobs/ruby.yml
@@ -17,13 +17,6 @@
 # We keep ${PROJECT_<MACHINE>_VARIANTS} and ${PROJECT_<MACHINE>_DEPS} So that
 # the comparison with the original job is easier.

-# Allow failure due to compiler internal error building wrapfumpire.f
-intel_2022_1_0:
-  variables:
-    SPEC: "${PROJECT_RUBY_VARIANTS} %intel@=2022.1.0 ${PROJECT_RUBY_DEPS}"
-  extends: .job_on_ruby
-  allow_failure: true
-
 ############
 # Extra jobs
 ############
8 changes: 4 additions & 4 deletions .gitlab/jobs/tioga.yml
@@ -17,9 +17,9 @@
 # the comparison with the original job is easier.

 # We override the cce job because we can’t use +device-alloc with it
-cce_16_0_1:
+cce_18_0_0:
   variables:
-    SPEC: "~shared +fortran tests=basic %cce@=16.0.1"
+    SPEC: "~shared +fortran tests=basic %cce@=18.0.0"
   extends: .job_on_tioga

 ############
@@ -33,8 +33,8 @@ cce_16_0_1:
 # This job intentionally tests our umpire package.py because although this job does not
 # explicitly have the ~tools, the package.py should still prevent tools from being built.
 ###
-rocmcc_6_1_1_hip_openmp_device_alloc:
+rocmcc_6_2_0_hip_openmp_device_alloc:
   variables:
-    SPEC: "~shared +fortran +openmp +rocm +device_alloc tests=basic amdgpu_target=gfx90a %rocmcc@=6.1.1 ^hip@6.1.1"
+    SPEC: "~shared +fortran +openmp +rocm +device_alloc tests=basic amdgpu_target=gfx90a %rocmcc@=6.2.0 ^hip@6.2.0"
   extends: .job_on_tioga

3 changes: 2 additions & 1 deletion docs/sphinx/cookbook.rst
@@ -29,4 +29,5 @@ that have introspection disabled for improved performance, and applying CUDA
    cookbook/mixed_pool.rst
    cookbook/thread_safe.rst
    cookbook/file_allocation.rst
-   cookbook/strategy_name.rst
+   cookbook/shared_memory.rst
+   cookbook/strategy_name.rst
79 changes: 79 additions & 0 deletions docs/sphinx/cookbook/shared_memory.rst
@@ -0,0 +1,79 @@
.. _shared_memory:

=======================
Using IPC Shared Memory
=======================

Umpire supports Inter-Process Communication (IPC) Shared Memory on the HOST memory resource. IPC Shared Memory is a mechanism
that lets multiple processes communicate and synchronize their actions by reading and writing a common region of memory.

To use Umpire's IPC Shared Memory allocators, the ``UMPIRE_ENABLE_IPC_SHARED_MEMORY`` flag
should be set to ``On``. Note that you can use IPC Shared Memory with MPI enabled or disabled.

First, to get started with the shared memory allocator, set up the traits. For example:

.. code-block:: cpp

   auto traits{umpire::get_default_resource_traits("SHARED")};

The ``traits`` object is a struct of properties for your shared allocator. You can
set the maximum size of the allocator with ``traits.size``, and you can set its scope.

For example, you can set the scope to socket:

.. code-block:: cpp

   traits.scope = umpire::MemoryResourceTraits::shared_scope::socket;

By default, however, the scope is set to ``node``.
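
You can also cap the size of the shared block through ``traits.size``, as mentioned
above; a minimal sketch (the value here is illustrative):

.. code-block:: cpp

   traits.size = 1024 * 1024; // maximum size of the shared block, in bytes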

Next, create the shared memory allocator:

.. code-block:: cpp

   auto& rm = umpire::ResourceManager::getInstance();
   auto node_allocator{rm.makeResource("SHARED::node_allocator", traits)};

.. note::
   The name of a Shared Memory allocator MUST include "SHARED". This helps
   Umpire distinguish it as a Shared Memory allocator, and the name is also used
   for discovery by other ranks on the node.

Now you can allocate and deallocate shared memory with:

.. code-block:: cpp

   void* ptr{node_allocator.allocate("allocation_name_2", sizeof(uint64_t))};
   ...
   node_allocator.deallocate(ptr);

^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Important Notes About Shared Memory
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Because we are dealing with shared memory, the Shared Memory allocators have a few unique characteristics
that set them apart from other Umpire allocators:

1. Once you allocate shared memory, the size of that block is fixed. If you need a bigger size, you will have to create a new allocation.
2. To see how much memory is available for a shared memory allocator, use the ``getActualSize()`` function (see the sketch below).
3. The shared memory is backed by file descriptors; the corresponding files live under ``/dev/shm``.
4. Although Umpire does not need MPI to provide IPC Shared Memory, it must be built with MPI enabled if you wish to associate shared memory with MPI communicators.
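
Below is a minimal sketch that ties these points together. It reuses the calls shown
above; the allocator name and allocation name are illustrative:

.. code-block:: cpp

   auto traits{umpire::get_default_resource_traits("SHARED")};
   auto& rm = umpire::ResourceManager::getInstance();
   auto alloc{rm.makeResource("SHARED::example_allocator", traits)};

   // The block backing this allocator is fixed once created; if you need
   // more space, create a new allocator rather than growing this one.
   void* data{alloc.allocate("example_allocation", sizeof(uint64_t))};

   // How much memory does this allocator have available?
   std::cout << alloc.getActualSize() << std::endl;

   alloc.deallocate(data);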

There are a few helper functions provided in the ``Umpire.hpp`` header that will be useful when working with
Shared Memory allocators. For example, you can grab the MPI communicator for a particular Shared Memory allocator with:

.. code-block:: cpp

   MPI_Comm shared_allocator_comm = umpire::get_communicator_for_allocator(node_allocator, MPI_COMM_WORLD);

Note that the ``node_allocator`` is the Shared Memory allocator we created above.
Additionally, we can double-check that an allocator has the ``SHARED`` memory resource by asserting:

.. code-block:: cpp

   UMPIRE_ASSERT(node_allocator.getAllocationStrategy()->getTraits().resource == umpire::MemoryResourceTraits::resource_type::shared);

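For example, a short sketch of coordinating ranks through that communicator, reusing
``ptr`` from the allocation above (standard MPI calls; the rank-0 initialization
pattern is illustrative):

.. code-block:: cpp

   int shared_rank;
   MPI_Comm_rank(shared_allocator_comm, &shared_rank);

   // Let one rank initialize the shared allocation, then synchronize
   // before other ranks read it.
   if (shared_rank == 0) {
     *static_cast<uint64_t*>(ptr) = 42;
   }
   MPI_Barrier(shared_allocator_comm);
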
You can see a full example here:

.. literalinclude:: ../../../examples/cookbook/recipe_shared_memory.cpp
   :language: cpp
2 changes: 1 addition & 1 deletion docs/sphinx/getting_started.rst
@@ -78,7 +78,7 @@ you will want to access data:

    auto& rm = umpire::ResourceManager::getInstance();
    umpire::Allocator allocator = rm.getAllocator("HOST");
-   float* my_data = static_cast<float*>(allocator.allocate(100*sizeof(float));
+   float* my_data = static_cast<float*>(allocator.allocate(100*sizeof(float)));

 This code grabs the default allocator for the host memory, and uses it to
6 changes: 2 additions & 4 deletions scripts/gitlab/build_and_test.sh
@@ -57,9 +57,7 @@ fi

 if [[ -n ${module_list} ]]
 then
-    echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
-    echo "~~~~~ Modules to load: ${module_list}"
-    echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
+    timed_message "Modules to load: ${module_list}"
     module load ${module_list}
 fi

@@ -79,7 +77,7 @@ then
     prefix="${prefix}-${job_unique_id}"
 else
     # We set the prefix in the parent directory so that spack dependencies are not installed inside the source tree.
-    prefix="$(pwd)/../spack-and-build-root"
+    prefix="${project_dir}/../spack-and-build-root"
 fi

 echo "Creating directory ${prefix}"
19 changes: 15 additions & 4 deletions src/umpire/resource/MemoryResourceTypes.hpp
@@ -8,14 +8,15 @@
 #define UMPIRE_MemoryResourceTypes_HPP

 #include <cstddef>
-#include <regex>
 #include <string>

 #include "umpire/config.hpp"
 #include "umpire/util/error.hpp"

 #if defined(UMPIRE_ENABLE_CUDA)
 #include <cuda_runtime_api.h>
+#else
+#include <regex>
 #endif /* UMPIRE_ENABLE_CUDA */

 #if defined(UMPIRE_ENABLE_HIP)
@@ -106,14 +107,23 @@ inline MemoryResourceType string_to_resource(const std::string& resource)

 inline int resource_to_device_id(const std::string& resource)
 {
+  int device_id{0};
+
+#if defined(UMPIRE_ENABLE_CUDA)
+  if (resource.find("::") != std::string::npos) {
+    device_id = std::stoi(resource.substr(resource.find("::") + 2));
+  }
+#else
   const std::regex id_regex{R"(.*::(\d+))", std::regex_constants::ECMAScript | std::regex_constants::optimize};
   std::smatch m;

-  int device_id{0};
   if (std::regex_match(resource, m, id_regex)) {
     device_id = std::stoi(m[1]);
-  } else {
-    // get the device bound to the current process
   }
+#endif
+  else {
+    // get the device bound to the current process

 #if defined(UMPIRE_ENABLE_CUDA)
     cudaGetDevice(&device_id);
 #endif /* UMPIRE_ENABLE_CUDA */
@@ -122,6 +132,7 @@ inline int resource_to_device_id(const std::string& resource)
     hipGetDevice(&device_id);
 #endif /* UMPIRE_ENABLE_HIP */
   }
+
   return device_id;
 }
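
For context, both branches of the updated ``resource_to_device_id`` extract a device id from resource names such as "DEVICE::1". A standalone sketch of the two parsing paths for comparison (hypothetical helper names, not part of the commit):

    #include <regex>
    #include <string>

    int parse_with_find(const std::string& resource)
    {
      // CUDA build path: plain string search, avoiding <regex>
      if (resource.find("::") != std::string::npos) {
        return std::stoi(resource.substr(resource.find("::") + 2));
      }
      return 0;
    }

    int parse_with_regex(const std::string& resource)
    {
      // non-CUDA build path: same result via std::regex
      const std::regex id_regex{R"(.*::(\d+))"};
      std::smatch m;
      return std::regex_match(resource, m, id_regex) ? std::stoi(m[1]) : 0;
    }

    // parse_with_find("DEVICE::1") == parse_with_regex("DEVICE::1") == 1
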
8 changes: 4 additions & 4 deletions src/umpire/strategy/QuickPool.cpp
@@ -147,7 +147,7 @@ void QuickPool::deallocate(void* ptr, std::size_t UMPIRE_UNUSED_ARG(size))

   if (chunk->prev && chunk->prev->free == true) {
     auto prev = chunk->prev;
-    UMPIRE_LOG(Debug, "Removing chunk" << prev << " from size map");
+    UMPIRE_LOG(Debug, "Removing chunk " << prev << " from size map");

     m_size_map.erase(prev->size_map_it);

@@ -157,7 +157,7 @@ void QuickPool::deallocate(void* ptr, std::size_t UMPIRE_UNUSED_ARG(size))
     if (prev->next)
       prev->next->prev = prev;

-    UMPIRE_LOG(Debug, "Merging with prev" << prev << " and " << chunk);
+    UMPIRE_LOG(Debug, "Merging with prev " << prev << " and " << chunk);
     UMPIRE_LOG(Debug, "New size: " << prev->size);

     m_chunk_pool.deallocate(chunk);
@@ -171,10 +171,10 @@ void QuickPool::deallocate(void* ptr, std::size_t UMPIRE_UNUSED_ARG(size))
     if (chunk->next)
       chunk->next->prev = chunk;

-    UMPIRE_LOG(Debug, "Merging with next" << chunk << " and " << next);
+    UMPIRE_LOG(Debug, "Merging with next " << chunk << " and " << next);
     UMPIRE_LOG(Debug, "New size: " << chunk->size);

-    UMPIRE_LOG(Debug, "Removing chunk" << next << " from size map");
+    UMPIRE_LOG(Debug, "Removing chunk " << next << " from size map");
     m_size_map.erase(next->size_map_it);

     m_chunk_pool.deallocate(next);
