Commit

Merge branch 'develop' into pr-from-fork/900
davidbeckingsale committed Sep 27, 2024
2 parents 84896ce + 17f31fd commit cd4f817
Showing 13 changed files with 120 additions and 53 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -7,3 +7,4 @@ CMakeFiles
 .devcontainer
 *spack.lock*
 .spack_env/*/.spack-env
+*.orig
4 changes: 2 additions & 2 deletions .gitlab/jobs/corona.yml
@@ -30,8 +30,8 @@
 # This job intentionally tests our umpire package.py because although this job does not
 # explicitly have the ~tools, the package.py should still disable tools from being built.
 ###
-rocmcc_5_7_0_hip_openmp_device_alloc:
+rocmcc_5_7_1_hip_openmp_device_alloc:
   variables:
-    SPEC: "~shared +fortran +openmp +rocm +device_alloc tests=basic amdgpu_target=gfx906 %rocmcc@=5.7.0 ^hip@5.7.0"
+    SPEC: "~shared +fortran +openmp +rocm +device_alloc tests=basic amdgpu_target=gfx906 %rocmcc@=5.7.1 ^hip@5.7.1"
   extends: .job_on_corona

27 changes: 9 additions & 18 deletions .gitlab/jobs/lassen.yml
@@ -17,14 +17,7 @@
 # We keep ${PROJECT_<MACHINE>_VARIANTS} and ${PROJECT_<MACHINE>_DEPS} So that
 # the comparison with the original job is easier.

-# Overriden to increase allocation
-xl_2022_08_19_gcc_8_3_1_cuda_11_2_0:
-  variables:
-    SPEC: "${PROJECT_LASSEN_VARIANTS} +cuda %xl@=16.1.1.12.gcc.8.3.1 ^cuda@11.2.0+allow-unsupported-compilers ${PROJECT_LASSEN_DEPS}"
-    MODULE_LIST: "cuda/11.2.0"
-    LASSEN_JOB_ALLOC: "1 -W 20 -q pci"
-  extends: .job_on_lassen
-
+# No overridden jobs so far

 ############
 # Extra jobs
@@ -72,12 +65,6 @@ gcc_8_3_1_dev_benchmarks:
     SPEC: "~shared +dev_benchmarks +tools build_type=Release %gcc@=8.3.1"
   extends: .job_on_lassen

-xl_2022_08_19_default_omp_target:
-  variables:
-    SPEC: "~shared +tools +openmp +openmp_target tests=basic %xl@=16.1.1.12"
-  allow_failure: true
-  extends: .job_on_lassen
-
 gcc_8_3_1_numa:
   variables:
     SPEC: "~shared +fortran +numa +tools tests=basic %gcc@=8.3.1"
@@ -96,15 +83,19 @@ gcc_8_3_1_tpls:
     SPEC: "~shared +fortran +tools tests=basic %gcc@=8.3.1"
   extends: .job_on_lassen

+gcc_11_2_1_tpls:
+  variables:
+    SPEC: "~shared +fortran +tools tests=basic %gcc@=11.2.1"
+  extends: .job_on_lassen
+
 ibm_clang_14_0_5_gcc_8_3_1_cuda_11_7_0_tpls:
   variables:
     SPEC: "~shared +fortran +cuda +tools tests=basic %clang@=14.0.5.ibm.gcc.8.3.1 ^cuda@11.7.0+allow-unsupported-compilers"
     MODULE_LIST: "cuda/11.7.0"
   extends: .job_on_lassen

-xl_2022_08_19_gcc_8_3_1_cuda_11_2_tpls:
+ibm_clang_16_0_6_gcc_11_2_1_cuda_11_8_tpls:
   variables:
-    SPEC: "~shared +fortran +cuda +tools tests=basic %xl@=16.1.1.12.gcc.8.3.1 ^cuda@11.7.0+allow-unsupported-compilers"
-    MODULE_LIST: "cuda/11.7.0"
-    LASSEN_JOB_ALLOC: "1 -W 20 -q pci"
+    SPEC: "~shared +fortran +cuda +tools tests=basic %clang@=16.0.6.ibm.cuda.11.8.0.gcc.11.2.1 ^cuda@11.8.0+allow-unsupported-compilers"
+    MODULE_LIST: "cuda/11.8.0"
   extends: .job_on_lassen
7 changes: 0 additions & 7 deletions .gitlab/jobs/poodle.yml
@@ -17,13 +17,6 @@
 # We keep ${PROJECT_<MACHINE>_VARIANTS} and ${PROJECT_<MACHINE>_DEPS} So that
 # the comparison with the original job is easier.

-# Allow failure due to compiler internal error building wrapfumpire.f
-intel_2022_1_0:
-  variables:
-    SPEC: "${PROJECT_RUBY_VARIANTS} %intel@=2022.1.0 ${PROJECT_RUBY_DEPS}"
-  extends: .job_on_poodle
-  allow_failure: true
-
 ############
 # Extra jobs
 ############
7 changes: 0 additions & 7 deletions .gitlab/jobs/ruby.yml
@@ -17,13 +17,6 @@
 # We keep ${PROJECT_<MACHINE>_VARIANTS} and ${PROJECT_<MACHINE>_DEPS} So that
 # the comparison with the original job is easier.

-# Allow failure due to compiler internal error building wrapfumpire.f
-intel_2022_1_0:
-  variables:
-    SPEC: "${PROJECT_RUBY_VARIANTS} %intel@=2022.1.0 ${PROJECT_RUBY_DEPS}"
-  extends: .job_on_ruby
-  allow_failure: true
-
 ############
 # Extra jobs
 ############
8 changes: 4 additions & 4 deletions .gitlab/jobs/tioga.yml
@@ -17,9 +17,9 @@
 # the comparison with the original job is easier.

 # We override the cce job because we can’t use +device-alloc with it
-cce_16_0_1:
+cce_18_0_0:
   variables:
-    SPEC: "~shared +fortran tests=basic %cce@=16.0.1"
+    SPEC: "~shared +fortran tests=basic %cce@=18.0.0"
   extends: .job_on_tioga

 ############
@@ -33,8 +33,8 @@ cce_16_0_1:
 # This job intentionally tests our umpire package.py because although this job does not
 # explicitly have the ~tools, the package.py should still prevent tools from being built.
 ###
-rocmcc_6_1_1_hip_openmp_device_alloc:
+rocmcc_6_2_0_hip_openmp_device_alloc:
   variables:
-    SPEC: "~shared +fortran +openmp +rocm +device_alloc tests=basic amdgpu_target=gfx90a %rocmcc@=6.1.1 ^hip@6.1.1"
+    SPEC: "~shared +fortran +openmp +rocm +device_alloc tests=basic amdgpu_target=gfx90a %rocmcc@=6.2.0 ^hip@6.2.0"
   extends: .job_on_tioga

3 changes: 2 additions & 1 deletion docs/sphinx/cookbook.rst
@@ -29,4 +29,5 @@ that have introspection disabled for improved performance, and applying CUDA
    cookbook/mixed_pool.rst
    cookbook/thread_safe.rst
    cookbook/file_allocation.rst
-   cookbook/strategy_name.rst
+   cookbook/shared_memory.rst
+   cookbook/strategy_name.rst
79 changes: 79 additions & 0 deletions docs/sphinx/cookbook/shared_memory.rst
@@ -0,0 +1,79 @@
.. _shared_memory:

=======================
Using IPC Shared Memory
=======================

Umpire supports Inter-Process Communication (IPC) Shared Memory on the HOST memory resource. IPC Shared Memory is a mechanism
that lets multiple processes communicate and synchronize their actions by reading and writing a common region of memory.

To use Umpire's IPC Shared Memory allocators, the ``UMPIRE_ENABLE_IPC_SHARED_MEMORY`` flag
should be set to ``On``. Note that you can use IPC Shared Memory with MPI enabled or disabled.

First, to get started with the shared memory allocator, set up the traits. For example:

.. code-block:: cpp

   auto traits{umpire::get_default_resource_traits("SHARED")};

The ``traits`` object is a struct of properties for your shared allocator. You can
set the maximum size of the allocator with ``traits.size``, and you can set its scope.

For example, you can set the scope to socket:

.. code-block:: cpp

   traits.scope = umpire::MemoryResourceTraits::shared_scope::socket;

By default, however, the scope is set to ``node``.
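
You can also cap the size of the shared block through ``traits.size``, as mentioned
above; a minimal sketch (the value here is illustrative):

.. code-block:: cpp

   traits.size = 1024 * 1024; // maximum size of the shared block, in bytes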

Next, create the shared memory allocator:

.. code-block:: cpp

   auto& rm = umpire::ResourceManager::getInstance();
   auto node_allocator{rm.makeResource("SHARED::node_allocator", traits)};

.. note::
   The name of a Shared Memory allocator MUST include "SHARED". This helps
   Umpire distinguish it as a Shared Memory allocator, and the name is also used
   for discovery by other ranks on the node.

Now you can allocate and deallocate shared memory with:

.. code-block:: cpp

   void* ptr{node_allocator.allocate("allocation_name_2", sizeof(uint64_t))};
   ...
   node_allocator.deallocate(ptr);

^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Important Notes About Shared Memory
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Because we are dealing with shared memory, the Shared Memory allocators have a few unique characteristics
that set them apart from other Umpire allocators:

1. Once you allocate shared memory, the size of that block is fixed. If you need a bigger size, you will have to create a new allocation.
2. To see how much memory is available for a shared memory allocator, use the ``getActualSize()`` function (see the sketch below).
3. The shared memory is backed by file descriptors; the corresponding files live under ``/dev/shm``.
4. Although Umpire does not need MPI to provide IPC Shared Memory, it must be built with MPI enabled if you wish to associate shared memory with MPI communicators.
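
Below is a minimal sketch that ties these points together. It reuses the calls shown
above; the allocator name and allocation name are illustrative:

.. code-block:: cpp

   auto traits{umpire::get_default_resource_traits("SHARED")};
   auto& rm = umpire::ResourceManager::getInstance();
   auto alloc{rm.makeResource("SHARED::example_allocator", traits)};

   // The block backing this allocator is fixed once created; if you need
   // more space, create a new allocator rather than growing this one.
   void* data{alloc.allocate("example_allocation", sizeof(uint64_t))};

   // How much memory does this allocator have available?
   std::cout << alloc.getActualSize() << std::endl;

   alloc.deallocate(data);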

There are a few helper functions provided in the ``Umpire.hpp`` header that will be useful when working with
Shared Memory allocators. For example, you can grab the MPI communicator for a particular Shared Memory allocator with:

.. code-block:: cpp

   MPI_Comm shared_allocator_comm = umpire::get_communicator_for_allocator(node_allocator, MPI_COMM_WORLD);

Note that the ``node_allocator`` is the Shared Memory allocator we created above.
Additionally, we can double-check that an allocator has the ``SHARED`` memory resource by asserting:

.. code-block:: cpp

   UMPIRE_ASSERT(node_allocator.getAllocationStrategy()->getTraits().resource == umpire::MemoryResourceTraits::resource_type::shared);

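For example, a short sketch of coordinating ranks through that communicator, reusing
``ptr`` from the allocation above (standard MPI calls; the rank-0 initialization
pattern is illustrative):

.. code-block:: cpp

   int shared_rank;
   MPI_Comm_rank(shared_allocator_comm, &shared_rank);

   // Let one rank initialize the shared allocation, then synchronize
   // before other ranks read it.
   if (shared_rank == 0) {
     *static_cast<uint64_t*>(ptr) = 42;
   }
   MPI_Barrier(shared_allocator_comm);
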
You can see a full example here:

.. literalinclude:: ../../../examples/cookbook/recipe_shared_memory.cpp
   :language: cpp
2 changes: 1 addition & 1 deletion docs/sphinx/getting_started.rst
@@ -78,7 +78,7 @@ you will want to access data:

    auto& rm = umpire::ResourceManager::getInstance();
    umpire::Allocator allocator = rm.getAllocator("HOST");
-   float* my_data = static_cast<float*>(allocator.allocate(100*sizeof(float));
+   float* my_data = static_cast<float*>(allocator.allocate(100*sizeof(float)));

 This code grabs the default allocator for the host memory, and uses it to
6 changes: 2 additions & 4 deletions scripts/gitlab/build_and_test.sh
@@ -57,9 +57,7 @@ fi

 if [[ -n ${module_list} ]]
 then
-    echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
-    echo "~~~~~ Modules to load: ${module_list}"
-    echo "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
+    timed_message "Modules to load: ${module_list}"
     module load ${module_list}
 fi

@@ -79,7 +77,7 @@ then
     prefix="${prefix}-${job_unique_id}"
 else
     # We set the prefix in the parent directory so that spack dependencies are not installed inside the source tree.
-    prefix="$(pwd)/../spack-and-build-root"
+    prefix="${project_dir}/../spack-and-build-root"
 fi

 echo "Creating directory ${prefix}"
19 changes: 15 additions & 4 deletions src/umpire/resource/MemoryResourceTypes.hpp
@@ -8,14 +8,15 @@
 #define UMPIRE_MemoryResourceTypes_HPP

 #include <cstddef>
-#include <regex>
 #include <string>

 #include "umpire/config.hpp"
 #include "umpire/util/error.hpp"

 #if defined(UMPIRE_ENABLE_CUDA)
 #include <cuda_runtime_api.h>
+#else
+#include <regex>
 #endif /* UMPIRE_ENABLE_CUDA */

 #if defined(UMPIRE_ENABLE_HIP)
@@ -106,14 +107,23 @@ inline MemoryResourceType string_to_resource(const std::string& resource)

 inline int resource_to_device_id(const std::string& resource)
 {
+  int device_id{0};
+
+#if defined(UMPIRE_ENABLE_CUDA)
+  if (resource.find("::") != std::string::npos) {
+    device_id = std::stoi(resource.substr(resource.find("::") + 2));
+  }
+#else
   const std::regex id_regex{R"(.*::(\d+))", std::regex_constants::ECMAScript | std::regex_constants::optimize};
   std::smatch m;

-  int device_id{0};
   if (std::regex_match(resource, m, id_regex)) {
     device_id = std::stoi(m[1]);
-  } else {
-    // get the device bound to the current process
   }
+#endif
+  else {
+    // get the device bound to the current process

 #if defined(UMPIRE_ENABLE_CUDA)
     cudaGetDevice(&device_id);
 #endif /* UMPIRE_ENABLE_CUDA */
@@ -122,6 +132,7 @@ inline int resource_to_device_id(const std::string& resource)
     hipGetDevice(&device_id);
 #endif /* UMPIRE_ENABLE_HIP */
   }
+
   return device_id;
 }
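
For context, both branches of the updated ``resource_to_device_id`` extract a device id from resource names such as "DEVICE::1". A standalone sketch of the two parsing paths for comparison (hypothetical helper names, not part of the commit):

    #include <regex>
    #include <string>

    int parse_with_find(const std::string& resource)
    {
      // CUDA build path: plain string search, avoiding <regex>
      if (resource.find("::") != std::string::npos) {
        return std::stoi(resource.substr(resource.find("::") + 2));
      }
      return 0;
    }

    int parse_with_regex(const std::string& resource)
    {
      // non-CUDA build path: same result via std::regex
      const std::regex id_regex{R"(.*::(\d+))"};
      std::smatch m;
      return std::regex_match(resource, m, id_regex) ? std::stoi(m[1]) : 0;
    }

    // parse_with_find("DEVICE::1") == parse_with_regex("DEVICE::1") == 1
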
8 changes: 4 additions & 4 deletions src/umpire/strategy/QuickPool.cpp
@@ -147,7 +147,7 @@ void QuickPool::deallocate(void* ptr, std::size_t UMPIRE_UNUSED_ARG(size))

   if (chunk->prev && chunk->prev->free == true) {
     auto prev = chunk->prev;
-    UMPIRE_LOG(Debug, "Removing chunk" << prev << " from size map");
+    UMPIRE_LOG(Debug, "Removing chunk " << prev << " from size map");

     m_size_map.erase(prev->size_map_it);

@@ -157,7 +157,7 @@ void QuickPool::deallocate(void* ptr, std::size_t UMPIRE_UNUSED_ARG(size))
     if (prev->next)
       prev->next->prev = prev;

-    UMPIRE_LOG(Debug, "Merging with prev" << prev << " and " << chunk);
+    UMPIRE_LOG(Debug, "Merging with prev " << prev << " and " << chunk);
     UMPIRE_LOG(Debug, "New size: " << prev->size);

     m_chunk_pool.deallocate(chunk);
@@ -171,10 +171,10 @@ void QuickPool::deallocate(void* ptr, std::size_t UMPIRE_UNUSED_ARG(size))
     if (chunk->next)
       chunk->next->prev = chunk;

-    UMPIRE_LOG(Debug, "Merging with next" << chunk << " and " << next);
+    UMPIRE_LOG(Debug, "Merging with next " << chunk << " and " << next);
     UMPIRE_LOG(Debug, "New size: " << chunk->size);

-    UMPIRE_LOG(Debug, "Removing chunk" << next << " from size map");
+    UMPIRE_LOG(Debug, "Removing chunk " << next << " from size map");
     m_size_map.erase(next->size_map_it);

     m_chunk_pool.deallocate(next);
