From bcd2620a1bd24e8239df1d6c749aefd4122a5c13 Mon Sep 17 00:00:00 2001
From: AlexanderSinn
Date: Thu, 4 Jul 2024 18:20:15 +0200
Subject: [PATCH 1/2] add pre_register_memory option

---
 .../source/building/platforms/booster_jsc.rst |  1 +
 docs/source/run/parameters.rst                |  6 ++
 src/utils/MultiBuffer.H                       |  3 +
 src/utils/MultiBuffer.cpp                     | 57 ++++++++++++++++++-
 4 files changed, 66 insertions(+), 1 deletion(-)

diff --git a/docs/source/building/platforms/booster_jsc.rst b/docs/source/building/platforms/booster_jsc.rst
index ebe6b7f596..6e82f84168 100644
--- a/docs/source/building/platforms/booster_jsc.rst
+++ b/docs/source/building/platforms/booster_jsc.rst
@@ -26,6 +26,7 @@ Create a file ``profile.hipace`` and ``source`` it whenever you log in and want
    export GPUS_PER_NODE=4
    # optimize CUDA compilation for A100
    export AMREX_CUDA_ARCH=8.0 # 8.0 for A100, 7.0 for V100
+   export CUDAFLAGS="-arch=sm_80" # sm_80 for A100

 Install HiPACE++ (the first time, and whenever you want the latest version):

diff --git a/docs/source/run/parameters.rst b/docs/source/run/parameters.rst
index b4e32d2a4b..665a2d202a 100644
--- a/docs/source/run/parameters.rst
+++ b/docs/source/run/parameters.rst
@@ -117,6 +117,12 @@ General parameters
   ranks there is enough capacity to store every slice to avoid a deadlock, i.e.
   ``comms_buffer.max_trailing_slices * nranks > nslices``.

+* ``comms_buffer.pre_register_memory`` (`bool`) optional (default `false`)
+  On some platforms, such as the JUWELS Booster, memory passed to MPI must be registered
+  with the network card, which can take a long time. When this option is enabled, all
+  ranks perform this registration at once during initialization, instead of one after
+  another as part of the communication pipeline.
+
 * ``hipace.do_tiling`` (`bool`) optional (default `true`)
   Whether to use tiling, when running on CPU. Currently, this option only affects plasma
   operations (gather, push and deposition).

diff --git a/src/utils/MultiBuffer.H b/src/utils/MultiBuffer.H
index dfb872fdd5..d48cb614ef 100644
--- a/src/utils/MultiBuffer.H
+++ b/src/utils/MultiBuffer.H
@@ -130,6 +130,9 @@ private:
     std::array<int, comm_progress::nprogress> m_async_metadata_slice {};
     std::array<int, comm_progress::nprogress> m_async_data_slice {};

+    // send some dummy messages so MPI can pre-register the memory
+    void pre_register_memory ();
+
     // helper functions to read 2D metadata array
     std::size_t get_metadata_size ();
     std::size_t* get_metadata_location (int slice);

diff --git a/src/utils/MultiBuffer.cpp b/src/utils/MultiBuffer.cpp
index 0da539264c..e0bc4e90b5 100644
--- a/src/utils/MultiBuffer.cpp
+++ b/src/utils/MultiBuffer.cpp
@@ -133,6 +133,13 @@ void MultiBuffer::initialize (int nslices, MultiBeam& beams, MultiLaser& laser)
         }
     }

+    bool do_pre_register = false;
+    queryWithParser(pp, "pre_register_memory", do_pre_register);
+
+    if (do_pre_register) {
+        pre_register_memory();
+    }
+
     for (int p = 0; p < comm_progress::nprogress; ++p) {
         m_async_metadata_slice[p] = m_nslices - 1;
         m_async_data_slice[p] = m_nslices - 1;
@@ -161,8 +168,56 @@ void MultiBuffer::initialize (int nslices, MultiBeam& beams, MultiLaser& laser)
     }
 }

+void MultiBuffer::pre_register_memory () {
+#ifdef AMREX_USE_MPI
+    HIPACE_PROFILE("MultiBuffer::pre_register_memory()");
+    // On some platforms, such as the JUWELS Booster, memory passed to MPI must be
+    // registered with the network card, which can take a long time. In this function,
+    // all ranks perform this registration at once during initialization, instead of
+    // one after another as part of the communication pipeline.
+    void* send_buffer = nullptr;
+    void* recv_buffer = nullptr;
+    const int count = 1024;
+    MPI_Request send_request = MPI_REQUEST_NULL;
+    MPI_Request recv_request = MPI_REQUEST_NULL;
+    if (!m_buffer_on_gpu) {
+        send_buffer = amrex::The_Pinned_Arena()->alloc(count * sizeof(storage_type));
+        recv_buffer = amrex::The_Pinned_Arena()->alloc(count * sizeof(storage_type));
+    } else {
+        send_buffer = amrex::The_Device_Arena()->alloc(count * sizeof(storage_type));
+        recv_buffer = amrex::The_Device_Arena()->alloc(count * sizeof(storage_type));
+    }
+    // send and receive a dummy message,
+    // using the same MPI functions and arguments as in the real communication
+    MPI_Isend(
+        send_buffer,
+        count,
+        amrex::ParallelDescriptor::Mpi_typemap<storage_type>::type(),
+        m_rank_send_to,
+        m_tag_metadata_start + m_nslices,
+        m_comm,
+        &send_request);
+    MPI_Irecv(
+        recv_buffer,
+        count,
+        amrex::ParallelDescriptor::Mpi_typemap<storage_type>::type(),
+        m_rank_receive_from,
+        m_tag_metadata_start + m_nslices,
+        m_comm,
+        &recv_request);
+    MPI_Wait(&send_request, MPI_STATUS_IGNORE);
+    MPI_Wait(&recv_request, MPI_STATUS_IGNORE);
+    if (!m_buffer_on_gpu) {
+        amrex::The_Pinned_Arena()->free(send_buffer);
+        amrex::The_Pinned_Arena()->free(recv_buffer);
+    } else {
+        amrex::The_Device_Arena()->free(send_buffer);
+        amrex::The_Device_Arena()->free(recv_buffer);
+    }
+#endif
+}

-MultiBuffer::~MultiBuffer() {
+MultiBuffer::~MultiBuffer () {
 #ifdef AMREX_USE_MPI
     // wait for sends to complete and cancel receives
     for (int slice = m_nslices-1; slice >= 0; --slice) {

From e3a9f8aa54223d79e71d03c634bd45da756480a4 Mon Sep 17 00:00:00 2001
From: AlexanderSinn
Date: Thu, 4 Jul 2024 20:01:06 +0200
Subject: [PATCH 2/2] remove CUDAFLAGS

---
 docs/source/building/platforms/booster_jsc.rst | 1 -
 1 file changed, 1 deletion(-)

diff --git a/docs/source/building/platforms/booster_jsc.rst b/docs/source/building/platforms/booster_jsc.rst
index 6e82f84168..ebe6b7f596 100644
--- a/docs/source/building/platforms/booster_jsc.rst
+++ b/docs/source/building/platforms/booster_jsc.rst
@@ -26,7 +26,6 @@ Create a file ``profile.hipace`` and ``source`` it whenever you log in and want
    export GPUS_PER_NODE=4
    # optimize CUDA compilation for A100
    export AMREX_CUDA_ARCH=8.0 # 8.0 for A100, 7.0 for V100
-   export CUDAFLAGS="-arch=sm_80" # sm_80 for A100

 Install HiPACE++ (the first time, and whenever you want the latest version):
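
Usage note (not part of the patches above): assuming HiPACE++'s usual ``parameter = value``
input-file syntax documented in parameters.rst, and that booleans are given numerically as for
other flags such as ``hipace.do_tiling``, the new option would be enabled by a line like the
following in the simulation input file. This is an illustrative sketch only; the comment and
placement are the user's choice.

    # register MPI communication buffers with the network card once, during initialization
    comms_buffer.pre_register_memory = 1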