From bcd2620a1bd24e8239df1d6c749aefd4122a5c13 Mon Sep 17 00:00:00 2001
From: AlexanderSinn
Date: Thu, 4 Jul 2024 18:20:15 +0200
Subject: [PATCH 1/2] add pre_register_memory option

---
 .../source/building/platforms/booster_jsc.rst |  1 +
 docs/source/run/parameters.rst                |  6 ++
 src/utils/MultiBuffer.H                       |  3 +
 src/utils/MultiBuffer.cpp                     | 57 ++++++++++++++++++-
 4 files changed, 66 insertions(+), 1 deletion(-)

diff --git a/docs/source/building/platforms/booster_jsc.rst b/docs/source/building/platforms/booster_jsc.rst
index ebe6b7f596..6e82f84168 100644
--- a/docs/source/building/platforms/booster_jsc.rst
+++ b/docs/source/building/platforms/booster_jsc.rst
@@ -26,6 +26,7 @@ Create a file ``profile.hipace`` and ``source`` it whenever you log in and want
    export GPUS_PER_NODE=4
    # optimize CUDA compilation for A100
    export AMREX_CUDA_ARCH=8.0 # 8.0 for A100, 7.0 for V100
+   export CUDAFLAGS="-arch=sm_80" # sm_80 for A100

 Install HiPACE++ (the first time, and whenever you want the latest version):

diff --git a/docs/source/run/parameters.rst b/docs/source/run/parameters.rst
index b4e32d2a4b..665a2d202a 100644
--- a/docs/source/run/parameters.rst
+++ b/docs/source/run/parameters.rst
@@ -117,6 +117,12 @@ General parameters
   ranks there is enough capacity to store every slice to avoid a deadlock, i.e.
   ``comms_buffer.max_trailing_slices * nranks > nslices``.

+* ``comms_buffer.pre_register_memory`` (`bool`) optional (default `false`)
+  On some platforms, such as the JUWELS Booster, memory passed to MPI must be registered
+  with the network card, which can take a long time. When this option is enabled, all
+  ranks perform this registration at once during initialization, instead of one after
+  another as part of the communication pipeline.
+
 * ``hipace.do_tiling`` (`bool`) optional (default `true`)
   Whether to use tiling, when running on CPU. Currently, this option only affects plasma
   operations (gather, push and deposition).

diff --git a/src/utils/MultiBuffer.H b/src/utils/MultiBuffer.H
index dfb872fdd5..d48cb614ef 100644
--- a/src/utils/MultiBuffer.H
+++ b/src/utils/MultiBuffer.H
@@ -130,6 +130,9 @@ private:
     std::array<int, comm_progress::nprogress> m_async_metadata_slice {};
     std::array<int, comm_progress::nprogress> m_async_data_slice {};

+    // send some dummy messages so MPI can pre-register the memory
+    void pre_register_memory ();
+
     // helper functions to read 2D metadata array
     std::size_t get_metadata_size ();
     std::size_t* get_metadata_location (int slice);

diff --git a/src/utils/MultiBuffer.cpp b/src/utils/MultiBuffer.cpp
index 0da539264c..e0bc4e90b5 100644
--- a/src/utils/MultiBuffer.cpp
+++ b/src/utils/MultiBuffer.cpp
@@ -133,6 +133,13 @@ void MultiBuffer::initialize (int nslices, MultiBeam& beams, MultiLaser& laser)
         }
     }

+    bool do_pre_register = false;
+    queryWithParser(pp, "pre_register_memory", do_pre_register);
+
+    if (do_pre_register) {
+        pre_register_memory();
+    }
+
     for (int p = 0; p < comm_progress::nprogress; ++p) {
         m_async_metadata_slice[p] = m_nslices - 1;
         m_async_data_slice[p] = m_nslices - 1;
@@ -161,8 +168,56 @@ void MultiBuffer::initialize (int nslices, MultiBeam& beams, MultiLaser& laser)
     }
 }

+void MultiBuffer::pre_register_memory () {
+#ifdef AMREX_USE_MPI
+    HIPACE_PROFILE("MultiBuffer::pre_register_memory()");
+    // On some platforms, such as the JUWELS Booster, memory passed to MPI must be
+    // registered with the network card, which can take a long time. In this function,
+    // all ranks perform this registration at once during initialization, instead of
+    // one after another as part of the communication pipeline.
+    void* send_buffer = nullptr;
+    void* recv_buffer = nullptr;
+    const int count = 1024;
+    MPI_Request send_request = MPI_REQUEST_NULL;
+    MPI_Request recv_request = MPI_REQUEST_NULL;
+    if (!m_buffer_on_gpu) {
+        send_buffer = amrex::The_Pinned_Arena()->alloc(count * sizeof(storage_type));
+        recv_buffer = amrex::The_Pinned_Arena()->alloc(count * sizeof(storage_type));
+    } else {
+        send_buffer = amrex::The_Device_Arena()->alloc(count * sizeof(storage_type));
+        recv_buffer = amrex::The_Device_Arena()->alloc(count * sizeof(storage_type));
+    }
+    // send and receive a dummy message,
+    // using the same MPI functions and arguments as in the real communication
+    MPI_Isend(
+        send_buffer,
+        count,
+        amrex::ParallelDescriptor::Mpi_typemap<storage_type>::type(),
+        m_rank_send_to,
+        m_tag_metadata_start + m_nslices,
+        m_comm,
+        &send_request);
+    MPI_Irecv(
+        recv_buffer,
+        count,
+        amrex::ParallelDescriptor::Mpi_typemap<storage_type>::type(),
+        m_rank_receive_from,
+        m_tag_metadata_start + m_nslices,
+        m_comm,
+        &recv_request);
+    MPI_Wait(&send_request, MPI_STATUS_IGNORE);
+    MPI_Wait(&recv_request, MPI_STATUS_IGNORE);
+    if (!m_buffer_on_gpu) {
+        amrex::The_Pinned_Arena()->free(send_buffer);
+        amrex::The_Pinned_Arena()->free(recv_buffer);
+    } else {
+        amrex::The_Device_Arena()->free(send_buffer);
+        amrex::The_Device_Arena()->free(recv_buffer);
+    }
+#endif
+}

-MultiBuffer::~MultiBuffer() {
+MultiBuffer::~MultiBuffer () {
 #ifdef AMREX_USE_MPI
     // wait for sends to complete and cancel receives
     for (int slice = m_nslices-1; slice >= 0; --slice) {

From e3a9f8aa54223d79e71d03c634bd45da756480a4 Mon Sep 17 00:00:00 2001
From: AlexanderSinn
Date: Thu, 4 Jul 2024 20:01:06 +0200
Subject: [PATCH 2/2] remove CUDAFLAGS

---
 docs/source/building/platforms/booster_jsc.rst | 1 -
 1 file changed, 1 deletion(-)

diff --git a/docs/source/building/platforms/booster_jsc.rst b/docs/source/building/platforms/booster_jsc.rst
index 6e82f84168..ebe6b7f596 100644
--- a/docs/source/building/platforms/booster_jsc.rst
+++ b/docs/source/building/platforms/booster_jsc.rst
@@ -26,7 +26,6 @@ Create a file ``profile.hipace`` and ``source`` it whenever you log in and want
    export GPUS_PER_NODE=4
    # optimize CUDA compilation for A100
    export AMREX_CUDA_ARCH=8.0 # 8.0 for A100, 7.0 for V100
-   export CUDAFLAGS="-arch=sm_80" # sm_80 for A100

 Install HiPACE++ (the first time, and whenever you want the latest version):
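
Usage note (not part of the patches above): assuming HiPACE++'s usual ``parameter = value``
input-file syntax documented in parameters.rst, and that booleans are given numerically as for
other flags such as ``hipace.do_tiling``, the new option would be enabled by a line like the
following in the simulation input file. This is an illustrative sketch only; the comment and
placement are the user's choice.

    # register MPI communication buffers with the network card once, during initialization
    comms_buffer.pre_register_memory = 1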