From be83213ada60d4d8ad235accdadbd88f48d295b3 Mon Sep 17 00:00:00 2001
From: AlexanderSinn
Date: Wed, 14 Jun 2023 18:28:29 +0200
Subject: [PATCH 1/6] option to allocate mpi buffer in device memory

---
 docs/source/run/parameters.rst |  6 ++++++
 src/Hipace.H                   |  4 ++++
 src/Hipace.cpp                 | 20 +++++++++++---------
 3 files changed, 21 insertions(+), 9 deletions(-)

diff --git a/docs/source/run/parameters.rst b/docs/source/run/parameters.rst
index d642bebe53..ca470701fe 100644
--- a/docs/source/run/parameters.rst
+++ b/docs/source/run/parameters.rst
@@ -193,6 +193,12 @@ General parameters
     Currently, this option only affects plasma operations (gather, push and deposition).
     The tile size can be set with ``plasmas.sort_bin_size``.
 
+* ``hipace.m_comms_buffer_on_gpu`` (`bool`) optional (default `false`)
+    If the buffers used for MPI communication should be allocated on the GPU (device memory).
+    By default they will be allocated on the CPU (pinned memory).
+    Setting this option to true is necessary to take advatige of GPU-Enabled MPI, however for this
+    additional enviroment variables need to be set depending on the system.
+
 * ``hipace.do_beam_jz_minus_rho`` (`bool`) optional (default `0`)
     Whether the beam contribution to :math:`j_z-c\rho` is calculated and used when solving for Psi (used to caculate the transverse fields Ex-By and Ey+Bx).
     if 0, this term is assumed to be 0 (a good approximation for an ultra-relativistic beam in the z direction with small transverse momentum).
diff --git a/src/Hipace.H b/src/Hipace.H
index ed662948bd..85dc0b89a1 100644
--- a/src/Hipace.H
+++ b/src/Hipace.H
@@ -372,6 +372,10 @@ public:
     amrex::Parser m_salame_parser;
     /** Function to get the target Ez field for SALAME */
     amrex::ParserExecutor<3> m_salame_target_func;
+    /** If MPI communication buffers should be allocated in device memory */
+    bool m_comms_buffer_on_gpu = false;
+    /** Arena for MPI communications */
+    amrex::Arena* m_comms_arena = nullptr;
 
     /** \brief Check that the ghost beam particles are in the proper box, and invalidate
      * those not in the right slice.
diff --git a/src/Hipace.cpp b/src/Hipace.cpp
index 99d77bc65d..11c29ed3a6 100644
--- a/src/Hipace.cpp
+++ b/src/Hipace.cpp
@@ -165,6 +165,8 @@ Hipace::Hipace () :
     queryWithParser(pph, "background_density_SI", m_background_density_SI);
 
 #ifdef AMREX_USE_MPI
+    queryWithParser(pph, "comms_buffer_on_gpu", m_comms_buffer_on_gpu);
+    m_comms_arena = m_comms_buffer_on_gpu ? amrex::The_Device_Arena() : amrex::The_Pinned_Arena();
     queryWithParser(pph, "skip_empty_comms", m_skip_empty_comms);
     int myproc = amrex::ParallelDescriptor::MyProc();
     m_rank_z = myproc/(m_numprocs_x*m_numprocs_y);
@@ -1065,7 +1067,7 @@ Hipace::Wait (const int step, int it, bool only_ghost)
         const amrex::Long psize = sizeof(amrex::ParticleReal)*BeamParticleContainer::NAR
             + sizeof(int)*BeamParticleContainer::NAI;
         const amrex::Long buffer_size = psize*np_total;
-        auto recv_buffer = (char*)amrex::The_Pinned_Arena()->alloc(buffer_size);
+        auto recv_buffer = (char*)m_comms_arena->alloc(buffer_size);
 
         MPI_Status status;
         const int loc_pcomm_z_tag = only_ghost ? pcomm_z_tag_ghost : pcomm_z_tag;
@@ -1140,7 +1142,7 @@ Hipace::Wait (const int step, int it, bool only_ghost)
         }
 
         amrex::Gpu::streamSynchronize();
-        amrex::The_Pinned_Arena()->free(recv_buffer);
+        m_comms_arena->free(recv_buffer);
     }
 
     // Receive laser
@@ -1164,7 +1166,7 @@ Hipace::Wait (const int step, int it, bool only_ghost)
                      (m_rank_z+1)%m_numprocs_z, lcomm_z_tag, m_comm_z, &lstatus);
         } else {
             // Receive envelope in a host buffer, and copy to laser fab on device
-            auto lrecv_buffer = (amrex::Real*)amrex::The_Pinned_Arena()->alloc
+            auto lrecv_buffer = (amrex::Real*)m_comms_arena->alloc
                 (sizeof(amrex::Real)*nreals);
             MPI_Recv(lrecv_buffer, nreals,
                      amrex::ParallelDescriptor::Mpi_typemap<amrex::Real>::type(),
@@ -1178,7 +1180,7 @@ Hipace::Wait (const int step, int it, bool only_ghost)
                 laser_arr(i,j,k,n) = buf(i,j,k,n);
             });
             amrex::Gpu::streamSynchronize();
-            amrex::The_Pinned_Arena()->free(lrecv_buffer);
+            m_comms_arena->free(lrecv_buffer);
         }
     }
 #endif
@@ -1260,7 +1262,7 @@ Hipace::Notify (const int step, const int it, bool only_ghost)
         + sizeof(int)*BeamParticleContainer::NAI;
     const amrex::Long buffer_size = psize*np_total;
     char*& psend_buffer = only_ghost ? m_psend_buffer_ghost : m_psend_buffer;
-    psend_buffer = (char*)amrex::The_Pinned_Arena()->alloc(buffer_size);
+    psend_buffer = (char*)m_comms_arena->alloc(buffer_size);
 
     int offset_beam = 0;
     for (int ibeam = 0; ibeam < nbeams; ibeam++){
@@ -1354,7 +1356,7 @@ Hipace::Notify (const int step, const int it, bool only_ghost)
         amrex::Array4<amrex::Real> const& laser_arr = laser_fab.array();
         const amrex::Box& lbx = laser_fab.box(); // does not include ghost cells
         const std::size_t nreals = lbx.numPts()*laser_fab.nComp();
-        m_lsend_buffer = (amrex::Real*)amrex::The_Pinned_Arena()->alloc
+        m_lsend_buffer = (amrex::Real*)m_comms_arena->alloc
            (sizeof(amrex::Real)*nreals);
         if (m_multi_laser.is3dOnHost()) {
             amrex::Gpu::streamSynchronize();
@@ -1391,7 +1393,7 @@ Hipace::NotifyFinish (const int it, bool only_ghost, bool only_time)
         if (m_psend_buffer_ghost) {
             MPI_Status status;
             MPI_Wait(&m_psend_request_ghost, &status);
-            amrex::The_Pinned_Arena()->free(m_psend_buffer_ghost);
+            m_comms_arena->free(m_psend_buffer_ghost);
             m_psend_buffer_ghost = nullptr;
         }
     } else {
@@ -1412,13 +1414,13 @@ Hipace::NotifyFinish (const int it, bool only_ghost, bool only_time)
         if (m_psend_buffer) {
             MPI_Status status;
             MPI_Wait(&m_psend_request, &status);
-            amrex::The_Pinned_Arena()->free(m_psend_buffer);
+            m_comms_arena->free(m_psend_buffer);
             m_psend_buffer = nullptr;
         }
         if (m_lsend_buffer) {
             MPI_Status status;
             MPI_Wait(&m_lsend_request, &status);
-            amrex::The_Pinned_Arena()->free(m_lsend_buffer);
+            m_comms_arena->free(m_lsend_buffer);
             m_lsend_buffer = nullptr;
         }
     }

From 600db593662816d87efa807422940f4481a440d3 Mon Sep 17 00:00:00 2001
From: AlexanderSinn <64009254+AlexanderSinn@users.noreply.github.com>
Date: Wed, 14 Jun 2023 19:51:14 +0200
Subject: [PATCH 2/6] remove m_ for doc

---
 docs/source/run/parameters.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/run/parameters.rst b/docs/source/run/parameters.rst
index ca470701fe..9bd89fbc8c 100644
--- a/docs/source/run/parameters.rst
+++ b/docs/source/run/parameters.rst
@@ -193,7 +193,7 @@ General parameters
     Currently, this option only affects plasma operations (gather, push and deposition).
     The tile size can be set with ``plasmas.sort_bin_size``.
-* ``hipace.m_comms_buffer_on_gpu`` (`bool`) optional (default `false`)
+* ``hipace.comms_buffer_on_gpu`` (`bool`) optional (default `false`)
     If the buffers used for MPI communication should be allocated on the GPU (device memory).
     By default they will be allocated on the CPU (pinned memory).
     Setting this option to true is necessary to take advatige of GPU-Enabled MPI, however for this
     additional enviroment variables need to be set depending on the system.

From 592e219fa700247c9e0096dc4da499721328531e Mon Sep 17 00:00:00 2001
From: AlexanderSinn
Date: Thu, 15 Jun 2023 13:01:58 +0200
Subject: [PATCH 3/6] Whether

---
 docs/source/run/parameters.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/run/parameters.rst b/docs/source/run/parameters.rst
index 9bd89fbc8c..5ab4b72cbe 100644
--- a/docs/source/run/parameters.rst
+++ b/docs/source/run/parameters.rst
@@ -194,7 +194,7 @@ General parameters
     The tile size can be set with ``plasmas.sort_bin_size``.
 
 * ``hipace.comms_buffer_on_gpu`` (`bool`) optional (default `false`)
-    If the buffers used for MPI communication should be allocated on the GPU (device memory).
+    Whether the buffers used for MPI communication should be allocated on the GPU (device memory).
     By default they will be allocated on the CPU (pinned memory).
     Setting this option to true is necessary to take advatige of GPU-Enabled MPI, however for this
     additional enviroment variables need to be set depending on the system.

From bc17d7d216d0af6f28b320fc8e344bea5b26079b Mon Sep 17 00:00:00 2001
From: AlexanderSinn
Date: Thu, 15 Jun 2023 18:31:58 +0200
Subject: [PATCH 4/6] add MPI_Iprobe

---
 src/Hipace.cpp | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/src/Hipace.cpp b/src/Hipace.cpp
index 11c29ed3a6..866f495468 100644
--- a/src/Hipace.cpp
+++ b/src/Hipace.cpp
@@ -496,6 +496,14 @@ Hipace::Evolve ()
 void
 Hipace::SolveOneSlice (int islice, const int islice_local, int step)
 {
+#ifdef AMREX_USE_MPI
+    {
+        // Call a MPI function so that the MPI implementation has a chance to
+        // run tasks necessary to make progress with asynchronous communications.
+        int flag = 0;
+        MPI_Iprobe(MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &flag, MPI_STATUS_IGNORE);
+    }
+#endif
     HIPACE_PROFILE("Hipace::SolveOneSlice()");
 
     // Between this push and the corresponding pop at the end of this

From 374c4f5f97afdb6a35b235aa686ff07eca8f368c Mon Sep 17 00:00:00 2001
From: AlexanderSinn
Date: Mon, 19 Jun 2023 20:24:46 +0200
Subject: [PATCH 5/6] add suggestions

---
 docs/source/run/parameters.rst | 4 ++--
 src/Hipace.H                   | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/docs/source/run/parameters.rst b/docs/source/run/parameters.rst
index 5ab4b72cbe..725cf22087 100644
--- a/docs/source/run/parameters.rst
+++ b/docs/source/run/parameters.rst
@@ -193,10 +193,10 @@ General parameters
     Currently, this option only affects plasma operations (gather, push and deposition).
     The tile size can be set with ``plasmas.sort_bin_size``.
 
-* ``hipace.comms_buffer_on_gpu`` (`bool`) optional (default `false`)
+* ``hipace.comms_buffer_on_gpu`` (`bool`) optional (default `0`)
     Whether the buffers used for MPI communication should be allocated on the GPU (device memory).
     By default they will be allocated on the CPU (pinned memory).
-    Setting this option to true is necessary to take advatige of GPU-Enabled MPI, however for this
+    Setting this option to `1` is necessary to take advantge of GPU-Enabled MPI, however for this
     additional enviroment variables need to be set depending on the system.
 
 * ``hipace.do_beam_jz_minus_rho`` (`bool`) optional (default `0`)
diff --git a/src/Hipace.H b/src/Hipace.H
index 85dc0b89a1..2d732dae92 100644
--- a/src/Hipace.H
+++ b/src/Hipace.H
@@ -372,7 +372,7 @@ public:
     amrex::Parser m_salame_parser;
     /** Function to get the target Ez field for SALAME */
     amrex::ParserExecutor<3> m_salame_target_func;
-    /** If MPI communication buffers should be allocated in device memory */
+    /** Whether MPI communication buffers should be allocated in device memory */
     bool m_comms_buffer_on_gpu = false;
     /** Arena for MPI communications */
     amrex::Arena* m_comms_arena = nullptr;

From 88d7f200053c93b6a73fa35046e6cd11d1652f58 Mon Sep 17 00:00:00 2001
From: AlexanderSinn
Date: Mon, 19 Jun 2023 20:59:06 +0200
Subject: [PATCH 6/6] a

---
 docs/source/run/parameters.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/run/parameters.rst b/docs/source/run/parameters.rst
index 725cf22087..a5e1287f2f 100644
--- a/docs/source/run/parameters.rst
+++ b/docs/source/run/parameters.rst
@@ -196,7 +196,7 @@ General parameters
 * ``hipace.comms_buffer_on_gpu`` (`bool`) optional (default `0`)
     Whether the buffers used for MPI communication should be allocated on the GPU (device memory).
     By default they will be allocated on the CPU (pinned memory).
-    Setting this option to `1` is necessary to take advantge of GPU-Enabled MPI, however for this
+    Setting this option to `1` is necessary to take advantage of GPU-Enabled MPI, however for this
     additional enviroment variables need to be set depending on the system.
 
 * ``hipace.do_beam_jz_minus_rho`` (`bool`) optional (default `0`)
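
Usage note (not part of the patch series): the pattern these commits introduce, reading ``hipace.comms_buffer_on_gpu`` and then taking every MPI staging buffer from either the device arena or the pinned host arena, can be summarized in the minimal standalone C++ sketch below. It assumes an AMReX build with MPI enabled; the program, the fixed buffer size, and the ring exchange are illustrative placeholders rather than HiPACE++ code. Whether device-memory buffers actually work with MPI depends on a GPU-aware MPI library and on system-specific environment variables, as the documentation change above notes.

// arena_sketch.cpp: minimal sketch of selecting the arena used for MPI buffers.
// Assumptions: AMReX built with MPI; buffer size and ring exchange are placeholders.
#include <AMReX.H>
#include <AMReX_Arena.H>
#include <AMReX_Gpu.H>
#include <AMReX_ParallelDescriptor.H>
#include <mpi.h>

int main (int argc, char** argv)
{
    amrex::Initialize(argc, argv); // also initializes MPI if it is not yet initialized
    {
        // In HiPACE++ this flag comes from the input file (hipace.comms_buffer_on_gpu, default 0).
        const bool comms_buffer_on_gpu = false;

        // Same selection as in the patched Hipace::Hipace() constructor.
        amrex::Arena* comms_arena = comms_buffer_on_gpu
            ? amrex::The_Device_Arena()   // device memory, needs GPU-aware MPI
            : amrex::The_Pinned_Arena();  // page-locked host memory (default)

        const int buffer_size = 1024;     // placeholder payload size in bytes
        auto* send_buffer = (char*)comms_arena->alloc(buffer_size);
        auto* recv_buffer = (char*)comms_arena->alloc(buffer_size);

        // ... fill send_buffer (e.g. with a GPU kernel), then make sure the fill is
        // complete before MPI touches the buffer, as the patched Notify() does.
        amrex::Gpu::streamSynchronize();

        const int myproc = amrex::ParallelDescriptor::MyProc();
        const int nprocs = amrex::ParallelDescriptor::NProcs();
        // Ring exchange as a stand-in for the slice pipeline in Hipace::Notify/Wait.
        MPI_Sendrecv(send_buffer, buffer_size, MPI_CHAR, (myproc+1)%nprocs, 0,
                     recv_buffer, buffer_size, MPI_CHAR, (myproc+nprocs-1)%nprocs, 0,
                     MPI_COMM_WORLD, MPI_STATUS_IGNORE);

        comms_arena->free(recv_buffer);
        comms_arena->free(send_buffer);
    }
    amrex::Finalize(); // finalizes MPI if AMReX initialized it
    return 0;
}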
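
For reference, the MPI_Iprobe call added in [PATCH 4/6] is a common idiom for nudging the MPI implementation to make progress on outstanding asynchronous communication. A condensed sketch of just that idiom follows; the helper function name is hypothetical, while in the patch the equivalent code sits inline at the top of Hipace::SolveOneSlice(), guarded by #ifdef AMREX_USE_MPI.

#include <mpi.h>

// Hypothetical free function illustrating the idiom from [PATCH 4/6].
inline void poke_mpi_progress ()
{
    // The probe result is intentionally discarded: the point of the call is only to
    // enter the MPI library so it can advance asynchronous communication that is
    // already in flight (e.g. the MPI_Isend issued in Hipace::Notify).
    int flag = 0;
    MPI_Iprobe(MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &flag, MPI_STATUS_IGNORE);
}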