From 2e8e0808212d0cac98d8ee20cbad5608b0847973 Mon Sep 17 00:00:00 2001 From: Erica Mitchell Date: Thu, 12 Sep 2024 16:24:11 -0400 Subject: [PATCH 1/6] decrease buffer length and only form if child processes are present --- src/madness/world/worldgop.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/madness/world/worldgop.h b/src/madness/world/worldgop.h index 34817a755c5..97080693788 100644 --- a/src/madness/world/worldgop.h +++ b/src/madness/world/worldgop.h @@ -785,8 +785,12 @@ namespace madness { world_.mpi.binary_tree_info(0, parent, child0, child1); const std::size_t nelem_per_maxmsg = max_reducebcast_msg_size() / sizeof(T); - auto buf0 = std::unique_ptr(new T[nelem_per_maxmsg]); - auto buf1 = std::unique_ptr(new T[nelem_per_maxmsg]); + std::unique_ptr buf0 = (child0 != -1) + ? std::unique_ptr(new T[std::min(nelem_per_maxmsg,nelem)]) + : nullptr; + std::unique_ptr buf1 = (child1 != -1) + ? std::unique_ptr(new T[std::min(nelem_per_maxmsg,nelem)]) + : nullptr; auto reduce_impl = [&,this](T* buf, size_t nelem) { MADNESS_ASSERT(nelem <= nelem_per_maxmsg); From bfe28e369853b029b7abf4e9f086bf120e51f00e Mon Sep 17 00:00:00 2001 From: Erica Mitchell Date: Fri, 13 Sep 2024 20:14:39 -0400 Subject: [PATCH 2/6] Use std:malloc to avoid initialization --- src/madness/world/worldgop.h | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/src/madness/world/worldgop.h b/src/madness/world/worldgop.h index 97080693788..f8b86727b65 100644 --- a/src/madness/world/worldgop.h +++ b/src/madness/world/worldgop.h @@ -785,28 +785,28 @@ namespace madness { world_.mpi.binary_tree_info(0, parent, child0, child1); const std::size_t nelem_per_maxmsg = max_reducebcast_msg_size() / sizeof(T); - std::unique_ptr buf0 = (child0 != -1) - ? std::unique_ptr(new T[std::min(nelem_per_maxmsg,nelem)]) - : nullptr; - std::unique_ptr buf1 = (child1 != -1) - ? std::unique_ptr(new T[std::min(nelem_per_maxmsg,nelem)]) - : nullptr; + auto buf0 = (child0 != -1) + ? static_cast(std::malloc(sizeof(T) * std::min(nelem_per_maxmsg, nelem))) + : nullptr; + auto buf1 = (child1 != -1) + ? static_cast(std::malloc(sizeof(T) * std::min(nelem_per_maxmsg, nelem))) + : nullptr; auto reduce_impl = [&,this](T* buf, size_t nelem) { MADNESS_ASSERT(nelem <= nelem_per_maxmsg); SafeMPI::Request req0, req1; Tag gsum_tag = world_.mpi.unique_tag(); - if (child0 != -1) req0 = world_.mpi.Irecv(buf0.get(), nelem*sizeof(T), MPI_BYTE, child0, gsum_tag); - if (child1 != -1) req1 = world_.mpi.Irecv(buf1.get(), nelem*sizeof(T), MPI_BYTE, child1, gsum_tag); + if (child0 != -1) req0 = world_.mpi.Irecv(buf0, nelem*sizeof(T), MPI_BYTE, child0, gsum_tag); + if (child1 != -1) req1 = world_.mpi.Irecv(buf1, nelem*sizeof(T), MPI_BYTE, child1, gsum_tag); if (child0 != -1) { World::await(req0); - for (long i=0; i<(long)nelem; ++i) buf[i] = op(buf[i],buf0[i]); + for (long i=0; i<(long)nelem; ++i) buf[i] = op(buf[i], buf0[i]); } if (child1 != -1) { World::await(req1); - for (long i=0; i<(long)nelem; ++i) buf[i] = op(buf[i],buf1[i]); + for (long i=0; i<(long)nelem; ++i) buf[i] = op(buf[i], buf1[i]); } if (parent != -1) { @@ -823,6 +823,9 @@ namespace madness { nelem -= n; buf += n; } + + if (child0 != -1) std::free(buf0); + if (child1 != -1) std::free(buf1); } /// Inplace global sum while still processing AM & tasks From c7e2673978583334a2a9b555477642ccdc601764 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Mon, 16 Sep 2024 13:23:50 -0400 Subject: [PATCH 3/6] handle alignment requirements correctly (?) in worldgop::{reduce,concat0} --- src/madness/world/worldgop.h | 93 ++++++++++++++++++++++++++++-------- 1 file changed, 74 insertions(+), 19 deletions(-) diff --git a/src/madness/world/worldgop.h b/src/madness/world/worldgop.h index f8b86727b65..4bcffe385c7 100644 --- a/src/madness/world/worldgop.h +++ b/src/madness/world/worldgop.h @@ -781,36 +781,78 @@ namespace madness { /// Optimizations can be added for long messages and to reduce the memory footprint template void reduce(T* buf, std::size_t nelem, opT op) { + static_assert(std::is_trivially_constructible_v && + std::is_trivially_copyable_v, + "T must be trivially constructible and copyable"); ProcessID parent, child0, child1; world_.mpi.binary_tree_info(0, parent, child0, child1); - const std::size_t nelem_per_maxmsg = max_reducebcast_msg_size() / sizeof(T); - - auto buf0 = (child0 != -1) - ? static_cast(std::malloc(sizeof(T) * std::min(nelem_per_maxmsg, nelem))) - : nullptr; - auto buf1 = (child1 != -1) - ? static_cast(std::malloc(sizeof(T) * std::min(nelem_per_maxmsg, nelem))) - : nullptr; + const std::size_t nelem_per_maxmsg = + max_reducebcast_msg_size() / sizeof(T); + + const auto buf_size = ((sizeof(T) * std::min(nelem_per_maxmsg, nelem) + + std::alignment_of_v - 1) / + std::alignment_of_v) * std::alignment_of_v; + struct free_dtor { + void operator()(T *ptr) { + if (ptr != nullptr) + std::free(ptr); + }; + }; + using sptr_t = std::unique_ptr; + sptr_t buf0; + auto aligned_buf_alloc = [&]() -> T* { + // posix_memalign requires alignment to be an integer multiple of sizeof(void*)!! so ensure that + const std::size_t alignment = + ((std::alignment_of_v + sizeof(void *) - 1) / + sizeof(void *)) * + sizeof(void *); +#ifdef HAVE_POSIX_MEMALIGN + void *ptr; + if (posix_memalign(&ptr, alignment, buf_size) != 0) { + char errmsg[1024]; + strerror_r(errno, errmsg, sizeof(errmsg)); + std::cerr << "posix_memalign failed: " << errmsg << std::endl; + throw std::bad_alloc(); + } + return static_cast(ptr); +#else + return static_cast(std::aligned_alloc(alignment, buf_size)) +#endif + }; + if (child0 != -1) + buf0 = sptr_t(aligned_buf_alloc(), + free_dtor{}); + sptr_t buf1(nullptr); + if (child1 != -1) + buf1 = sptr_t(aligned_buf_alloc(), + free_dtor{}); auto reduce_impl = [&,this](T* buf, size_t nelem) { MADNESS_ASSERT(nelem <= nelem_per_maxmsg); SafeMPI::Request req0, req1; Tag gsum_tag = world_.mpi.unique_tag(); - if (child0 != -1) req0 = world_.mpi.Irecv(buf0, nelem*sizeof(T), MPI_BYTE, child0, gsum_tag); - if (child1 != -1) req1 = world_.mpi.Irecv(buf1, nelem*sizeof(T), MPI_BYTE, child1, gsum_tag); + if (child0 != -1) + req0 = world_.mpi.Irecv(buf0.get(), nelem * sizeof(T), MPI_BYTE, + child0, gsum_tag); + if (child1 != -1) + req1 = world_.mpi.Irecv(buf1.get(), nelem * sizeof(T), MPI_BYTE, + child1, gsum_tag); if (child0 != -1) { World::await(req0); - for (long i=0; i<(long)nelem; ++i) buf[i] = op(buf[i], buf0[i]); + for (long i = 0; i < (long)nelem; ++i) + buf[i] = op(buf[i], buf0[i]); } if (child1 != -1) { World::await(req1); - for (long i=0; i<(long)nelem; ++i) buf[i] = op(buf[i], buf1[i]); + for (long i = 0; i < (long)nelem; ++i) + buf[i] = op(buf[i], buf1[i]); } if (parent != -1) { - req0 = world_.mpi.Isend(buf, nelem*sizeof(T), MPI_BYTE, parent, gsum_tag); + req0 = world_.mpi.Isend(buf, nelem * sizeof(T), MPI_BYTE, parent, + gsum_tag); World::await(req0); } @@ -823,9 +865,6 @@ namespace madness { nelem -= n; buf += n; } - - if (child0 != -1) std::free(buf0); - if (child1 != -1) std::free(buf1); } /// Inplace global sum while still processing AM & tasks @@ -910,7 +949,7 @@ namespace madness { /// Concatenate an STL vector of serializable stuff onto node 0 /// \param[in] v input vector - /// \param[in] bufsz the max of the result' must be less than std::numeric_limits::max() + /// \param[in] bufsz the max number of bytes in the result; must be less than std::numeric_limits::max() /// \return on rank 0 returns the concatenated vector, elsewhere returns an empty vector template std::vector concat0(const std::vector& v, size_t bufsz=1024*1024) { @@ -920,8 +959,24 @@ namespace madness { world_.mpi.binary_tree_info(0, parent, child0, child1); int child0_nbatch = 0, child1_nbatch = 0; - auto buf0 = std::unique_ptr(new std::byte[bufsz]); - auto buf1 = std::unique_ptr(new std::byte[bufsz]); + struct free_dtor { + void operator()(std::byte *ptr) { + if (ptr != nullptr) + std::free(ptr); + }; + }; + using sptr_t = std::unique_ptr; + sptr_t buf0; + if (child0 != -1) + buf0 = sptr_t(static_cast(std::aligned_alloc( + std::alignment_of_v, bufsz)), + free_dtor{}); + sptr_t buf1; + if (child1 != -1) + buf1 = sptr_t(static_cast(std::aligned_alloc( + std::alignment_of_v, bufsz)), + free_dtor{}); + // transfer data in chunks at most this large const int batch_size = static_cast(std::min(static_cast(max_reducebcast_msg_size()),bufsz)); From 0386b7f5a7b0f30e03bd39e6adcbb80bbd91c5ac Mon Sep 17 00:00:00 2001 From: Erica Mitchell Date: Mon, 16 Sep 2024 16:36:43 -0400 Subject: [PATCH 4/6] Remove assertion and error msg, give type for ptr in concat0 --- src/madness/world/worldgop.h | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/src/madness/world/worldgop.h b/src/madness/world/worldgop.h index 4bcffe385c7..b28859b5ddd 100644 --- a/src/madness/world/worldgop.h +++ b/src/madness/world/worldgop.h @@ -781,9 +781,6 @@ namespace madness { /// Optimizations can be added for long messages and to reduce the memory footprint template void reduce(T* buf, std::size_t nelem, opT op) { - static_assert(std::is_trivially_constructible_v && - std::is_trivially_copyable_v, - "T must be trivially constructible and copyable"); ProcessID parent, child0, child1; world_.mpi.binary_tree_info(0, parent, child0, child1); const std::size_t nelem_per_maxmsg = @@ -809,9 +806,6 @@ namespace madness { #ifdef HAVE_POSIX_MEMALIGN void *ptr; if (posix_memalign(&ptr, alignment, buf_size) != 0) { - char errmsg[1024]; - strerror_r(errno, errmsg, sizeof(errmsg)); - std::cerr << "posix_memalign failed: " << errmsg << std::endl; throw std::bad_alloc(); } return static_cast(ptr); @@ -965,15 +959,15 @@ namespace madness { std::free(ptr); }; }; - using sptr_t = std::unique_ptr; + using sptr_t = std::unique_ptr; sptr_t buf0; if (child0 != -1) - buf0 = sptr_t(static_cast(std::aligned_alloc( + buf0 = sptr_t(static_cast(std::aligned_alloc( std::alignment_of_v, bufsz)), free_dtor{}); sptr_t buf1; if (child1 != -1) - buf1 = sptr_t(static_cast(std::aligned_alloc( + buf1 = sptr_t(static_cast(std::aligned_alloc( std::alignment_of_v, bufsz)), free_dtor{}); From 430e153363f689ed8f9b5d6a78c5ed0a9be2b352 Mon Sep 17 00:00:00 2001 From: Erica Mitchell Date: Mon, 16 Sep 2024 17:14:53 -0400 Subject: [PATCH 5/6] [ci skip] semicolon and linewidth --- src/madness/world/worldgop.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/madness/world/worldgop.h b/src/madness/world/worldgop.h index b28859b5ddd..e4843fb0b49 100644 --- a/src/madness/world/worldgop.h +++ b/src/madness/world/worldgop.h @@ -810,7 +810,7 @@ namespace madness { } return static_cast(ptr); #else - return static_cast(std::aligned_alloc(alignment, buf_size)) + return static_cast(std::aligned_alloc(alignment, buf_size)); #endif }; if (child0 != -1) @@ -972,7 +972,8 @@ namespace madness { free_dtor{}); // transfer data in chunks at most this large - const int batch_size = static_cast(std::min(static_cast(max_reducebcast_msg_size()),bufsz)); + const int batch_size = static_cast( + std::min(static_cast(max_reducebcast_msg_size()),bufsz)); // precompute max # of tags any node ... will need, and allocate them on every node to avoid tag counter divergence const int max_nbatch = bufsz / batch_size; From ef73ffd760497e20338de76789ce5ba4db1ef474 Mon Sep 17 00:00:00 2001 From: Erica Mitchell Date: Mon, 16 Sep 2024 17:23:03 -0400 Subject: [PATCH 6/6] Spacing --- src/madness/world/worldgop.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/madness/world/worldgop.h b/src/madness/world/worldgop.h index e4843fb0b49..f1acc7b1452 100644 --- a/src/madness/world/worldgop.h +++ b/src/madness/world/worldgop.h @@ -796,6 +796,7 @@ namespace madness { }; }; using sptr_t = std::unique_ptr; + sptr_t buf0; auto aligned_buf_alloc = [&]() -> T* { // posix_memalign requires alignment to be an integer multiple of sizeof(void*)!! so ensure that @@ -960,6 +961,7 @@ namespace madness { }; }; using sptr_t = std::unique_ptr; + sptr_t buf0; if (child0 != -1) buf0 = sptr_t(static_cast(std::aligned_alloc(