From e13d7c5ac655ab590de240e2d5259ef1c805a2fa Mon Sep 17 00:00:00 2001 From: Elliot Ronaghan Date: Sat, 15 Feb 2020 07:37:48 -0600 Subject: [PATCH 1/2] Serialize calls to gasnet_AMPoll In some configurations (notably gasnet-ibv, especially with EPYC processors) there can be significant contention from concurrent AM polls. Serialize our calls to reduce that contention. This significantly improves performance of concurrent blocking on-stmts/active-messages. For ra-on with 48-core Intel Cascade Lake CPUs we see a ~2x speedup and with 48-core AMD Rome CPUs we see a 55x speedup. This change was motivated by seeing a large performance difference on AMD EPYC processors, but happily it has helped both Intel and AMD chips. For 36-core Broadwell nodes (where our nightly ibv performance testing runs) we see the following improvements: - 85% speedup for SSCA - 75% speedup for ra-on - 35% speedup for ra-atomics - 30% speedup for bale histogram - 20% speedup for fft - 15% speedup for lulesh --- runtime/src/comm/gasnet/comm-gasnet.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/runtime/src/comm/gasnet/comm-gasnet.c b/runtime/src/comm/gasnet/comm-gasnet.c index 9279077a2bc3..a2a1ed83e1d0 100644 --- a/runtime/src/comm/gasnet/comm-gasnet.c +++ b/runtime/src/comm/gasnet/comm-gasnet.c @@ -151,6 +151,8 @@ void init_done_obj(done_t* done, int target) { done->flag = 0; } +static inline void am_poll_try(void); + static inline void wait_done_obj(done_t* done, chpl_bool do_yield) { @@ -158,7 +160,7 @@ void wait_done_obj(done_t* done, chpl_bool do_yield) GASNET_BLOCKUNTIL(done->flag); #else while (!done->flag) { - (void) gasnet_AMPoll(); + am_poll_try(); if (do_yield) chpl_task_yield(); } @@ -720,12 +722,23 @@ int32_t chpl_comm_getMaxThreads(void) { static volatile int pollingRunning; static volatile int pollingQuit; static chpl_bool pollingRequired; +static atomic_bool pollingLock; + +static inline void am_poll_try(void) { + // Serialize access to polling. Concurrent polling can have serious + // contention issues in some configurations (particularly ibv) + if (!atomic_load_explicit_bool(&pollingLock, memory_order_acquire) && + !atomic_exchange_explicit_bool(&pollingLock, true, memory_order_acquire)) { + (void) gasnet_AMPoll(); + atomic_store_explicit_bool(&pollingLock, false, memory_order_release); + } +} static void polling(void* x) { pollingRunning = 1; while (!pollingQuit) { - (void) gasnet_AMPoll(); + am_poll_try(); chpl_task_yield(); } @@ -733,6 +746,7 @@ static void polling(void* x) { } static void setup_polling(void) { + atomic_init_bool(&pollingLock, false); #if defined(GASNET_CONDUIT_IBV) pollingRequired = false; chpl_env_set("GASNET_RCV_THREAD", "1", 1); From 0279d61c30c67cc335fe6426c4cbabde97414dd9 Mon Sep 17 00:00:00 2001 From: Elliot Ronaghan Date: Tue, 18 Feb 2020 09:21:38 -0500 Subject: [PATCH 2/2] Only serialize polling for IBV/Aries The ibv and aries substrates use RDMA for PUTs/GETs, but udp (amudp) and mpi (ammpi) use active messages for most operations. Serializing for udp hurt performance and mpi already serializes AMs internally so there's no need for the extra serialization. --- runtime/src/comm/gasnet/comm-gasnet.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/runtime/src/comm/gasnet/comm-gasnet.c b/runtime/src/comm/gasnet/comm-gasnet.c index a2a1ed83e1d0..74763647b162 100644 --- a/runtime/src/comm/gasnet/comm-gasnet.c +++ b/runtime/src/comm/gasnet/comm-gasnet.c @@ -725,13 +725,18 @@ static chpl_bool pollingRequired; static atomic_bool pollingLock; static inline void am_poll_try(void) { - // Serialize access to polling. Concurrent polling can have serious - // contention issues in some configurations (particularly ibv) + // Serialize polling for IBV and Aries. Concurrent polling causes contention + // in these configurations. For other configurations that are AM-based + // (udp/amudp, mpi/ammpi) serializing can hurt performance. +#if defined(GASNET_CONDUIT_IBV) || defined(GASNET_CONDUIT_ARIES) if (!atomic_load_explicit_bool(&pollingLock, memory_order_acquire) && !atomic_exchange_explicit_bool(&pollingLock, true, memory_order_acquire)) { (void) gasnet_AMPoll(); atomic_store_explicit_bool(&pollingLock, false, memory_order_release); } +#else + (void) gasnet_AMPoll(); +#endif } static void polling(void* x) {