From 6381658b1f2ca9c7b9810ea20f9e0232e1b8dc60 Mon Sep 17 00:00:00 2001 From: Nathan Hjelm Date: Wed, 21 Jun 2017 12:42:50 -0600 Subject: [PATCH] osc/rdma: rework locking code to improve behavior of unlock This commit changes the locking code to allow the lock release to be non-blocking. This helps with releasing the accumulate lock which may occur in a BTL callback. Fixes #3616 Signed-off-by: Nathan Hjelm (cherry picked from commit 022c658bbf36f7ad1ab09caed7623e5d9f9e6723) Signed-off-by: Nathan Hjelm --- ompi/mca/osc/rdma/osc_rdma_active_target.c | 125 ++++++++------------ ompi/mca/osc/rdma/osc_rdma_lock.h | 128 +++++++++++++-------- ompi/mca/osc/rdma/osc_rdma_types.h | 13 +++ 3 files changed, 136 insertions(+), 130 deletions(-) diff --git a/ompi/mca/osc/rdma/osc_rdma_active_target.c b/ompi/mca/osc/rdma/osc_rdma_active_target.c index 11338489cb8..043ab93c4b5 100644 --- a/ompi/mca/osc/rdma/osc_rdma_active_target.c +++ b/ompi/mca/osc/rdma/osc_rdma_active_target.c @@ -45,6 +45,27 @@ typedef struct ompi_osc_rdma_pending_post_t ompi_osc_rdma_pending_post_t; static OBJ_CLASS_INSTANCE(ompi_osc_rdma_pending_post_t, opal_list_item_t, NULL, NULL); +static void ompi_osc_rdma_pending_op_construct (ompi_osc_rdma_pending_op_t *pending_op) +{ + pending_op->op_frag = NULL; + pending_op->op_buffer = NULL; + pending_op->op_result = NULL; + pending_op->op_complete = false; +} + +static void ompi_osc_rdma_pending_op_destruct (ompi_osc_rdma_pending_op_t *pending_op) +{ + if (NULL != pending_op->op_frag) { + ompi_osc_rdma_frag_complete (pending_op->op_frag); + } + + ompi_osc_rdma_pending_op_construct (pending_op); +} + +OBJ_CLASS_INSTANCE(ompi_osc_rdma_pending_op_t, opal_list_item_t, + ompi_osc_rdma_pending_op_construct, + ompi_osc_rdma_pending_op_destruct); + /** * Dummy completion function for atomic operations */ @@ -52,11 +73,19 @@ void ompi_osc_rdma_atomic_complete (mca_btl_base_module_t *btl, struct mca_btl_b void *local_address, mca_btl_base_registration_handle_t *local_handle, void *context, void *data, int status) { - volatile bool *atomic_complete = (volatile bool *) context; + ompi_osc_rdma_pending_op_t *pending_op = (ompi_osc_rdma_pending_op_t *) context; - if (atomic_complete) { - *atomic_complete = true; + if (pending_op->op_result) { + memmove (pending_op->op_result, pending_op->op_buffer, pending_op->op_size); } + + if (NULL != pending_op->op_frag) { + ompi_osc_rdma_frag_complete (pending_op->op_frag); + pending_op->op_frag = NULL; + } + + pending_op->op_complete = true; + OBJ_RELEASE(pending_op); } /** @@ -179,9 +208,6 @@ int ompi_osc_rdma_post_atomic (ompi_group_t *group, int assert, ompi_win_t *win) ompi_osc_rdma_peer_t **peers; int my_rank = ompi_comm_rank (module->comm); ompi_osc_rdma_state_t *state = module->state; - volatile bool atomic_complete; - ompi_osc_rdma_frag_t *frag; - osc_rdma_counter_t *temp; int ret; OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "post: %p, %d, %s", (void*) group, assert, win->w_name); @@ -209,9 +235,6 @@ int ompi_osc_rdma_post_atomic (ompi_group_t *group, int assert, ompi_win_t *win) state->num_complete_msgs = 0; OPAL_THREAD_UNLOCK(&module->lock); - /* allocate a temporary buffer for atomic response */ - ret = ompi_osc_rdma_frag_alloc (module, 8, &frag, (char **) &temp); - if ((assert & MPI_MODE_NOCHECK) || 0 == ompi_group_size (group)) { return OMPI_SUCCESS; } @@ -223,7 +246,6 @@ int ompi_osc_rdma_post_atomic (ompi_group_t *group, int assert, ompi_win_t *win) /* translate group ranks into the communicator */ peers = ompi_osc_rdma_get_peers (module, module->pw_group); if (OPAL_UNLIKELY(NULL == peers)) { - ompi_osc_rdma_frag_complete (frag); return OMPI_ERR_OUT_OF_RESOURCE; } @@ -233,7 +255,7 @@ int ompi_osc_rdma_post_atomic (ompi_group_t *group, int assert, ompi_win_t *win) for (int i = 0 ; i < ompi_group_size(module->pw_group) ; ++i) { ompi_osc_rdma_peer_t *peer = peers[i]; uint64_t target = (uint64_t) (intptr_t) peer->state + offsetof (ompi_osc_rdma_state_t, post_index); - int post_index; + ompi_osc_rdma_lock_t post_index; if (peer->rank == my_rank) { ompi_osc_rdma_handle_post (module, my_rank, NULL, 0); @@ -241,57 +263,32 @@ int ompi_osc_rdma_post_atomic (ompi_group_t *group, int assert, ompi_win_t *win) } /* get a post index */ - atomic_complete = false; if (!ompi_osc_rdma_peer_local_state (peer)) { - do { - ret = module->selected_btl->btl_atomic_fop (module->selected_btl, peer->state_endpoint, temp, target, frag->handle, - peer->state_handle, MCA_BTL_ATOMIC_ADD, 1, 0, MCA_BTL_NO_ORDER, - ompi_osc_rdma_atomic_complete, (void *) &atomic_complete, NULL); - assert (OPAL_SUCCESS >= ret); - - if (OMPI_SUCCESS == ret) { - while (!atomic_complete) { - ompi_osc_rdma_progress (module); - } - - break; - } - - ompi_osc_rdma_progress (module); - } while (1); + ret = ompi_osc_rdma_lock_btl_fop (module, peer, target, MCA_BTL_ATOMIC_ADD, 1, &post_index, true); + assert (OMPI_SUCCESS == ret); } else { - *temp = ompi_osc_rdma_counter_add ((osc_rdma_counter_t *) (intptr_t) target, 1) - 1; + post_index = ompi_osc_rdma_counter_add ((osc_rdma_counter_t *) (intptr_t) target, 1) - 1; } - post_index = (*temp) & (OMPI_OSC_RDMA_POST_PEER_MAX - 1); + + post_index &= OMPI_OSC_RDMA_POST_PEER_MAX - 1; target = (uint64_t) (intptr_t) peer->state + offsetof (ompi_osc_rdma_state_t, post_peers) + sizeof (osc_rdma_counter_t) * post_index; do { + ompi_osc_rdma_lock_t result; + OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "attempting to post to index %d @ rank %d", post_index, peer->rank); /* try to post. if the value isn't 0 then another rank is occupying this index */ if (!ompi_osc_rdma_peer_local_state (peer)) { - atomic_complete = false; - ret = module->selected_btl->btl_atomic_cswap (module->selected_btl, peer->state_endpoint, temp, target, frag->handle, peer->state_handle, - 0, 1 + (int64_t) my_rank, 0, MCA_BTL_NO_ORDER, ompi_osc_rdma_atomic_complete, - (void *) &atomic_complete, NULL); - assert (OPAL_SUCCESS >= ret); - - if (OMPI_SUCCESS == ret) { - while (!atomic_complete) { - ompi_osc_rdma_progress (module); - } - } else { - ompi_osc_rdma_progress (module); - continue; - } - + ret = ompi_osc_rdma_lock_btl_cswap (module, peer, target, 0, 1 + (int64_t) my_rank, &result); + assert (OMPI_SUCCESS == ret); } else { - *temp = !ompi_osc_rdma_lock_cmpset ((osc_rdma_counter_t *) target, 0, 1 + (osc_rdma_counter_t) my_rank); + result = !ompi_osc_rdma_lock_cmpset ((osc_rdma_counter_t *) target, 0, 1 + (osc_rdma_counter_t) my_rank); } - if (OPAL_LIKELY(0 == *temp)) { + if (OPAL_LIKELY(0 == result)) { break; } @@ -310,8 +307,6 @@ int ompi_osc_rdma_post_atomic (ompi_group_t *group, int assert, ompi_win_t *win) } while (1); } - ompi_osc_rdma_frag_complete (frag); - ompi_osc_rdma_release_peers (peers, ompi_group_size(module->pw_group)); OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "post complete"); @@ -419,9 +414,7 @@ int ompi_osc_rdma_complete_atomic (ompi_win_t *win) { ompi_osc_rdma_module_t *module = GET_MODULE(win); ompi_osc_rdma_sync_t *sync = &module->all_sync; - ompi_osc_rdma_frag_t *frag = NULL; ompi_osc_rdma_peer_t **peers; - void *scratch_lock = NULL; ompi_group_t *group; int group_size, ret; @@ -456,45 +449,19 @@ int ompi_osc_rdma_complete_atomic (ompi_win_t *win) ompi_osc_rdma_sync_rdma_complete (sync); - if (!(MCA_BTL_FLAGS_ATOMIC_OPS & module->selected_btl->btl_flags)) { - /* need a temporary buffer for performing fetching atomics */ - ret = ompi_osc_rdma_frag_alloc (module, 8, &frag, (char **) &scratch_lock); - if (OPAL_UNLIKELY(OPAL_SUCCESS != ret)) { - return ret; - } - } - /* for each process in the group increment their number of complete messages */ for (int i = 0 ; i < group_size ; ++i) { ompi_osc_rdma_peer_t *peer = peers[i]; intptr_t target = (intptr_t) peer->state + offsetof (ompi_osc_rdma_state_t, num_complete_msgs); if (!ompi_osc_rdma_peer_local_state (peer)) { - do { - if (MCA_BTL_FLAGS_ATOMIC_OPS & module->selected_btl->btl_flags) { - ret = module->selected_btl->btl_atomic_op (module->selected_btl, peer->state_endpoint, target, peer->state_handle, - MCA_BTL_ATOMIC_ADD, 1, 0, MCA_BTL_NO_ORDER, - ompi_osc_rdma_atomic_complete, NULL, NULL); - } else { - /* don't care about the read value so use the scratch lock */ - ret = module->selected_btl->btl_atomic_fop (module->selected_btl, peer->state_endpoint, scratch_lock, - target, frag->handle, peer->state_handle, MCA_BTL_ATOMIC_ADD, 1, - 0, MCA_BTL_NO_ORDER, ompi_osc_rdma_atomic_complete, NULL, NULL); - } - - if (OPAL_LIKELY(OMPI_SUCCESS == ret)) { - break; - } - } while (1); + ret = ompi_osc_rdma_lock_btl_op (module, peer, target, MCA_BTL_ATOMIC_ADD, 1, true); + assert (OMPI_SUCCESS == ret); } else { (void) ompi_osc_rdma_counter_add ((osc_rdma_counter_t *) target, 1); } } - if (frag) { - ompi_osc_rdma_frag_complete (frag); - } - /* release our reference to peers in this group */ ompi_osc_rdma_release_peers (peers, group_size); diff --git a/ompi/mca/osc/rdma/osc_rdma_lock.h b/ompi/mca/osc/rdma/osc_rdma_lock.h index 5583711ef28..59774108f99 100644 --- a/ompi/mca/osc/rdma/osc_rdma_lock.h +++ b/ompi/mca/osc/rdma/osc_rdma_lock.h @@ -34,23 +34,34 @@ void ompi_osc_rdma_atomic_complete (mca_btl_base_module_t *btl, struct mca_btl_b __opal_attribute_always_inline__ static inline int ompi_osc_rdma_lock_btl_fop (ompi_osc_rdma_module_t *module, ompi_osc_rdma_peer_t *peer, uint64_t address, - int op, ompi_osc_rdma_lock_t operand, ompi_osc_rdma_lock_t *result) + int op, ompi_osc_rdma_lock_t operand, ompi_osc_rdma_lock_t *result, + const bool wait_for_completion) { - volatile bool atomic_complete = false; - ompi_osc_rdma_frag_t *frag = NULL; - ompi_osc_rdma_lock_t *temp = NULL; + ompi_osc_rdma_pending_op_t *pending_op; int ret; + pending_op = OBJ_NEW(ompi_osc_rdma_pending_op_t); + assert (NULL != pending_op); + + if (wait_for_completion) { + OBJ_RETAIN(pending_op); + } + + pending_op->op_result = (void *) result; + pending_op->op_size = sizeof (ompi_osc_rdma_lock_t); + OBJ_RETAIN(pending_op); + /* spin until the btl has accepted the operation */ do { - if (NULL == frag) { - ret = ompi_osc_rdma_frag_alloc (module, 8, &frag, (char **) &temp); + if (NULL == pending_op->op_frag) { + ret = ompi_osc_rdma_frag_alloc (module, 8, &pending_op->op_frag, (char **) &pending_op->op_buffer); } - if (NULL != frag) { - ret = module->selected_btl->btl_atomic_fop (module->selected_btl, peer->state_endpoint, temp, (intptr_t) address, - frag->handle, peer->state_handle, op, operand, 0, - MCA_BTL_NO_ORDER, ompi_osc_rdma_atomic_complete, (void *) &atomic_complete, - NULL); + + if (NULL != pending_op->op_frag) { + ret = module->selected_btl->btl_atomic_fop (module->selected_btl, peer->state_endpoint, pending_op->op_buffer, + (intptr_t) address, pending_op->op_frag->handle, peer->state_handle, + op, operand, 0, MCA_BTL_NO_ORDER, ompi_osc_rdma_atomic_complete, + (void *) pending_op, NULL); } if (OPAL_LIKELY(!ompi_osc_rdma_oor(ret))) { @@ -59,40 +70,43 @@ static inline int ompi_osc_rdma_lock_btl_fop (ompi_osc_rdma_module_t *module, om ompi_osc_rdma_progress (module); } while (1); - if (OPAL_SUCCESS == ret) { - while (!atomic_complete) { + if (OPAL_SUCCESS != ret) { + /* need to release here because ompi_osc_rdma_atomic_complet was not called */ + OBJ_RELEASE(pending_op); + if (OPAL_LIKELY(1 == ret)) { + ret = OMPI_SUCCESS; + } + } else if (wait_for_completion) { + while (!pending_op->op_complete) { ompi_osc_rdma_progress (module); } - } else if (1 == ret) { - ret = OMPI_SUCCESS; } - if (NULL != frag) { - if (result) { - *result = *temp; - } - ompi_osc_rdma_frag_complete (frag); - } + OBJ_RELEASE(pending_op); return ret; } __opal_attribute_always_inline__ static inline int ompi_osc_rdma_lock_btl_op (ompi_osc_rdma_module_t *module, ompi_osc_rdma_peer_t *peer, uint64_t address, - int op, ompi_osc_rdma_lock_t operand) + int op, ompi_osc_rdma_lock_t operand, const bool wait_for_completion) { - volatile bool atomic_complete = false; + ompi_osc_rdma_pending_op_t *pending_op; int ret; if (!(module->selected_btl->btl_flags & MCA_BTL_FLAGS_ATOMIC_OPS)) { - return ompi_osc_rdma_lock_btl_fop (module, peer, address, op, operand, NULL); + return ompi_osc_rdma_lock_btl_fop (module, peer, address, op, operand, NULL, wait_for_completion); } + pending_op = OBJ_NEW(ompi_osc_rdma_pending_op_t); + assert (NULL != pending_op); + OBJ_RETAIN(pending_op); + /* spin until the btl has accepted the operation */ do { ret = module->selected_btl->btl_atomic_op (module->selected_btl, peer->state_endpoint, (intptr_t) address, peer->state_handle, op, operand, 0, MCA_BTL_NO_ORDER, ompi_osc_rdma_atomic_complete, - (void *) &atomic_complete, NULL); + (void *) pending_op, NULL); if (OPAL_LIKELY(!ompi_osc_rdma_oor(ret))) { break; @@ -100,14 +114,20 @@ static inline int ompi_osc_rdma_lock_btl_op (ompi_osc_rdma_module_t *module, omp ompi_osc_rdma_progress (module); } while (1); - if (OPAL_SUCCESS == ret) { - while (!atomic_complete) { + if (OPAL_SUCCESS != ret) { + /* need to release here because ompi_osc_rdma_atomic_complet was not called */ + OBJ_RELEASE(pending_op); + if (OPAL_LIKELY(1 == ret)) { + ret = OMPI_SUCCESS; + } + } else if (wait_for_completion) { + while (!pending_op->op_complete) { ompi_osc_rdma_progress (module); } - } else if (1 == ret) { - ret = OMPI_SUCCESS; } + OBJ_RELEASE(pending_op); + return ret; } @@ -115,20 +135,26 @@ __opal_attribute_always_inline__ static inline int ompi_osc_rdma_lock_btl_cswap (ompi_osc_rdma_module_t *module, ompi_osc_rdma_peer_t *peer, uint64_t address, ompi_osc_rdma_lock_t compare, ompi_osc_rdma_lock_t value, ompi_osc_rdma_lock_t *result) { - volatile bool atomic_complete = false; - ompi_osc_rdma_frag_t *frag = NULL; - ompi_osc_rdma_lock_t *temp = NULL; + ompi_osc_rdma_pending_op_t *pending_op; int ret; + pending_op = OBJ_NEW(ompi_osc_rdma_pending_op_t); + assert (NULL != pending_op); + + OBJ_RETAIN(pending_op); + + pending_op->op_result = (void *) result; + pending_op->op_size = sizeof (*result); + /* spin until the btl has accepted the operation */ do { - if (NULL == frag) { - ret = ompi_osc_rdma_frag_alloc (module, 8, &frag, (char **) &temp); + if (NULL == pending_op->op_frag) { + ret = ompi_osc_rdma_frag_alloc (module, 8, &pending_op->op_frag, (char **) &pending_op->op_buffer); } - if (NULL != frag) { - ret = module->selected_btl->btl_atomic_cswap (module->selected_btl, peer->state_endpoint, temp, address, frag->handle, - peer->state_handle, compare, value, 0, 0, ompi_osc_rdma_atomic_complete, - (void *) &atomic_complete, NULL); + if (NULL != pending_op->op_frag) { + ret = module->selected_btl->btl_atomic_cswap (module->selected_btl, peer->state_endpoint, pending_op->op_buffer, + address, pending_op->op_frag->handle, peer->state_handle, compare, + value, 0, 0, ompi_osc_rdma_atomic_complete, (void *) pending_op, NULL); } if (OPAL_LIKELY(!ompi_osc_rdma_oor(ret))) { @@ -137,20 +163,19 @@ static inline int ompi_osc_rdma_lock_btl_cswap (ompi_osc_rdma_module_t *module, ompi_osc_rdma_progress (module); } while (1); - if (OPAL_SUCCESS == ret) { - while (!atomic_complete) { + if (OPAL_SUCCESS != ret) { + /* need to release here because ompi_osc_rdma_atomic_complet was not called */ + OBJ_RELEASE(pending_op); + if (OPAL_LIKELY(1 == ret)) { + ret = OMPI_SUCCESS; + } + } else { + while (!pending_op->op_complete) { ompi_osc_rdma_progress (module); } - } else if (1 == ret) { - ret = OMPI_SUCCESS; } - if (NULL != frag) { - if (*result) { - *result = *temp; - } - ompi_osc_rdma_frag_complete (frag); - } + OBJ_RELEASE(pending_op); return ret; } @@ -178,7 +203,7 @@ static inline int ompi_osc_rdma_lock_release_shared (ompi_osc_rdma_module_t *mod peer->rank, (unsigned long) value); if (!ompi_osc_rdma_peer_local_state (peer)) { - return ompi_osc_rdma_lock_btl_op (module, peer, lock, MCA_BTL_ATOMIC_ADD, value); + return ompi_osc_rdma_lock_btl_op (module, peer, lock, MCA_BTL_ATOMIC_ADD, value, false); } (void) ompi_osc_rdma_lock_add ((volatile ompi_osc_rdma_lock_t *) lock, value); @@ -215,7 +240,7 @@ static inline int ompi_osc_rdma_lock_acquire_shared (ompi_osc_rdma_module_t *mod /* spin until the lock has been acquired */ if (!ompi_osc_rdma_peer_local_state (peer)) { do { - ret = ompi_osc_rdma_lock_btl_fop (module, peer, lock, MCA_BTL_ATOMIC_ADD, value, &lock_state); + ret = ompi_osc_rdma_lock_btl_fop (module, peer, lock, MCA_BTL_ATOMIC_ADD, value, &lock_state, true); if (OPAL_UNLIKELY(OPAL_SUCCESS != ret)) { OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "failed to increment shared lock. opal error code %d", ret); return ret; @@ -339,7 +364,8 @@ static inline int ompi_osc_rdma_lock_release_exclusive (ompi_osc_rdma_module_t * OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "releasing exclusive lock %" PRIx64 " on peer %d", lock, peer->rank); if (!ompi_osc_rdma_peer_local_state (peer)) { - ret = ompi_osc_rdma_lock_btl_op (module, peer, lock, MCA_BTL_ATOMIC_ADD, -OMPI_OSC_RDMA_LOCK_EXCLUSIVE); + ret = ompi_osc_rdma_lock_btl_op (module, peer, lock, MCA_BTL_ATOMIC_ADD, -OMPI_OSC_RDMA_LOCK_EXCLUSIVE, + false); } else { ompi_osc_rdma_unlock_local ((volatile ompi_osc_rdma_lock_t *)(intptr_t) lock); } diff --git a/ompi/mca/osc/rdma/osc_rdma_types.h b/ompi/mca/osc/rdma/osc_rdma_types.h index 123238d0209..1a8403c5361 100644 --- a/ompi/mca/osc/rdma/osc_rdma_types.h +++ b/ompi/mca/osc/rdma/osc_rdma_types.h @@ -205,6 +205,19 @@ typedef struct ompi_osc_rdma_aggregation_t ompi_osc_rdma_aggregation_t; OBJ_CLASS_DECLARATION(ompi_osc_rdma_aggregation_t); +struct ompi_osc_rdma_pending_op_t { + opal_list_item_t super; + struct ompi_osc_rdma_frag_t *op_frag; + void *op_buffer; + void *op_result; + size_t op_size; + volatile bool op_complete; +}; + +typedef struct ompi_osc_rdma_pending_op_t ompi_osc_rdma_pending_op_t; + +OBJ_CLASS_DECLARATION(ompi_osc_rdma_pending_op_t); + #define OSC_RDMA_VERBOSE(x, ...) OPAL_OUTPUT_VERBOSE((x, ompi_osc_base_framework.framework_output, __VA_ARGS__)) #endif /* OMPI_OSC_RDMA_TYPES_H */