Skip to content

Commit

Permalink
osc/rdma: rework locking code to improve behavior of unlock
Browse files Browse the repository at this point in the history
This commit changes the locking code to allow the lock release to be
non-blocking. This helps with releasing the accumulate lock which may
occur in a BTL callback.

Fixes open-mpi#3616

Signed-off-by: Nathan Hjelm <hjelmn@lanl.gov>
(cherry picked from commit 022c658)
Signed-off-by: Nathan Hjelm <hjelmn@lanl.gov>
  • Loading branch information
hjelmn committed Jun 28, 2017
1 parent 49e1878 commit 6381658
Show file tree
Hide file tree
Showing 3 changed files with 136 additions and 130 deletions.
125 changes: 46 additions & 79 deletions ompi/mca/osc/rdma/osc_rdma_active_target.c
Original file line number Diff line number Diff line change
Expand Up @@ -45,18 +45,47 @@ typedef struct ompi_osc_rdma_pending_post_t ompi_osc_rdma_pending_post_t;

static OBJ_CLASS_INSTANCE(ompi_osc_rdma_pending_post_t, opal_list_item_t, NULL, NULL);

static void ompi_osc_rdma_pending_op_construct (ompi_osc_rdma_pending_op_t *pending_op)
{
pending_op->op_frag = NULL;
pending_op->op_buffer = NULL;
pending_op->op_result = NULL;
pending_op->op_complete = false;
}

static void ompi_osc_rdma_pending_op_destruct (ompi_osc_rdma_pending_op_t *pending_op)
{
if (NULL != pending_op->op_frag) {
ompi_osc_rdma_frag_complete (pending_op->op_frag);
}

ompi_osc_rdma_pending_op_construct (pending_op);
}

OBJ_CLASS_INSTANCE(ompi_osc_rdma_pending_op_t, opal_list_item_t,
ompi_osc_rdma_pending_op_construct,
ompi_osc_rdma_pending_op_destruct);

/**
* Dummy completion function for atomic operations
*/
void ompi_osc_rdma_atomic_complete (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
void *local_address, mca_btl_base_registration_handle_t *local_handle,
void *context, void *data, int status)
{
volatile bool *atomic_complete = (volatile bool *) context;
ompi_osc_rdma_pending_op_t *pending_op = (ompi_osc_rdma_pending_op_t *) context;

if (atomic_complete) {
*atomic_complete = true;
if (pending_op->op_result) {
memmove (pending_op->op_result, pending_op->op_buffer, pending_op->op_size);
}

if (NULL != pending_op->op_frag) {
ompi_osc_rdma_frag_complete (pending_op->op_frag);
pending_op->op_frag = NULL;
}

pending_op->op_complete = true;
OBJ_RELEASE(pending_op);
}

/**
Expand Down Expand Up @@ -179,9 +208,6 @@ int ompi_osc_rdma_post_atomic (ompi_group_t *group, int assert, ompi_win_t *win)
ompi_osc_rdma_peer_t **peers;
int my_rank = ompi_comm_rank (module->comm);
ompi_osc_rdma_state_t *state = module->state;
volatile bool atomic_complete;
ompi_osc_rdma_frag_t *frag;
osc_rdma_counter_t *temp;
int ret;

OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "post: %p, %d, %s", (void*) group, assert, win->w_name);
Expand Down Expand Up @@ -209,9 +235,6 @@ int ompi_osc_rdma_post_atomic (ompi_group_t *group, int assert, ompi_win_t *win)
state->num_complete_msgs = 0;
OPAL_THREAD_UNLOCK(&module->lock);

/* allocate a temporary buffer for atomic response */
ret = ompi_osc_rdma_frag_alloc (module, 8, &frag, (char **) &temp);

if ((assert & MPI_MODE_NOCHECK) || 0 == ompi_group_size (group)) {
return OMPI_SUCCESS;
}
Expand All @@ -223,7 +246,6 @@ int ompi_osc_rdma_post_atomic (ompi_group_t *group, int assert, ompi_win_t *win)
/* translate group ranks into the communicator */
peers = ompi_osc_rdma_get_peers (module, module->pw_group);
if (OPAL_UNLIKELY(NULL == peers)) {
ompi_osc_rdma_frag_complete (frag);
return OMPI_ERR_OUT_OF_RESOURCE;
}

Expand All @@ -233,65 +255,40 @@ int ompi_osc_rdma_post_atomic (ompi_group_t *group, int assert, ompi_win_t *win)
for (int i = 0 ; i < ompi_group_size(module->pw_group) ; ++i) {
ompi_osc_rdma_peer_t *peer = peers[i];
uint64_t target = (uint64_t) (intptr_t) peer->state + offsetof (ompi_osc_rdma_state_t, post_index);
int post_index;
ompi_osc_rdma_lock_t post_index;

if (peer->rank == my_rank) {
ompi_osc_rdma_handle_post (module, my_rank, NULL, 0);
continue;
}

/* get a post index */
atomic_complete = false;
if (!ompi_osc_rdma_peer_local_state (peer)) {
do {
ret = module->selected_btl->btl_atomic_fop (module->selected_btl, peer->state_endpoint, temp, target, frag->handle,
peer->state_handle, MCA_BTL_ATOMIC_ADD, 1, 0, MCA_BTL_NO_ORDER,
ompi_osc_rdma_atomic_complete, (void *) &atomic_complete, NULL);
assert (OPAL_SUCCESS >= ret);

if (OMPI_SUCCESS == ret) {
while (!atomic_complete) {
ompi_osc_rdma_progress (module);
}

break;
}

ompi_osc_rdma_progress (module);
} while (1);
ret = ompi_osc_rdma_lock_btl_fop (module, peer, target, MCA_BTL_ATOMIC_ADD, 1, &post_index, true);
assert (OMPI_SUCCESS == ret);
} else {
*temp = ompi_osc_rdma_counter_add ((osc_rdma_counter_t *) (intptr_t) target, 1) - 1;
post_index = ompi_osc_rdma_counter_add ((osc_rdma_counter_t *) (intptr_t) target, 1) - 1;
}
post_index = (*temp) & (OMPI_OSC_RDMA_POST_PEER_MAX - 1);

post_index &= OMPI_OSC_RDMA_POST_PEER_MAX - 1;

target = (uint64_t) (intptr_t) peer->state + offsetof (ompi_osc_rdma_state_t, post_peers) +
sizeof (osc_rdma_counter_t) * post_index;

do {
ompi_osc_rdma_lock_t result;

OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "attempting to post to index %d @ rank %d", post_index, peer->rank);

/* try to post. if the value isn't 0 then another rank is occupying this index */
if (!ompi_osc_rdma_peer_local_state (peer)) {
atomic_complete = false;
ret = module->selected_btl->btl_atomic_cswap (module->selected_btl, peer->state_endpoint, temp, target, frag->handle, peer->state_handle,
0, 1 + (int64_t) my_rank, 0, MCA_BTL_NO_ORDER, ompi_osc_rdma_atomic_complete,
(void *) &atomic_complete, NULL);
assert (OPAL_SUCCESS >= ret);

if (OMPI_SUCCESS == ret) {
while (!atomic_complete) {
ompi_osc_rdma_progress (module);
}
} else {
ompi_osc_rdma_progress (module);
continue;
}

ret = ompi_osc_rdma_lock_btl_cswap (module, peer, target, 0, 1 + (int64_t) my_rank, &result);
assert (OMPI_SUCCESS == ret);
} else {
*temp = !ompi_osc_rdma_lock_cmpset ((osc_rdma_counter_t *) target, 0, 1 + (osc_rdma_counter_t) my_rank);
result = !ompi_osc_rdma_lock_cmpset ((osc_rdma_counter_t *) target, 0, 1 + (osc_rdma_counter_t) my_rank);
}

if (OPAL_LIKELY(0 == *temp)) {
if (OPAL_LIKELY(0 == result)) {
break;
}

Expand All @@ -310,8 +307,6 @@ int ompi_osc_rdma_post_atomic (ompi_group_t *group, int assert, ompi_win_t *win)
} while (1);
}

ompi_osc_rdma_frag_complete (frag);

ompi_osc_rdma_release_peers (peers, ompi_group_size(module->pw_group));

OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "post complete");
Expand Down Expand Up @@ -419,9 +414,7 @@ int ompi_osc_rdma_complete_atomic (ompi_win_t *win)
{
ompi_osc_rdma_module_t *module = GET_MODULE(win);
ompi_osc_rdma_sync_t *sync = &module->all_sync;
ompi_osc_rdma_frag_t *frag = NULL;
ompi_osc_rdma_peer_t **peers;
void *scratch_lock = NULL;
ompi_group_t *group;
int group_size, ret;

Expand Down Expand Up @@ -456,45 +449,19 @@ int ompi_osc_rdma_complete_atomic (ompi_win_t *win)

ompi_osc_rdma_sync_rdma_complete (sync);

if (!(MCA_BTL_FLAGS_ATOMIC_OPS & module->selected_btl->btl_flags)) {
/* need a temporary buffer for performing fetching atomics */
ret = ompi_osc_rdma_frag_alloc (module, 8, &frag, (char **) &scratch_lock);
if (OPAL_UNLIKELY(OPAL_SUCCESS != ret)) {
return ret;
}
}

/* for each process in the group increment their number of complete messages */
for (int i = 0 ; i < group_size ; ++i) {
ompi_osc_rdma_peer_t *peer = peers[i];
intptr_t target = (intptr_t) peer->state + offsetof (ompi_osc_rdma_state_t, num_complete_msgs);

if (!ompi_osc_rdma_peer_local_state (peer)) {
do {
if (MCA_BTL_FLAGS_ATOMIC_OPS & module->selected_btl->btl_flags) {
ret = module->selected_btl->btl_atomic_op (module->selected_btl, peer->state_endpoint, target, peer->state_handle,
MCA_BTL_ATOMIC_ADD, 1, 0, MCA_BTL_NO_ORDER,
ompi_osc_rdma_atomic_complete, NULL, NULL);
} else {
/* don't care about the read value so use the scratch lock */
ret = module->selected_btl->btl_atomic_fop (module->selected_btl, peer->state_endpoint, scratch_lock,
target, frag->handle, peer->state_handle, MCA_BTL_ATOMIC_ADD, 1,
0, MCA_BTL_NO_ORDER, ompi_osc_rdma_atomic_complete, NULL, NULL);
}

if (OPAL_LIKELY(OMPI_SUCCESS == ret)) {
break;
}
} while (1);
ret = ompi_osc_rdma_lock_btl_op (module, peer, target, MCA_BTL_ATOMIC_ADD, 1, true);
assert (OMPI_SUCCESS == ret);
} else {
(void) ompi_osc_rdma_counter_add ((osc_rdma_counter_t *) target, 1);
}
}

if (frag) {
ompi_osc_rdma_frag_complete (frag);
}

/* release our reference to peers in this group */
ompi_osc_rdma_release_peers (peers, group_size);

Expand Down
Loading

0 comments on commit 6381658

Please sign in to comment.