Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

osc/rdma: Fix some bugs running with btl/tcp. #8719

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 9 additions & 14 deletions ompi/mca/osc/rdma/osc_rdma_component.c
Original file line number Diff line number Diff line change
Expand Up @@ -581,7 +581,7 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
int my_rank = ompi_comm_rank (module->comm);
int global_size = ompi_comm_size (module->comm);
ompi_osc_rdma_region_t *state_region;
struct _local_data *temp;
struct _local_data *temp = NULL;
char *data_file;
int page_size = opal_getpagesize();

Expand Down Expand Up @@ -624,13 +624,12 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
size += OPAL_ALIGN_PAD_AMOUNT(size, page_size);
}

do {
temp = calloc (local_size, sizeof (temp[0]));
if (NULL == temp) {
ret = OMPI_ERR_OUT_OF_RESOURCE;
break;
}
temp = calloc (local_size, sizeof (temp[0]));
if (NULL == temp) {
return OMPI_ERR_OUT_OF_RESOURCE;
}

do {
temp[local_rank].rank = my_rank;
temp[local_rank].size = size;

Expand Down Expand Up @@ -788,10 +787,8 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
peer->state_handle = (mca_btl_base_registration_handle_t *) state_region->btl_handle_data;
}
peer->state = (osc_rdma_counter_t) ((uintptr_t) state_region->base + state_base + module->state_size * i);
if (i > 0) {
peer->state_endpoint = local_leader->state_endpoint;
peer->state_btl_index = local_leader->state_btl_index;
}
peer->state_endpoint = local_leader->data_endpoint; // data_endpoint initialized in ompi_osc_rdma_new_peer();
peer->state_btl_index = local_leader->data_btl_index;
}

if (my_rank == peer_rank) {
Expand Down Expand Up @@ -914,10 +911,8 @@ static void ompi_osc_rdma_ensure_local_add_procs (void)
static int ompi_osc_rdma_query_alternate_btls (ompi_communicator_t *comm, ompi_osc_rdma_module_t *module)
{
mca_btl_base_selected_module_t *item;
char **btls_to_use = opal_argv_split (ompi_osc_rdma_btl_alternate_names, ',');
int btls_found = 0;

btls_to_use = opal_argv_split (ompi_osc_rdma_btl_alternate_names, ',');
char **btls_to_use = opal_argv_split (ompi_osc_rdma_btl_alternate_names, ',');
if (NULL == btls_to_use) {
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "no alternate BTLs requested: %s", ompi_osc_rdma_btl_alternate_names);
return OMPI_ERR_UNREACH;
Expand Down
10 changes: 3 additions & 7 deletions ompi/mca/osc/rdma/osc_rdma_lock.h
Original file line number Diff line number Diff line change
Expand Up @@ -87,12 +87,10 @@ static inline int ompi_osc_rdma_btl_fop (ompi_osc_rdma_module_t *module, uint8_t
if (OPAL_SUCCESS != ret) {
if (OPAL_LIKELY(1 == ret)) {
*result = ((int64_t *) pending_op->op_buffer)[0];
ret = OMPI_SUCCESS;
ompi_osc_rdma_atomic_complete (selected_btl, endpoint, pending_op->op_buffer,
pending_op->op_frag->handle, (void *) pending_op, NULL, OPAL_SUCCESS);
} else {
/* need to release here because ompi_osc_rdma_atomic_complete was not called */
OBJ_RELEASE(pending_op);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

good catch.

NULL, (void *) pending_op, NULL, OPAL_SUCCESS);
// ompi_osc_rdma_atomic_complete() frees pending_op.
return OMPI_SUCCESS;
}
} else if (wait_for_completion) {
while (!pending_op->op_complete) {
Expand Down Expand Up @@ -227,8 +225,6 @@ static inline int ompi_osc_rdma_btl_cswap (ompi_osc_rdma_module_t *module, uint8
ret = OMPI_SUCCESS;
}

/* need to release here because ompi_osc_rdma_atomic_complete was not called */
OBJ_RELEASE(pending_op);
} else {
while (!pending_op->op_complete) {
ompi_osc_rdma_progress (module);
Expand Down