Skip to content

Commit

Permalink
btl/openib: delay UCX warning to add_procs()
Browse files Browse the repository at this point in the history
If UCX is available, then pml/ucx will be used instead of
pml/ob1 + btl/openib, so there is no need to warn about
btl/openib not supporting InfiniBand.

Signed-off-by: Gilles Gouaillardet <gilles@rist.or.jp>

(cherry picked from commit 0a2ce58)
  • Loading branch information
ggouaillardet committed Dec 5, 2018
1 parent 804f65f commit 99d8576
Show file tree
Hide file tree
Showing 4 changed files with 171 additions and 127 deletions.
130 changes: 70 additions & 60 deletions opal/mca/btl/openib/btl_openib.c
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,8 @@
* Copyright (c) 2009 IBM Corporation. All rights reserved.
* Copyright (c) 2013-2015 Intel, Inc. All rights reserved
* Copyright (c) 2013-2015 NVIDIA Corporation. All rights reserved.
* Copyright (c) 2014-2015 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2014-2018 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2014 Bull SAS. All rights reserved
* $COPYRIGHT$
*
Expand Down Expand Up @@ -1040,6 +1040,14 @@ int mca_btl_openib_add_procs(
int btl_rank = 0;
volatile mca_btl_base_endpoint_t* endpoint;


if (! openib_btl->allowed) {
opal_bitmap_clear_all_bits(reachable);
opal_show_help("help-mpi-btl-openib.txt", "ib port not selected",
true, opal_process_info.nodename,
ibv_get_device_name(openib_btl->device->ib_dev), openib_btl->port_num);
}

btl_rank = get_openib_btl_params(openib_btl, &lcl_subnet_id_port_cnt);
if( 0 > btl_rank ){
return OPAL_ERR_NOT_FOUND;
Expand Down Expand Up @@ -1639,75 +1647,77 @@ static int mca_btl_openib_finalize_resources(struct mca_btl_base_module_t* btl)
return OPAL_SUCCESS;
}

/* Release all QPs */
if (NULL != openib_btl->device->endpoints) {
for (ep_index=0;
ep_index < opal_pointer_array_get_size(openib_btl->device->endpoints);
ep_index++) {
endpoint=(mca_btl_openib_endpoint_t *)opal_pointer_array_get_item(openib_btl->device->endpoints,
if (openib_btl->allowed) {
/* Release all QPs */
if (NULL != openib_btl->device->endpoints) {
for (ep_index=0;
ep_index < opal_pointer_array_get_size(openib_btl->device->endpoints);
ep_index++) {
endpoint=(mca_btl_openib_endpoint_t *)opal_pointer_array_get_item(openib_btl->device->endpoints,
ep_index);
if(!endpoint) {
BTL_VERBOSE(("In finalize, got another null endpoint"));
continue;
}
if(endpoint->endpoint_btl != openib_btl) {
continue;
}
for(i = 0; i < openib_btl->device->eager_rdma_buffers_count; i++) {
if(openib_btl->device->eager_rdma_buffers[i] == endpoint) {
openib_btl->device->eager_rdma_buffers[i] = NULL;
OBJ_RELEASE(endpoint);
if(!endpoint) {
BTL_VERBOSE(("In finalize, got another null endpoint"));
continue;
}
if(endpoint->endpoint_btl != openib_btl) {
continue;
}
for(i = 0; i < openib_btl->device->eager_rdma_buffers_count; i++) {
if(openib_btl->device->eager_rdma_buffers[i] == endpoint) {
openib_btl->device->eager_rdma_buffers[i] = NULL;
OBJ_RELEASE(endpoint);
}
}
opal_pointer_array_set_item(openib_btl->device->endpoints,
ep_index, NULL);
assert(((opal_object_t*)endpoint)->obj_reference_count == 1);
OBJ_RELEASE(endpoint);
}
opal_pointer_array_set_item(openib_btl->device->endpoints,
ep_index, NULL);
assert(((opal_object_t*)endpoint)->obj_reference_count == 1);
OBJ_RELEASE(endpoint);
}
}

/* Release SRQ resources */
for(qp = 0; qp < mca_btl_openib_component.num_qps; qp++) {
if(!BTL_OPENIB_QP_TYPE_PP(qp)) {
MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(
&openib_btl->qps[qp].u.srq_qp.pending_frags[0]);
MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(
&openib_btl->qps[qp].u.srq_qp.pending_frags[1]);
if (NULL != openib_btl->qps[qp].u.srq_qp.srq) {
opal_mutex_t *lock =
&mca_btl_openib_component.srq_manager.lock;

opal_hash_table_t *srq_addr_table =
&mca_btl_openib_component.srq_manager.srq_addr_table;

opal_mutex_lock(lock);
if (OPAL_SUCCESS !=
opal_hash_table_remove_value_ptr(srq_addr_table,
&openib_btl->qps[qp].u.srq_qp.srq,
sizeof(struct ibv_srq *))) {
BTL_VERBOSE(("Failed to remove SRQ %d entry from hash table.", qp));
rc = OPAL_ERROR;
}
opal_mutex_unlock(lock);
if (0 != ibv_destroy_srq(openib_btl->qps[qp].u.srq_qp.srq)) {
BTL_VERBOSE(("Failed to close SRQ %d", qp));
rc = OPAL_ERROR;
/* Release SRQ resources */
for(qp = 0; qp < mca_btl_openib_component.num_qps; qp++) {
if(!BTL_OPENIB_QP_TYPE_PP(qp)) {
MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(
&openib_btl->qps[qp].u.srq_qp.pending_frags[0]);
MCA_BTL_OPENIB_CLEAN_PENDING_FRAGS(
&openib_btl->qps[qp].u.srq_qp.pending_frags[1]);
if (NULL != openib_btl->qps[qp].u.srq_qp.srq) {
opal_mutex_t *lock =
&mca_btl_openib_component.srq_manager.lock;

opal_hash_table_t *srq_addr_table =
&mca_btl_openib_component.srq_manager.srq_addr_table;

opal_mutex_lock(lock);
if (OPAL_SUCCESS !=
opal_hash_table_remove_value_ptr(srq_addr_table,
&openib_btl->qps[qp].u.srq_qp.srq,
sizeof(struct ibv_srq *))) {
BTL_VERBOSE(("Failed to remove SRQ %d entry from hash table.", qp));
rc = OPAL_ERROR;
}
opal_mutex_unlock(lock);
if (0 != ibv_destroy_srq(openib_btl->qps[qp].u.srq_qp.srq)) {
BTL_VERBOSE(("Failed to close SRQ %d", qp));
rc = OPAL_ERROR;
}
}
}

OBJ_DESTRUCT(&openib_btl->qps[qp].u.srq_qp.pending_frags[0]);
OBJ_DESTRUCT(&openib_btl->qps[qp].u.srq_qp.pending_frags[1]);
OBJ_DESTRUCT(&openib_btl->qps[qp].u.srq_qp.pending_frags[0]);
OBJ_DESTRUCT(&openib_btl->qps[qp].u.srq_qp.pending_frags[1]);
}
}
}

/* Finalize the CPC modules on this openib module */
for (i = 0; i < openib_btl->num_cpcs; ++i) {
if (NULL != openib_btl->cpcs[i]->cbm_finalize) {
openib_btl->cpcs[i]->cbm_finalize(openib_btl, openib_btl->cpcs[i]);
/* Finalize the CPC modules on this openib module */
for (i = 0; i < openib_btl->num_cpcs; ++i) {
if (NULL != openib_btl->cpcs[i]->cbm_finalize) {
openib_btl->cpcs[i]->cbm_finalize(openib_btl, openib_btl->cpcs[i]);
}
free(openib_btl->cpcs[i]);
}
free(openib_btl->cpcs[i]);
free(openib_btl->cpcs);
}
free(openib_btl->cpcs);

/* Release device if there are no more users */
if(!(--openib_btl->device->btls)) {
Expand Down
9 changes: 7 additions & 2 deletions opal/mca/btl/openib/btl_openib.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,8 @@
* Copyright (c) 2009-2010 Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2013-2014 NVIDIA Corporation. All rights reserved.
* Copyright (c) 2014 Bull SAS. All rights reserved.
* Copyright (c) 2015-2016 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2015-2018 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
Expand Down Expand Up @@ -164,6 +164,9 @@ struct mca_btl_openib_component_t {
int ib_num_btls;
/**< number of devices available to the openib component */

int ib_allowed_btls;
/**< number of devices allowed to the openib component */

struct mca_btl_openib_module_t **openib_btls;
/**< array of available BTLs */

Expand Down Expand Up @@ -501,6 +504,8 @@ struct mca_btl_openib_module_t {
int local_procs; /** number of local procs */

bool atomic_ops_be; /** atomic result is big endian */

bool allowed; /** is this port allowed */
};
typedef struct mca_btl_openib_module_t mca_btl_openib_module_t;

Expand Down
Loading

0 comments on commit 99d8576

Please sign in to comment.