Skip to content

Commit

Permalink
btl/openib: immediately release the device when no port is allowed
Browse files Browse the repository at this point in the history
Many thanks to Sergey Oblomov for reporting this issue
and for the countless traces he provided while troubleshooting it.

This is a one-off commit for the v4.0.x branch, since btl/openib has been
removed from master.

Refs. #6137

Signed-off-by: Gilles Gouaillardet <gilles@rist.or.jp>
  • Loading branch information
ggouaillardet committed Mar 19, 2019
1 parent c58c774 commit 8da4605
Show file tree
Hide file tree
Showing 3 changed files with 18 additions and 8 deletions.
10 changes: 5 additions & 5 deletions opal/mca/btl/openib/btl_openib.c
Original file line number Diff line number Diff line change
Expand Up @@ -1045,7 +1045,7 @@ int mca_btl_openib_add_procs(
opal_bitmap_clear_all_bits(reachable);
opal_show_help("help-mpi-btl-openib.txt", "ib port not selected",
true, opal_process_info.nodename,
ibv_get_device_name(openib_btl->device->ib_dev), openib_btl->port_num);
openib_btl->device_name, openib_btl->port_num);
return OPAL_SUCCESS;
}

Expand Down Expand Up @@ -1718,11 +1718,11 @@ static int mca_btl_openib_finalize_resources(struct mca_btl_base_module_t* btl)
free(openib_btl->cpcs[i]);
}
free(openib_btl->cpcs);
}

/* Release device if there are no more users */
if(!(--openib_btl->device->btls)) {
OBJ_RELEASE(openib_btl->device);
/* Release device if there are no more users */
if(!(--openib_btl->device->allowed_btls)) {
OBJ_RELEASE(openib_btl->device);
}
}

if (NULL != openib_btl->qps) {
Expand Down
2 changes: 2 additions & 0 deletions opal/mca/btl/openib/btl_openib.h
Original file line number Diff line number Diff line change
Expand Up @@ -392,6 +392,7 @@ typedef struct mca_btl_openib_device_t {
/* Whether this device supports eager RDMA */
uint8_t use_eager_rdma;
uint8_t btls; /** < number of btls using this device */
uint8_t allowed_btls; /** < number of allowed btls using this device */
opal_pointer_array_t *endpoints;
opal_pointer_array_t *device_btls;
uint16_t hp_cq_polls;
Expand Down Expand Up @@ -483,6 +484,7 @@ struct mca_btl_openib_module_t {
uint8_t num_cpcs;

mca_btl_openib_device_t *device;
char * device_name;
uint8_t port_num; /**< ID of the PORT */
uint16_t pkey_index;
struct ibv_port_attr ib_port_attr;
Expand Down
14 changes: 11 additions & 3 deletions opal/mca/btl/openib/btl_openib_component.c
Original file line number Diff line number Diff line change
Expand Up @@ -648,9 +648,10 @@ static int init_one_port(opal_list_t *btl_list, mca_btl_openib_device_t *device,
sizeof(mca_btl_openib_module));
ib_selected = OBJ_NEW(mca_btl_base_selected_module_t);
ib_selected->btl_module = (mca_btl_base_module_t*) openib_btl;
openib_btl->device = device;
openib_btl->port_num = (uint8_t) port_num;
openib_btl->allowed = false;
openib_btl->device = NULL;
openib_btl->device_name = strdup(ibv_get_device_name(device->ib_dev));
OBJ_CONSTRUCT(&openib_btl->ib_lock, opal_mutex_t);
opal_list_append(btl_list, (opal_list_item_t*) ib_selected);
opal_pointer_array_add(device->device_btls, (void*) openib_btl);
Expand Down Expand Up @@ -784,6 +785,7 @@ static int init_one_port(opal_list_t *btl_list, mca_btl_openib_device_t *device,
ib_selected = OBJ_NEW(mca_btl_base_selected_module_t);
ib_selected->btl_module = (mca_btl_base_module_t*) openib_btl;
openib_btl->device = device;
openib_btl->device_name = NULL;
openib_btl->port_num = (uint8_t) port_num;
openib_btl->pkey_index = pkey_index;
openib_btl->lid = lid;
Expand Down Expand Up @@ -904,6 +906,7 @@ static int init_one_port(opal_list_t *btl_list, mca_btl_openib_device_t *device,
opal_list_append(btl_list, (opal_list_item_t*) ib_selected);
opal_pointer_array_add(device->device_btls, (void*) openib_btl);
++device->btls;
++device->allowed_btls;
++mca_btl_openib_component.ib_num_btls;
++mca_btl_openib_component.ib_allowed_btls;
if (-1 != mca_btl_openib_component.ib_max_btls &&
Expand Down Expand Up @@ -1933,7 +1936,7 @@ static int init_one_device(opal_list_t *btl_list, struct ibv_device* ib_dev)
if (ib_port_attr.active_mtu < device->mtu){
device->mtu = ib_port_attr.active_mtu;
}
if (mca_btl_openib_component.apm_ports && device->btls > 0) {
if (mca_btl_openib_component.apm_ports && device->allowed_btls > 0) {
init_apm_port(device, i, ib_port_attr.lid);
break;
}
Expand Down Expand Up @@ -1969,7 +1972,7 @@ static int init_one_device(opal_list_t *btl_list, struct ibv_device* ib_dev)

/* If we made a BTL, check APM status and return. Otherwise, fall
through and destroy everything */
if (device->btls > 0) {
if (device->allowed_btls > 0) {
/* if apm was enabled it should be > 1 */
if (1 == mca_btl_openib_component.apm_ports) {
opal_show_help("help-mpi-btl-openib.txt",
Expand Down Expand Up @@ -2290,6 +2293,11 @@ static int init_one_device(opal_list_t *btl_list, struct ibv_device* ib_dev)
good:
mca_btl_openib_component.devices_count++;
return OPAL_SUCCESS;
} else if (device->btls > 0) {
/* no port is allowed to be used by btl/openib,
* so release the device right away */
OBJ_RELEASE(device);
return OPAL_SUCCESS;
}

error:
Expand Down

0 comments on commit 8da4605

Please sign in to comment.