-
Notifications
You must be signed in to change notification settings - Fork 868
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
btl/openib: do not initialize device with not allowed ports #6184
Conversation
…led. Fixes an issue introduced in open-mpi/ompi@0a2ce58 Refs. open-mpi#6137 Signed-off-by: Gilles Gouaillardet <gilles@rist.or.jp>
Many thanks to Sergey Oblomov for reporting this issue and the countless traces provided when troubleshooting it. Refs. open-mpi#6137 Signed-off-by: Gilles Gouaillardet <gilles@rist.or.jp>
@hoopoepg could you please give this PR a try? |
the same behavior:
for |
:-( Could you post the output of […]? Could you also please apply the patch below and […]?
diff --git a/opal/mca/btl/openib/btl_openib_component.c b/opal/mca/btl/openib/btl_openib_component.c
index b523f9a..07ac335 100644
--- a/opal/mca/btl/openib/btl_openib_component.c
+++ b/opal/mca/btl/openib/btl_openib_component.c
@@ -630,6 +630,11 @@ static int init_one_port(opal_list_t *btl_list, mca_btl_openib_device_t *device,
union ibv_gid gid;
uint64_t subnet_id;
+ opal_output_verbose(1, opal_btl_base_framework.framework_output,
+ "[rank=%d] openib: init_one_port(device=%s, port_num=%d",
+ OPAL_PROC_MY_NAME.vpid,
+ ibv_get_device_name(device->ib_dev),
+ port_num);
/*
* Starting with Open MPI 4.0 we don't support infiniband
* unless the user specifically requested to override this
@@ -641,6 +646,11 @@ static int init_one_port(opal_list_t *btl_list, mca_btl_openib_device_t *device,
&& IBV_LINK_LAYER_INFINIBAND == ib_port_attr->link_layer
#endif
) {
+ opal_output_verbose(1, opal_btl_base_framework.framework_output,
+ "[rank=%d] openib: device=%s, port_num=%d is NOT allowed",
+ OPAL_PROC_MY_NAME.vpid,
+ ibv_get_device_name(device->ib_dev),
+ port_num);
openib_btl = (mca_btl_openib_module_t *) calloc(1, sizeof(mca_btl_openib_module_t));
if(NULL == openib_btl) {
BTL_ERROR(("Failed malloc: %s:%d", __FILE__, __LINE__));
@@ -899,6 +909,11 @@ static int init_one_port(opal_list_t *btl_list, mca_btl_openib_device_t *device,
ERR_VALUE_OF_OUT_OF_BOUNDS; that is reserved
for when we exceed the number of allowable
BTLs). */
+ opal_output_verbose(1, opal_btl_base_framework.framework_output,
+ "[rank=%d] openib: device=%s, port_num=%d is UNREACHABLE",
+ OPAL_PROC_MY_NAME.vpid,
+ ibv_get_device_name(device->ib_dev),
+ port_num);
return OPAL_ERR_UNREACH;
}
}
@@ -914,11 +929,21 @@ static int init_one_port(opal_list_t *btl_list, mca_btl_openib_device_t *device,
if (-1 != mca_btl_openib_component.ib_max_btls &&
mca_btl_openib_component.ib_num_btls >=
mca_btl_openib_component.ib_max_btls) {
+ opal_output_verbose(1, opal_btl_base_framework.framework_output,
+ "[rank=%d] openib: device=%s, port_num=%d is OUT OF BOUNDS",
+ OPAL_PROC_MY_NAME.vpid,
+ ibv_get_device_name(device->ib_dev),
+ port_num);
return OPAL_ERR_VALUE_OUT_OF_BOUNDS;
}
}
}
+ opal_output_verbose(1, opal_btl_base_framework.framework_output,
+ "[rank=%d] openib: device=%s, port_num=%d IS allowed",
+ OPAL_PROC_MY_NAME.vpid,
+ ibv_get_device_name(device->ib_dev),
+ port_num);
return OPAL_SUCCESS;
}
@@ -1924,6 +1949,11 @@ static int init_one_device(opal_list_t *btl_list, struct ibv_device* ib_dev)
for(k = 0; k < port_cnt; k++){
struct ibv_port_attr ib_port_attr;
i = allowed_ports[k];
+ opal_output_verbose(1, opal_btl_base_framework.framework_output,
+ "[rank=%d] openib: gonna initialize device=%s, port_num=%d (%d/%d)",
+ OPAL_PROC_MY_NAME.vpid,
+ ibv_get_device_name(device->ib_dev),
+ i, k, port_cnt);
if(ibv_query_port(device->ib_dev_context, i, &ib_port_attr)){
BTL_ERROR(("error getting port attributes for device %s "
"port number %d errno says %s",
@@ -1972,6 +2002,11 @@ static int init_one_device(opal_list_t *btl_list, struct ibv_device* ib_dev)
free(allowed_ports);
allowed_ports = NULL;
+ opal_output_verbose(1, opal_btl_base_framework.framework_output,
+ "[rank=%d] openib: initialized all ports of device=%s, there are %d/%d allowed btls",
+ OPAL_PROC_MY_NAME.vpid,
+ ibv_get_device_name(device->ib_dev),
+ device->allowed_btls, device->btls);
/* If we made a BTL, check APM status and return. Otherwise, fall
through and destroy everything */
if (device->allowed_btls > 0) { |
dev_info -v:
|
does |
|
it seems yes:
|
but it still crashes on |
what does |
|
@hppritcha I added the blocker label since Open MPI 4.0.1 will be released soon. @hoopoepg reported a regression in […]. I do believe this patch is an improvement over the current status, so it could be merged in […]. That being said, another option is to revert the previous commit, since a regression was reported. If you decide to do nothing with respect to this PR/issue, feel free to remove the blocker and […] |
This PR is now moot, because openib has been removed from master. |
Refs. #6137