Skip to content

Commit 7b22751

Browse files
authored
Merge pull request #7006 from mwheinz/REFS6976-3.0.x
v3.0.x: REF6976 Silent failure of OMPI over OFI with large message sizes
2 parents 2d6e64d + 52e932c commit 7b22751

File tree

4 files changed

+17
-3
lines changed

4 files changed

+17
-3
lines changed

ompi/mca/mtl/ofi/help-mtl-ofi.txt

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -16,3 +16,6 @@ unusual; your job may behave unpredictably (and/or abort) after this.
1616
Local host: %s
1717
Location: %s:%d
1818
Error: %s (%zd)
19+
#
20+
[message too big]
21+
Message size %llu bigger than supported by selected transport. Max = %llu

ompi/mca/mtl/ofi/mtl_ofi.h

Lines changed: 9 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -247,13 +247,20 @@ ompi_mtl_ofi_send_start(struct mca_mtl_base_module_t *mtl,
247247
endpoint = ompi_mtl_ofi_get_endpoint(mtl, ompi_proc);
248248

249249
ompi_ret = ompi_mtl_datatype_pack(convertor, &start, &length, &free_after);
250-
if (OMPI_SUCCESS != ompi_ret) return ompi_ret;
250+
if (OPAL_UNLIKELY(OMPI_SUCCESS != ompi_ret)) {
251+
return ompi_ret;
252+
}
251253

252254
ofi_req->buffer = (free_after) ? start : NULL;
253255
ofi_req->length = length;
254256
ofi_req->status.MPI_ERROR = OMPI_SUCCESS;
255257

256-
if (OPAL_UNLIKELY(MCA_PML_BASE_SEND_SYNCHRONOUS == mode)) {
258+
if (OPAL_UNLIKELY(length > endpoint->mtl_ofi_module->max_msg_size)) {
259+
opal_show_help("help-mtl-ofi.txt",
260+
"message too big", false,
261+
length, endpoint->mtl_ofi_module->max_msg_size);
262+
return OMPI_ERROR;
263+
} else if (OPAL_UNLIKELY(MCA_PML_BASE_SEND_SYNCHRONOUS == mode)) {
257264
ack_req = malloc(sizeof(ompi_mtl_ofi_request_t));
258265
assert(ack_req);
259266
ack_req->parent = ofi_req;

ompi/mca/mtl/ofi/mtl_ofi_component.c

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -466,9 +466,10 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
466466
}
467467

468468
/**
469-
* Save the maximum inject size.
469+
* Save the maximum sizes.
470470
*/
471471
ompi_mtl_ofi.max_inject_size = prov->tx_attr->inject_size;
472+
ompi_mtl_ofi.max_msg_size = prov->ep_attr->max_msg_size;
472473

473474
/**
474475
* Create the objects that will be bound to the endpoint.

ompi/mca/mtl/ofi/mtl_ofi_types.h

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -49,6 +49,9 @@ typedef struct mca_mtl_ofi_module_t {
4949
/** Maximum inject size */
5050
size_t max_inject_size;
5151

52+
/** Largest message that can be sent in a single send. */
53+
size_t max_msg_size;
54+
5255
/** Maximum number of CQ events to read in OFI Progress */
5356
int ofi_progress_event_count;
5457

0 commit comments

Comments
 (0)