Skip to content

Commit 25abbb2

Browse files
authored
Merge pull request #7808 from bwbarrett/backports/v4.1.x-collectives-updates
Backport Collective changes from master to v4.1.x
2 parents 4654149 + 7987a7f commit 25abbb2

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

41 files changed, with 2027 additions (+2027) and 433 deletions (-433).

NEWS

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -63,6 +63,7 @@ included in the vX.Y.Z section and be denoted as:
6363
- OFI/libfabric: Added support for multiple NICs
6464
- OFI/libfabric: Added support for Scalable Endpoints
6565
- OFI/libfabric: Added btl for one-sided support
66+
- libnbc: Added numerous performance-improving algorithms
6667

6768
4.0.4 -- June, 2020
6869
-----------------------

contrib/platform/mellanox/optimized.conf

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -10,6 +10,7 @@
1010
# Copyright (c) 2004-2005 The Regents of the University of California.
1111
# All rights reserved.
1212
# Copyright (c) 2006 Cisco Systems, Inc. All rights reserved.
13+
# Copyright (c) 2019 Mellanox Technologies. All rights reserved.
1314
# $COPYRIGHT$
1415
#
1516
# Additional copyrights may follow
@@ -84,4 +85,8 @@ bml_r2_show_unreach_errors = 0
8485
coll_tuned_alltoall_large_msg = 250000
8586
coll_tuned_alltoall_min_procs = 2048
8687
coll_tuned_alltoall_algorithm_max_requests = 8
88+
coll_tuned_scatter_intermediate_msg = 8192
89+
coll_tuned_scatter_large_msg = 250000
90+
coll_tuned_scatter_min_procs = 1048510
91+
coll_tuned_scatter_algorithm_max_requests = 64
8792

ompi/mca/coll/base/coll_base_allgatherv.c

Lines changed: 0 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -110,9 +110,6 @@ int ompi_coll_base_allgatherv_intra_bruck(const void *sbuf, int scount,
110110
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
111111
"coll:base:allgather_intra_bruck rank %d", rank));
112112

113-
err = ompi_datatype_get_extent (sdtype, &slb, &sext);
114-
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
115-
116113
err = ompi_datatype_get_extent (rdtype, &rlb, &rext);
117114
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
118115

@@ -238,9 +235,6 @@ int ompi_coll_base_allgatherv_intra_ring(const void *sbuf, int scount,
238235
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
239236
"coll:base:allgatherv_intra_ring rank %d", rank));
240237

241-
err = ompi_datatype_get_extent (sdtype, &slb, &sext);
242-
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
243-
244238
err = ompi_datatype_get_extent (rdtype, &rlb, &rext);
245239
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
246240

@@ -381,9 +375,6 @@ ompi_coll_base_allgatherv_intra_neighborexchange(const void *sbuf, int scount,
381375
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
382376
"coll:base:allgatherv_intra_neighborexchange rank %d", rank));
383377

384-
err = ompi_datatype_get_extent (sdtype, &slb, &sext);
385-
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
386-
387378
err = ompi_datatype_get_extent (rdtype, &rlb, &rext);
388379
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
389380

@@ -520,9 +511,6 @@ int ompi_coll_base_allgatherv_intra_two_procs(const void *sbuf, int scount,
520511
return MPI_ERR_UNSUPPORTED_OPERATION;
521512
}
522513

523-
err = ompi_datatype_get_extent (sdtype, &lb, &sext);
524-
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
525-
526514
err = ompi_datatype_get_extent (rdtype, &lb, &rext);
527515
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
528516

ompi/mca/coll/base/coll_base_allreduce.c

Lines changed: 4 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -350,7 +350,7 @@ ompi_coll_base_allreduce_intra_ring(const void *sbuf, void *rbuf, int count,
350350
char *tmpsend = NULL, *tmprecv = NULL, *inbuf[2] = {NULL, NULL};
351351
ptrdiff_t true_lb, true_extent, lb, extent;
352352
ptrdiff_t block_offset, max_real_segsize;
353-
ompi_request_t *reqs[2] = {NULL, NULL};
353+
ompi_request_t *reqs[2] = {MPI_REQUEST_NULL, MPI_REQUEST_NULL};
354354

355355
size = ompi_comm_size(comm);
356356
rank = ompi_comm_rank(comm);
@@ -528,6 +528,7 @@ ompi_coll_base_allreduce_intra_ring(const void *sbuf, void *rbuf, int count,
528528
error_hndl:
529529
OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "%s:%4d\tRank %d Error occurred %d\n",
530530
__FILE__, line, rank, ret));
531+
ompi_coll_base_free_reqs(reqs, 2);
531532
(void)line; // silence compiler warning
532533
if (NULL != inbuf[0]) free(inbuf[0]);
533534
if (NULL != inbuf[1]) free(inbuf[1]);
@@ -627,7 +628,7 @@ ompi_coll_base_allreduce_intra_ring_segmented(const void *sbuf, void *rbuf, int
627628
size_t typelng;
628629
char *tmpsend = NULL, *tmprecv = NULL, *inbuf[2] = {NULL, NULL};
629630
ptrdiff_t block_offset, max_real_segsize;
630-
ompi_request_t *reqs[2] = {NULL, NULL};
631+
ompi_request_t *reqs[2] = {MPI_REQUEST_NULL, MPI_REQUEST_NULL};
631632
ptrdiff_t lb, extent, gap;
632633

633634
size = ompi_comm_size(comm);
@@ -847,6 +848,7 @@ ompi_coll_base_allreduce_intra_ring_segmented(const void *sbuf, void *rbuf, int
847848
error_hndl:
848849
OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "%s:%4d\tRank %d Error occurred %d\n",
849850
__FILE__, line, rank, ret));
851+
ompi_coll_base_free_reqs(reqs, 2);
850852
(void)line; // silence compiler warning
851853
if (NULL != inbuf[0]) free(inbuf[0]);
852854
if (NULL != inbuf[1]) free(inbuf[1]);

ompi/mca/coll/base/coll_base_alltoall.c

Lines changed: 20 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -393,6 +393,7 @@ int ompi_coll_base_alltoall_intra_linear_sync(const void *sbuf, int scount,
393393
if (0 < total_reqs) {
394394
reqs = ompi_coll_base_comm_get_reqs(module->base_data, 2 * total_reqs);
395395
if (NULL == reqs) { error = -1; line = __LINE__; goto error_hndl; }
396+
reqs[0] = reqs[1] = MPI_REQUEST_NULL;
396397
}
397398

398399
prcv = (char *) rbuf;
@@ -468,6 +469,15 @@ int ompi_coll_base_alltoall_intra_linear_sync(const void *sbuf, int scount,
468469
return MPI_SUCCESS;
469470

470471
error_hndl:
472+
/* find a real error code */
473+
if (MPI_ERR_IN_STATUS == error) {
474+
for( ri = 0; ri < nreqs; ri++ ) {
475+
if (MPI_REQUEST_NULL == reqs[ri]) continue;
476+
if (MPI_ERR_PENDING == reqs[ri]->req_status.MPI_ERROR) continue;
477+
error = reqs[ri]->req_status.MPI_ERROR;
478+
break;
479+
}
480+
}
471481
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
472482
"%s:%4d\tError occurred %d, rank %2d", __FILE__, line, error,
473483
rank));
@@ -661,7 +671,16 @@ int ompi_coll_base_alltoall_intra_basic_linear(const void *sbuf, int scount,
661671
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
662672

663673
err_hndl:
664-
if( MPI_SUCCESS != err ) {
674+
if (MPI_SUCCESS != err) {
675+
/* find a real error code */
676+
if (MPI_ERR_IN_STATUS == err) {
677+
for( i = 0; i < nreqs; i++ ) {
678+
if (MPI_REQUEST_NULL == req[i]) continue;
679+
if (MPI_ERR_PENDING == req[i]->req_status.MPI_ERROR) continue;
680+
err = req[i]->req_status.MPI_ERROR;
681+
break;
682+
}
683+
}
665684
OPAL_OUTPUT( (ompi_coll_base_framework.framework_output,"%s:%4d\tError occurred %d, rank %2d",
666685
__FILE__, line, err, rank) );
667686
(void)line; // silence compiler warning

ompi/mca/coll/base/coll_base_alltoallv.c

Lines changed: 10 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -3,7 +3,7 @@
33
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
44
* University Research and Technology
55
* Corporation. All rights reserved.
6-
* Copyright (c) 2004-2016 The University of Tennessee and The University
6+
* Copyright (c) 2004-2017 The University of Tennessee and The University
77
* of Tennessee Research Foundation. All rights
88
* reserved.
99
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@@ -276,6 +276,15 @@ ompi_coll_base_alltoallv_intra_basic_linear(const void *sbuf, const int *scounts
276276
err = ompi_request_wait_all(nreqs, reqs, MPI_STATUSES_IGNORE);
277277

278278
err_hndl:
279+
/* find a real error code */
280+
if (MPI_ERR_IN_STATUS == err) {
281+
for( i = 0; i < nreqs; i++ ) {
282+
if (MPI_REQUEST_NULL == reqs[i]) continue;
283+
if (MPI_ERR_PENDING == reqs[i]->req_status.MPI_ERROR) continue;
284+
err = reqs[i]->req_status.MPI_ERROR;
285+
break;
286+
}
287+
}
279288
/* Free the requests in all cases as they are persistent */
280289
ompi_coll_base_free_reqs(reqs, nreqs);
281290

ompi/mca/coll/base/coll_base_barrier.c

Lines changed: 36 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -3,7 +3,7 @@
33
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
44
* University Research and Technology
55
* Corporation. All rights reserved.
6-
* Copyright (c) 2004-2016 The University of Tennessee and The University
6+
* Copyright (c) 2004-2017 The University of Tennessee and The University
77
* of Tennessee Research Foundation. All rights
88
* reserved.
99
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@@ -102,8 +102,10 @@ int ompi_coll_base_barrier_intra_doublering(struct ompi_communicator_t *comm,
102102
{
103103
int rank, size, err = 0, line = 0, left, right;
104104

105-
rank = ompi_comm_rank(comm);
106105
size = ompi_comm_size(comm);
106+
if( 1 == size )
107+
return OMPI_SUCCESS;
108+
rank = ompi_comm_rank(comm);
107109

108110
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"ompi_coll_base_barrier_intra_doublering rank %d", rank));
109111

@@ -172,8 +174,10 @@ int ompi_coll_base_barrier_intra_recursivedoubling(struct ompi_communicator_t *c
172174
{
173175
int rank, size, adjsize, err, line, mask, remote;
174176

175-
rank = ompi_comm_rank(comm);
176177
size = ompi_comm_size(comm);
178+
if( 1 == size )
179+
return OMPI_SUCCESS;
180+
rank = ompi_comm_rank(comm);
177181
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
178182
"ompi_coll_base_barrier_intra_recursivedoubling rank %d",
179183
rank));
@@ -251,8 +255,10 @@ int ompi_coll_base_barrier_intra_bruck(struct ompi_communicator_t *comm,
251255
{
252256
int rank, size, distance, to, from, err, line = 0;
253257

254-
rank = ompi_comm_rank(comm);
255258
size = ompi_comm_size(comm);
259+
if( 1 == size )
260+
return MPI_SUCCESS;
261+
rank = ompi_comm_rank(comm);
256262
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
257263
"ompi_coll_base_barrier_intra_bruck rank %d", rank));
258264

@@ -285,16 +291,19 @@ int ompi_coll_base_barrier_intra_bruck(struct ompi_communicator_t *comm,
285291
int ompi_coll_base_barrier_intra_two_procs(struct ompi_communicator_t *comm,
286292
mca_coll_base_module_t *module)
287293
{
288-
int remote, err;
294+
int remote, size, err;
295+
296+
size = ompi_comm_size(comm);
297+
if( 1 == size )
298+
return MPI_SUCCESS;
299+
if( 2 != ompi_comm_size(comm) ) {
300+
return MPI_ERR_UNSUPPORTED_OPERATION;
301+
}
289302

290303
remote = ompi_comm_rank(comm);
291304
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
292305
"ompi_coll_base_barrier_intra_two_procs rank %d", remote));
293306

294-
if (2 != ompi_comm_size(comm)) {
295-
return MPI_ERR_UNSUPPORTED_OPERATION;
296-
}
297-
298307
remote = (remote + 1) & 0x1;
299308

300309
err = ompi_coll_base_sendrecv_zero(remote, MCA_COLL_BASE_TAG_BARRIER,
@@ -324,8 +333,10 @@ int ompi_coll_base_barrier_intra_basic_linear(struct ompi_communicator_t *comm,
324333
int i, err, rank, size, line;
325334
ompi_request_t** requests = NULL;
326335

327-
rank = ompi_comm_rank(comm);
328336
size = ompi_comm_size(comm);
337+
if( 1 == size )
338+
return MPI_SUCCESS;
339+
rank = ompi_comm_rank(comm);
329340

330341
/* All non-root send & receive zero-length message. */
331342
if (rank > 0) {
@@ -367,11 +378,21 @@ int ompi_coll_base_barrier_intra_basic_linear(struct ompi_communicator_t *comm,
367378
/* All done */
368379
return MPI_SUCCESS;
369380
err_hndl:
381+
if( NULL != requests ) {
382+
/* find a real error code */
383+
if (MPI_ERR_IN_STATUS == err) {
384+
for( i = 0; i < size; i++ ) {
385+
if (MPI_REQUEST_NULL == requests[i]) continue;
386+
if (MPI_ERR_PENDING == requests[i]->req_status.MPI_ERROR) continue;
387+
err = requests[i]->req_status.MPI_ERROR;
388+
break;
389+
}
390+
}
391+
ompi_coll_base_free_reqs(requests, size);
392+
}
370393
OPAL_OUTPUT( (ompi_coll_base_framework.framework_output,"%s:%4d\tError occurred %d, rank %2d",
371394
__FILE__, line, err, rank) );
372395
(void)line; // silence compiler warning
373-
if( NULL != requests )
374-
ompi_coll_base_free_reqs(requests, size);
375396
return err;
376397
}
377398
/* copied function (with appropriate renaming) ends here */
@@ -385,8 +406,10 @@ int ompi_coll_base_barrier_intra_tree(struct ompi_communicator_t *comm,
385406
{
386407
int rank, size, depth, err, jump, partner;
387408

388-
rank = ompi_comm_rank(comm);
389409
size = ompi_comm_size(comm);
410+
if( 1 == size )
411+
return MPI_SUCCESS;
412+
rank = ompi_comm_rank(comm);
390413
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
391414
"ompi_coll_base_barrier_intra_tree %d",
392415
rank));

ompi/mca/coll/base/coll_base_bcast.c

Lines changed: 34 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -3,7 +3,7 @@
33
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
44
* University Research and Technology
55
* Corporation. All rights reserved.
6-
* Copyright (c) 2004-2016 The University of Tennessee and The University
6+
* Copyright (c) 2004-2017 The University of Tennessee and The University
77
* of Tennessee Research Foundation. All rights
88
* reserved.
99
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@@ -214,13 +214,29 @@ ompi_coll_base_bcast_intra_generic( void* buffer,
214214
return (MPI_SUCCESS);
215215

216216
error_hndl:
217-
OPAL_OUTPUT( (ompi_coll_base_framework.framework_output,"%s:%4d\tError occurred %d, rank %2d",
218-
__FILE__, line, err, rank) );
219-
(void)line; // silence compiler warnings
217+
if (MPI_ERR_IN_STATUS == err) {
218+
for( req_index = 0; req_index < 2; req_index++ ) {
219+
if (MPI_REQUEST_NULL == recv_reqs[req_index]) continue;
220+
if (MPI_ERR_PENDING == recv_reqs[req_index]->req_status.MPI_ERROR) continue;
221+
err = recv_reqs[req_index]->req_status.MPI_ERROR;
222+
break;
223+
}
224+
}
220225
ompi_coll_base_free_reqs( recv_reqs, 2);
221226
if( NULL != send_reqs ) {
227+
if (MPI_ERR_IN_STATUS == err) {
228+
for( req_index = 0; req_index < tree->tree_nextsize; req_index++ ) {
229+
if (MPI_REQUEST_NULL == send_reqs[req_index]) continue;
230+
if (MPI_ERR_PENDING == send_reqs[req_index]->req_status.MPI_ERROR) continue;
231+
err = send_reqs[req_index]->req_status.MPI_ERROR;
232+
break;
233+
}
234+
}
222235
ompi_coll_base_free_reqs(send_reqs, tree->tree_nextsize);
223236
}
237+
OPAL_OUTPUT( (ompi_coll_base_framework.framework_output,"%s:%4d\tError occurred %d, rank %2d",
238+
__FILE__, line, err, rank) );
239+
(void)line; // silence compiler warnings
224240

225241
return err;
226242
}
@@ -630,7 +646,9 @@ ompi_coll_base_bcast_intra_basic_linear(void *buff, int count,
630646

631647
/* Root sends data to all others. */
632648
preq = reqs = ompi_coll_base_comm_get_reqs(module->base_data, size-1);
633-
if( NULL == reqs ) { err = OMPI_ERR_OUT_OF_RESOURCE; goto err_hndl; }
649+
if( NULL == reqs ) {
650+
return OMPI_ERR_OUT_OF_RESOURCE;
651+
}
634652

635653
for (i = 0; i < size; ++i) {
636654
if (i == rank) {
@@ -649,12 +667,21 @@ ompi_coll_base_bcast_intra_basic_linear(void *buff, int count,
649667
* care what the error was -- just that there *was* an error. The
650668
* PML will finish all requests, even if one or more of them fail.
651669
* i.e., by the end of this call, all the requests are free-able.
652-
* So free them anyway -- even if there was an error, and return
653-
* the error after we free everything. */
670+
* So free them anyway -- even if there was an error.
671+
* Note we still need to get the actual error, as collective
672+
* operations cannot return MPI_ERR_IN_STATUS.
673+
*/
654674

655675
err = ompi_request_wait_all(i, reqs, MPI_STATUSES_IGNORE);
656676
err_hndl:
657677
if( MPI_SUCCESS != err ) { /* Free the reqs */
678+
/* first find the real error code */
679+
for( preq = reqs; preq < reqs+i; preq++ ) {
680+
if (MPI_REQUEST_NULL == *preq) continue;
681+
if (MPI_ERR_PENDING == (*preq)->req_status.MPI_ERROR) continue;
682+
err = (*preq)->req_status.MPI_ERROR;
683+
break;
684+
}
658685
ompi_coll_base_free_reqs(reqs, i);
659686
}
660687

ompi/mca/coll/base/coll_base_functions.h

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -18,6 +18,7 @@
1818
* and Technology (RIST). All rights reserved.
1919
* Copyright (c) 2016-2017 IBM Corporation. All rights reserved.
2020
* Copyright (c) 2017 FUJITSU LIMITED. All rights reserved.
21+
* Copyright (c) 2019 Mellanox Technologies. All rights reserved.
2122
* $COPYRIGHT$
2223
*
2324
* Additional copyrights may follow
@@ -291,6 +292,7 @@ int ompi_coll_base_scan_intra_recursivedoubling(SCAN_ARGS);
291292
/* Scatter */
292293
int ompi_coll_base_scatter_intra_basic_linear(SCATTER_ARGS);
293294
int ompi_coll_base_scatter_intra_binomial(SCATTER_ARGS);
295+
int ompi_coll_base_scatter_intra_linear_nb(SCATTER_ARGS, int max_reqs);
294296

295297
/* ScatterV */
296298

ompi/mca/coll/base/coll_base_gather.c

Lines changed: 9 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -326,6 +326,15 @@ ompi_coll_base_gather_intra_linear_sync(const void *sbuf, int scount,
326326
return MPI_SUCCESS;
327327
error_hndl:
328328
if (NULL != reqs) {
329+
/* find a real error code */
330+
if (MPI_ERR_IN_STATUS == ret) {
331+
for( i = 0; i < size; i++ ) {
332+
if (MPI_REQUEST_NULL == reqs[i]) continue;
333+
if (MPI_ERR_PENDING == reqs[i]->req_status.MPI_ERROR) continue;
334+
ret = reqs[i]->req_status.MPI_ERROR;
335+
break;
336+
}
337+
}
329338
ompi_coll_base_free_reqs(reqs, size);
330339
}
331340
OPAL_OUTPUT (( ompi_coll_base_framework.framework_output,

0 commit comments

Comments
 (0)