
Commit 95e3323

sessions: add more support for ucx
Greatly simplify support for MPI_Comm_create_from_group and MPI_Intercomm_create_from_groups by removing the need to support the 128-bit excid notion. Instead, make use of a PMIx capability, PMIX_GROUP_LOCAL_CID, together with the notion of PMIX_GROUP_INFO. This capability was introduced in Open PMIx 4.1.3 and allows us to piggy-back the local cid selected for the new communicator on the PMIx_Group_construct operation. Using this approach, many of the complex active-message-style operations implemented in the OB1 PML to support excids can be avoided.

Infrastructure for debugging communicator-management routines was also introduced, along with a new MCA parameter, mpi_comm_verbose.

Related to #12566

Signed-off-by: Howard Pritchard <howardp@lanl.gov>
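The heart of the change is the piggy-back pattern sketched below. This is a minimal sketch assuming Open PMIx >= 4.1.3; tag, procs/nprocs, and my_local_cid are hypothetical placeholders supplied by the caller, and most error handling is elided. The literal OMPI code appears in the comm_cid.c diff below.

    #include <pmix.h>

    /* Sketch: construct a PMIx group, asking the server to assign a context
     * id (cid_base) and piggy-backing this process' local cid on the group
     * info so that peers can later retrieve it with PMIx_Get. */
    static pmix_status_t construct_with_local_cid(const char *tag,
                                                  pmix_proc_t *procs, size_t nprocs,
                                                  size_t my_local_cid,
                                                  size_t *cid_base)
    {
        void *grpinfo = PMIx_Info_list_start();
        void *list = PMIx_Info_list_start();
        pmix_data_array_t darray;
        pmix_info_t *results = NULL;
        size_t nresults = 0;
        pmix_status_t rc;

        /* ask the server to assign a context id to the new group */
        PMIx_Info_list_add(grpinfo, PMIX_GROUP_ASSIGN_CONTEXT_ID, NULL, PMIX_BOOL);

        /* wrap the local cid in a PMIX_GROUP_INFO directive */
        PMIx_Info_list_add(list, PMIX_GROUP_LOCAL_CID, &my_local_cid, PMIX_SIZE);
        PMIx_Info_list_convert(list, &darray);
        PMIx_Info_list_add(grpinfo, PMIX_GROUP_INFO, &darray, PMIX_DATA_ARRAY);
        PMIX_DATA_ARRAY_DESTRUCT(&darray);

        /* the collective group construct distributes the info to all members */
        PMIx_Info_list_convert(grpinfo, &darray);
        rc = PMIx_Group_construct(tag, procs, nprocs,
                                  (pmix_info_t *)darray.array, darray.size,
                                  &results, &nresults);
        PMIX_DATA_ARRAY_DESTRUCT(&darray);

        /* the assigned context id comes back in the results array */
        for (size_t i = 0; PMIX_SUCCESS == rc && i < nresults; ++i) {
            if (PMIX_CHECK_KEY(&results[i], PMIX_GROUP_CONTEXT_ID)) {
                PMIX_VALUE_GET_NUMBER(rc, &results[i].value, *cid_base, size_t);
                break;
            }
        }

        /* PMIx_Group_destruct(tag, NULL, 0) and freeing of results elided */
        PMIx_Info_list_release(list);
        PMIx_Info_list_release(grpinfo);
        return rc;
    }

A peer's local cid is then a one-sided PMIx_Get of PMIX_GROUP_LOCAL_CID on that peer, qualified by the returned PMIX_GROUP_CONTEXT_ID; see ompi_comm_get_remote_cid_from_pmix in the comm_cid.c diff.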
1 parent: f31a9be

10 files changed: +282 -59 lines changed

ompi/communicator/comm.c

Lines changed: 3 additions & 10 deletions
@@ -24,7 +24,7 @@
  * Copyright (c) 2015      Mellanox Technologies. All rights reserved.
  * Copyright (c) 2017-2022 IBM Corporation. All rights reserved.
  * Copyright (c) 2021      Nanook Consulting. All rights reserved.
- * Copyright (c) 2018-2022 Triad National Security, LLC. All rights
+ * Copyright (c) 2018-2024 Triad National Security, LLC. All rights
  *                         reserved.
  * Copyright (c) 2023      Advanced Micro Devices, Inc. All rights reserved.
  * $COPYRIGHT$

@@ -1738,7 +1738,7 @@ int ompi_intercomm_create_from_groups (ompi_group_t *local_group, int local_lead
                                        ompi_communicator_t **newintercomm)
 {
     ompi_communicator_t *newcomp = NULL, *local_comm, *leader_comm = MPI_COMM_NULL;
-    ompi_comm_extended_cid_block_t new_block;
+    ompi_comm_extended_cid_block_t new_block = {0};
     bool i_am_leader = local_leader == local_group->grp_my_rank;
     ompi_proc_t **rprocs;
     uint64_t data[4];

@@ -1864,14 +1864,7 @@ int ompi_intercomm_create_from_groups (ompi_group_t *local_group, int local_lead
         return rc;
     }

-    /* will be using a communicator ID derived from the bridge communicator to save some time */
-    new_block.block_cid.cid_base = data[1];
-    new_block.block_cid.cid_sub.u64 = data[2];
-    new_block.block_nextsub = 0;
-    new_block.block_nexttag = 0;
-    new_block.block_level = (int8_t) data[3];
-
-    rc = ompi_comm_nextcid (newcomp, NULL, NULL, (void *) tag, &new_block, false, OMPI_COMM_CID_GROUP_NEW);
+    rc = ompi_comm_nextcid (newcomp, NULL, NULL, (void *) tag, NULL, false, OMPI_COMM_CID_GROUP_NEW);
     if ( OMPI_SUCCESS != rc ) {
         OBJ_RELEASE(newcomp);
         return rc;
ompi/communicator/comm_cid.c

Lines changed: 176 additions & 37 deletions
@@ -310,21 +310,16 @@ static int ompi_comm_ext_cid_new_block (ompi_communicator_t *newcomm, ompi_commu
                                         const void *arg0, const void *arg1, bool send_first, int mode,
                                         ompi_request_t **req)
 {
-    pmix_info_t pinfo, *results = NULL;
+    pmix_info_t *pinfo, *results = NULL;
     size_t nresults;
-    opal_process_name_t *name_array = NULL;
-    char *tag = NULL;
-    size_t proc_count;
-    size_t cid_base = 0;
+    opal_process_name_t opal_proc_name;
     bool cid_base_set = false;
+    char *tag = NULL;
+    size_t proc_count = 0, rproc_count = 0, tproc_count = 0, cid_base = 0UL, ninfo;
     int rc, leader_rank;
-    int ret = OMPI_SUCCESS;
-    pmix_proc_t *procs = NULL;
-
-    rc = ompi_group_to_proc_name_array (newcomm->c_local_group, &name_array, &proc_count);
-    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
-        return rc;
-    }
+    pmix_proc_t *procs;
+    void *grpinfo = NULL, *list = NULL;
+    pmix_data_array_t darray;

     switch (mode) {
     case OMPI_COMM_CID_GROUP_NEW:

@@ -341,15 +336,75 @@ static int ompi_comm_ext_cid_new_block (ompi_communicator_t *newcomm, ompi_commu
         break;
     }

-    PMIX_INFO_LOAD(&pinfo, PMIX_GROUP_ASSIGN_CONTEXT_ID, NULL, PMIX_BOOL);
+    grpinfo = PMIx_Info_list_start();
+    if (NULL == grpinfo) {
+        rc = OMPI_ERR_OUT_OF_RESOURCE;
+        goto fn_exit;
+    }
+
+    rc = PMIx_Info_list_add(grpinfo, PMIX_GROUP_ASSIGN_CONTEXT_ID, NULL, PMIX_BOOL);
+    if (PMIX_SUCCESS != rc) {
+        OPAL_OUTPUT_VERBOSE((10, ompi_comm_output, "PMIx_Info_list_add failed %s %d", PMIx_Error_string(rc), __LINE__));
+        rc = OMPI_ERR_OUT_OF_RESOURCE;
+        goto fn_exit;
+    }
+
+    list = PMIx_Info_list_start();
+
+    size_t c_index = (size_t)newcomm->c_index;
+    rc = PMIx_Info_list_add(list, PMIX_GROUP_LOCAL_CID, &c_index, PMIX_SIZE);
+    if (PMIX_SUCCESS != rc) {
+        OPAL_OUTPUT_VERBOSE((10, ompi_comm_output, "PMIx_Info_list_add failed %s %d", PMIx_Error_string(rc), __LINE__));
+        rc = OMPI_ERR_OUT_OF_RESOURCE;
+        goto fn_exit;
+    }
+
+    rc = PMIx_Info_list_convert(list, &darray);
+    if (PMIX_SUCCESS != rc) {
+        OPAL_OUTPUT_VERBOSE((10, ompi_comm_output, "PMIx_Info_list_convert failed %s %d", PMIx_Error_string(rc), __LINE__));
+        rc = OMPI_ERR_OUT_OF_RESOURCE;
+        goto fn_exit;
+    }
+    rc = PMIx_Info_list_add(grpinfo, PMIX_GROUP_INFO, &darray, PMIX_DATA_ARRAY);
+    PMIX_DATA_ARRAY_DESTRUCT(&darray);
+    if (PMIX_SUCCESS != rc) {
+        OPAL_OUTPUT_VERBOSE((10, ompi_comm_output, "PMIx_Info_list_add failed %s %d", PMIx_Error_string(rc), __LINE__));
+        rc = OMPI_ERR_OUT_OF_RESOURCE;
+        goto fn_exit;
+    }
+
+    rc = PMIx_Info_list_convert(grpinfo, &darray);
+    if (PMIX_SUCCESS != rc) {
+        OPAL_OUTPUT_VERBOSE((10, ompi_comm_output, "PMIx_Info_list_convert failed %s %d", PMIx_Error_string(rc), __LINE__));
+        rc = OMPI_ERR_OUT_OF_RESOURCE;
+        goto fn_exit;
+    }
+
+    pinfo = (pmix_info_t*)darray.array;
+    ninfo = darray.size;
+
+    proc_count = newcomm->c_local_group->grp_proc_count;
+    if ( OMPI_COMM_IS_INTER (newcomm) ){
+        rproc_count = newcomm->c_remote_group->grp_proc_count;
+    }
+
+    PMIX_PROC_CREATE(procs, proc_count + rproc_count);

-    PMIX_PROC_CREATE(procs, proc_count);
     for (size_t i = 0 ; i < proc_count; ++i) {
-        OPAL_PMIX_CONVERT_NAME(&procs[i],&name_array[i]);
+        opal_proc_name = ompi_group_get_proc_name(newcomm->c_local_group, i);
+        OPAL_PMIX_CONVERT_NAME(&procs[i],&opal_proc_name);
+    }
+    for (size_t i = 0; i < rproc_count; ++i) {
+        opal_proc_name = ompi_group_get_proc_name(newcomm->c_remote_group, i);
+        OPAL_PMIX_CONVERT_NAME(&procs[proc_count+i],&opal_proc_name);
     }

-    rc = PMIx_Group_construct(tag, procs, proc_count, &pinfo, 1, &results, &nresults);
-    PMIX_INFO_DESTRUCT(&pinfo);
+    tproc_count = proc_count + rproc_count;
+
+    OPAL_OUTPUT_VERBOSE((10, ompi_comm_output, "calling PMIx_Group_construct - tag %s size %ld ninfo %ld cid_base %ld\n",
+                        tag, tproc_count, ninfo, cid_base));
+    rc = PMIx_Group_construct(tag, procs, tproc_count, pinfo, ninfo, &results, &nresults);
+    PMIX_DATA_ARRAY_DESTRUCT(&darray);
     if(PMIX_SUCCESS != rc) {
         char msg_string[1024];
         switch (rc) {

@@ -361,7 +416,7 @@ static int ompi_comm_ext_cid_new_block (ompi_communicator_t *newcomm, ompi_commu
                            "MPI_Comm_create_from_group/MPI_Intercomm_create_from_groups",
                            msg_string);

-            ret = MPI_ERR_UNSUPPORTED_OPERATION;
+            rc = MPI_ERR_UNSUPPORTED_OPERATION;
             break;
         case PMIX_ERR_NOT_SUPPORTED:
             sprintf(msg_string,"PMIx server does not support PMIx Group operations");

@@ -370,10 +425,10 @@ static int ompi_comm_ext_cid_new_block (ompi_communicator_t *newcomm, ompi_commu
                            true,
                            "MPI_Comm_create_from_group/MPI_Intercomm_create_from_groups",
                            msg_string);
-            ret = MPI_ERR_UNSUPPORTED_OPERATION;
+            rc = MPI_ERR_UNSUPPORTED_OPERATION;
             break;
         default:
-            ret = opal_pmix_convert_status(rc);
+            rc = opal_pmix_convert_status(rc);
             break;
         }
         goto fn_exit;

@@ -383,23 +438,28 @@ static int ompi_comm_ext_cid_new_block (ompi_communicator_t *newcomm, ompi_commu
         if (PMIX_CHECK_KEY(&results[i], PMIX_GROUP_CONTEXT_ID)) {
             PMIX_VALUE_GET_NUMBER(rc, &results[i].value, cid_base, size_t);
             if(PMIX_SUCCESS != rc) {
-                ret = opal_pmix_convert_status(rc);
+                rc = opal_pmix_convert_status(rc);
                 goto fn_exit;
             }
             cid_base_set = true;
             break;
         }
     }

+    OPAL_OUTPUT_VERBOSE((10, ompi_comm_output, "PMIx_Group_construct - tag %s size %ld ninfo %ld cid_base %ld\n",
+                        tag, tproc_count, ninfo, cid_base));
+
+    /* destruct the group */
     rc = PMIx_Group_destruct (tag, NULL, 0);
     if(PMIX_SUCCESS != rc) {
-        ret = opal_pmix_convert_status(rc);
+        OPAL_OUTPUT_VERBOSE((10, ompi_comm_output, "PMIx_Group_destruct failed %s", PMIx_Error_string(rc)));
+        rc = opal_pmix_convert_status(rc);
         goto fn_exit;
     }

     if (!cid_base_set) {
         opal_show_help("help-comm.txt", "cid-base-not-set", true);
-        ret = OMPI_ERROR;
+        rc = OMPI_ERROR;
         goto fn_exit;
     }

@@ -412,16 +472,19 @@ static int ompi_comm_ext_cid_new_block (ompi_communicator_t *newcomm, ompi_commu
     }

     if(NULL != procs) {
-        PMIX_PROC_FREE(procs, proc_count);
+        PMIX_PROC_FREE(procs, tproc_count);
         procs = NULL;
     }

-    if(NULL != name_array) {
-        free (name_array);
-        name_array = NULL;
+    if (NULL != grpinfo) {
+        PMIx_Info_list_release(grpinfo);
     }

-    return ret;
+    if (NULL != list) {
+        PMIx_Info_list_release(list);
+    }
+
+    return rc;
 }

 static int ompi_comm_nextcid_ext_nb (ompi_communicator_t *newcomm, ompi_communicator_t *comm,

@@ -446,6 +509,15 @@ static int ompi_comm_nextcid_ext_nb (ompi_communicator_t *newcomm, ompi_communic
         block = &comm->c_contextidb;
     }

+    for (unsigned int i = ompi_mpi_communicators.lowest_free ; i < mca_pml.pml_max_contextid ; ++i) {
+        bool flag = opal_pointer_array_test_and_set_item (&ompi_mpi_communicators, i, newcomm);
+        if (true == flag) {
+            newcomm->c_index = i;
+            break;
+        }
+    }
+    assert(newcomm->c_index > 2);
+
     if (NULL == arg1) {
         if (OMPI_COMM_CID_GROUP == mode || OMPI_COMM_CID_GROUP_NEW == mode ||
             !ompi_comm_extended_cid_block_available (&comm->c_contextidb)) {

@@ -468,14 +540,6 @@ static int ompi_comm_nextcid_ext_nb (ompi_communicator_t *newcomm, ompi_communic
         (void) ompi_comm_extended_cid_block_new (block, &newcomm->c_contextidb, is_new_block);
     }

-    for (unsigned int i = ompi_mpi_communicators.lowest_free ; i < mca_pml.pml_max_contextid ; ++i) {
-        bool flag = opal_pointer_array_test_and_set_item (&ompi_mpi_communicators, i, newcomm);
-        if (true == flag) {
-            newcomm->c_index = i;
-            break;
-        }
-    }
-
     newcomm->c_contextid = newcomm->c_contextidb.block_cid;

     opal_hash_table_set_value_ptr (&ompi_comm_hash, &newcomm->c_contextid,

@@ -502,7 +566,7 @@ int ompi_comm_nextcid_nb (ompi_communicator_t *newcomm, ompi_communicator_t *com
        functions but the pml does not support these functions so return not supported */
     if (NULL == comm) {
         char msg_string[1024];
-        sprintf(msg_string,"The PML being used - %s - does not support MPI sessions related features",
+        sprintf(msg_string,"The PML being used - %s - does not support MPI sessions related features",
                 mca_pml_base_selected_component.pmlm_version.mca_component_name);
         opal_show_help("help-comm.txt",
                        "MPI function not supported",

@@ -886,6 +950,7 @@ int ompi_comm_activate_nb (ompi_communicator_t **newcomm, ompi_communicator_t *c
     ompi_comm_cid_context_t *context;
     ompi_comm_request_t *request;
     ompi_request_t *subreq;
+    uint32_t comm_size;
     int ret = 0;

     /* the caller should not pass NULL for comm (it may be the same as *newcomm) */

@@ -907,6 +972,25 @@ int ompi_comm_activate_nb (ompi_communicator_t **newcomm, ompi_communicator_t *c

     request->context = &context->super;

+    /* Prep communicator for handling remote cids if needed */
+
+    if (!OMPI_COMM_IS_GLOBAL_INDEX(*newcomm)) {
+        if (OMPI_COMM_IS_INTER(*newcomm)) {
+            comm_size = ompi_comm_remote_size(*newcomm);
+        } else {
+            comm_size = ompi_comm_size(*newcomm);
+        }
+
+        (*newcomm)->c_index_vec = (uint32_t *)calloc(comm_size, sizeof(uint32_t));
+        if (NULL == (*newcomm)->c_index_vec) {
+            return OMPI_ERR_OUT_OF_RESOURCE;
+        }
+
+        if (OMPI_COMM_IS_INTRA(*newcomm)) {
+            (*newcomm)->c_index_vec[(*newcomm)->c_my_rank] = (*newcomm)->c_index;
+        }
+    }
+
     if (MPI_UNDEFINED != (*newcomm)->c_local_group->grp_my_rank) {
         /* Initialize the PML stuff in the newcomm */
         if ( OMPI_SUCCESS != (ret = MCA_PML_CALL(add_comm(*newcomm))) ) {

@@ -963,6 +1047,61 @@ int ompi_comm_activate (ompi_communicator_t **newcomm, ompi_communicator_t *comm
     return rc;
 }

+int ompi_comm_get_remote_cid_from_pmix (ompi_communicator_t *comm, int dest, uint32_t *remote_cid)
+{
+    ompi_proc_t *ompi_proc;
+    pmix_proc_t pmix_proc;
+    pmix_info_t tinfo[2];
+    pmix_value_t *val = NULL;
+    ompi_comm_extended_cid_t excid;
+    int rc = OMPI_SUCCESS;
+    size_t remote_cid64;
+
+    assert(NULL != remote_cid);
+
+    ompi_proc = ompi_comm_peer_lookup(comm, dest);
+    OPAL_PMIX_CONVERT_NAME(&pmix_proc, &ompi_proc->super.proc_name);
+
+    PMIx_Info_construct(&tinfo[0]);
+    PMIX_INFO_LOAD(&tinfo[0], PMIX_TIMEOUT, &ompi_pmix_connect_timeout, PMIX_UINT32);
+
+    excid = ompi_comm_get_extended_cid(comm);
+
+    PMIX_INFO_CONSTRUCT(&tinfo[1]);
+    PMIX_INFO_LOAD(&tinfo[1], PMIX_GROUP_CONTEXT_ID, &excid.cid_base, PMIX_SIZE);
+    PMIX_INFO_SET_QUALIFIER(&tinfo[1]);
+    if (PMIX_SUCCESS != (rc = PMIx_Get(&pmix_proc, PMIX_GROUP_LOCAL_CID, tinfo, 2, &val))) {
+        OPAL_OUTPUT_VERBOSE((10, ompi_comm_output, "PMIx_Get failed for PMIX_GROUP_LOCAL_CID cid_base %ld %s", excid.cid_base, PMIx_Error_string(rc)));
+        rc = OMPI_ERR_NOT_FOUND;
+        goto done;
+    }
+
+    if (NULL == val) {
+        OPAL_OUTPUT_VERBOSE((10, ompi_comm_output, "PMIx_Get failed for PMIX_GROUP_LOCAL_CID val returned NULL"));
+        rc = OMPI_ERR_NOT_FOUND;
+        goto done;
+    }
+
+    if (val->type != PMIX_SIZE) {
+        OPAL_OUTPUT_VERBOSE((10, ompi_comm_output, "PMIx_Get failed for PMIX_GROUP_LOCAL_CID type mismatch"));
+        rc = OMPI_ERR_TYPE_MISMATCH;
+        goto done;
+    }
+
+    PMIX_VALUE_GET_NUMBER(rc, val, remote_cid64, size_t);
+    rc = OMPI_SUCCESS;
+    *remote_cid = (uint32_t)remote_cid64;
+    comm->c_index_vec[dest] = (uint32_t)remote_cid64;
+    OPAL_OUTPUT_VERBOSE((10, ompi_comm_output, "PMIx_Get PMIX_GROUP_LOCAL_CID %d for cid_base %ld", *remote_cid, excid.cid_base));
+
+done:
+    if (NULL != val) {
+        PMIX_VALUE_RELEASE(val);
+    }
+
+    return rc;
+}
+
 static int ompi_comm_activate_nb_complete (ompi_comm_request_t *request)
 {
     ompi_comm_cid_context_t *context = (ompi_comm_cid_context_t *) request->context;
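The lookup above is intended to be lazy and cached: c_index_vec is zero-initialized by the calloc() in ompi_comm_activate_nb, and each entry is filled in on first use. A hypothetical caller sketch (get_remote_cid is illustrative, not part of this commit) for a PML that needs the receiver's communicator index for its match header:

    /* Hypothetical caller sketch: resolve a peer's communicator index on
     * first use, caching it in c_index_vec as the function above does.
     * Assumes a 0 entry means "not yet fetched", consistent with the
     * calloc() initialization in ompi_comm_activate_nb. */
    static inline int get_remote_cid(ompi_communicator_t *comm, int dest,
                                     uint32_t *remote_cid)
    {
        if (0 != comm->c_index_vec[dest]) {
            *remote_cid = comm->c_index_vec[dest];  /* cached by an earlier lookup */
            return OMPI_SUCCESS;
        }
        /* first message to this peer: ask PMIx for the cid it registered */
        return ompi_comm_get_remote_cid_from_pmix(comm, dest, remote_cid);
    }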
