From 8fedcc0a7cc7a24231587c6d4ca39a5bf72d4d05 Mon Sep 17 00:00:00 2001 From: Brian Barrett Date: Wed, 9 Feb 2022 17:33:42 +0000 Subject: [PATCH 01/13] btl/am-rdma: Pass the correct pointer to get The put over get implementation passed the wrong pointer to the cbdata of the get call, resulting in all kinds of badness when the get completed. This patch passes through the expected pointer. Signed-off-by: Brian Barrett --- opal/mca/btl/base/btl_base_am_rdma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/opal/mca/btl/base/btl_base_am_rdma.c b/opal/mca/btl/base/btl_base_am_rdma.c index 2b1e3400195..ae9683936e6 100644 --- a/opal/mca/btl/base/btl_base_am_rdma.c +++ b/opal/mca/btl/base/btl_base_am_rdma.c @@ -701,7 +701,7 @@ static int am_rdma_target_put(mca_btl_base_module_t *btl, (struct mca_btl_base_registration_handle_t *) (*operation)->local_handle_data, (struct mca_btl_base_registration_handle_t *) (*operation)->remote_handle_data, hdr->data.rdma.size, /*flags=*/0, MCA_BTL_NO_ORDER, am_rdma_rdma_complete, - operation, NULL); + *operation, NULL); if (OPAL_SUCCESS != ret) { OBJ_RELEASE(*operation); } From b6b16a6cb769306d763859236e232d8a7e0e9fce Mon Sep 17 00:00:00 2001 From: Brian Barrett Date: Fri, 7 Jan 2022 17:35:56 +0000 Subject: [PATCH 02/13] osc/rdma: Clean up initialization debugging Clean up language around btl selection phases to match rest of the code (accelerated/alternate). Also switch component initialization code to use opal_output_verbose rather than OSC_RDMA_VERBOSE, so that the debugging prints can be enabled on a non-debug build. Signed-off-by: Brian Barrett --- ompi/mca/osc/rdma/osc_rdma_component.c | 72 ++++++++++++++++---------- 1 file changed, 46 insertions(+), 26 deletions(-) diff --git a/ompi/mca/osc/rdma/osc_rdma_component.c b/ompi/mca/osc/rdma/osc_rdma_component.c index cdc71ad3056..31b47cb578f 100644 --- a/ompi/mca/osc/rdma/osc_rdma_component.c +++ b/ompi/mca/osc/rdma/osc_rdma_component.c @@ -442,7 +442,8 @@ static int allocate_state_single (ompi_osc_rdma_module_t *module, void **base, s int ret, my_rank; size_t memory_alignment = module->memory_alignment; - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "allocating private internal state"); + opal_output_verbose(MCA_BASE_VERBOSE_TRACE, ompi_osc_base_framework.framework_output, + "allocating private internal state"); my_rank = ompi_comm_rank (module->comm); @@ -598,7 +599,8 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s return allocate_state_single (module, base, size); } - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "allocating shared internal state"); + opal_output_verbose(MCA_BASE_VERBOSE_TRACE, ompi_osc_base_framework.framework_output, + "allocating shared internal state"); local_rank_array_size = sizeof (ompi_osc_rdma_rank_data_t) * RANK_ARRAY_COUNT (module); leader_peer_data_size = module->region_size * module->node_count; @@ -654,7 +656,8 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s ret = opal_shmem_segment_create (&module->seg_ds, data_file, total_size); free (data_file); if (OPAL_SUCCESS != ret) { - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_ERROR, "failed to create shared memory segment"); + opal_output_verbose(MCA_BASE_VERBOSE_ERROR, ompi_osc_base_framework.framework_output, + "failed to create shared memory segment"); } } } @@ -672,7 +675,8 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s module->segment_base = opal_shmem_segment_attach (&module->seg_ds); if (NULL == module->segment_base) { - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_ERROR, "failed to attach to the shared memory segment"); + opal_output_verbose(MCA_BASE_VERBOSE_ERROR, ompi_osc_base_framework.framework_output, + "failed to attach to the shared memory segment"); ret = OPAL_ERROR; } @@ -898,7 +902,8 @@ static int ompi_osc_rdma_query_alternate_btls (ompi_communicator_t *comm, ompi_o btls_to_use = opal_argv_split (ompi_osc_rdma_btl_alternate_names, ','); if (NULL == btls_to_use) { - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "no alternate BTLs requested: %s", ompi_osc_rdma_btl_alternate_names); + opal_output_verbose(MCA_BASE_VERBOSE_INFO, ompi_osc_base_framework.framework_output, + "no alternate BTLs requested: %s", ompi_osc_rdma_btl_alternate_names); return OMPI_ERR_UNREACH; } @@ -908,20 +913,23 @@ static int ompi_osc_rdma_query_alternate_btls (ompi_communicator_t *comm, ompi_o /* rdma and atomics are only supported with BTLs at the moment */ for (int i = 0 ; btls_to_use[i] ; ++i) { - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "checking for btl %s", btls_to_use[i]); + opal_output_verbose(MCA_BASE_VERBOSE_INFO, "checking for btl %s", btls_to_use[i]); OPAL_LIST_FOREACH(item, &mca_btl_base_modules_initialized, mca_btl_base_selected_module_t) { if (NULL != item->btl_module->btl_register_mem) { - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "skipping RDMA btl when searching for alternate BTL"); + opal_output_verbose(MCA_BASE_VERBOSE_INFO, ompi_osc_base_framework.framework_output, + "skipping RDMA btl when searching for alternate BTL"); continue; } if (0 != strcmp (btls_to_use[i], item->btl_module->btl_component->btl_version.mca_component_name)) { - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "skipping btl %s", - item->btl_module->btl_component->btl_version.mca_component_name); + opal_output_verbose(MCA_BASE_VERBOSE_INFO, ompi_osc_base_framework.framework_output, + "skipping btl %s", + item->btl_module->btl_component->btl_version.mca_component_name); continue; } - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "found alternate btl %s", btls_to_use[i]); + opal_output_verbose(MCA_BASE_VERBOSE_INFO, ompi_osc_base_framework.framework_output, + "found alternate btl %s", btls_to_use[i]); ++btls_found; if (module) { @@ -1089,7 +1097,8 @@ static int ompi_osc_rdma_query_accelerated_btls (ompi_communicator_t *comm, ompi } if (NULL == selected_btl) { - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "no suitable btls found"); + opal_output_verbose(MCA_BASE_VERBOSE_INFO, ompi_osc_base_framework.framework_output, + "accelerated_query: no suitable btls found"); return OMPI_ERR_NOT_AVAILABLE; } @@ -1100,8 +1109,9 @@ static int ompi_osc_rdma_query_accelerated_btls (ompi_communicator_t *comm, ompi module->use_memory_registration = selected_btl->btl_register_mem != NULL; } - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "selected btl: %s", - selected_btl->btl_component->btl_version.mca_component_name); + opal_output_verbose(MCA_BASE_VERBOSE_INFO, ompi_osc_base_framework.framework_output, + "accelerated_query: selected btl: %s", + selected_btl->btl_component->btl_version.mca_component_name); return OMPI_SUCCESS; } @@ -1150,7 +1160,8 @@ static int ompi_osc_rdma_share_data (ompi_osc_rdma_module_t *module) module->region_size, MPI_BYTE, module->local_leaders, module->local_leaders->c_coll->coll_allgather_module); if (OMPI_SUCCESS != ret) { - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_ERROR, "leader allgather failed with ompi error code %d", ret); + opal_output_verbose(MCA_BASE_VERBOSE_ERROR, ompi_osc_base_framework.framework_output, + "leader allgather failed with ompi error code %d", ret); break; } } @@ -1193,7 +1204,8 @@ static int ompi_osc_rdma_create_groups (ompi_osc_rdma_module_t *module) /* create a shared communicator to handle communication about the local segment */ ret = ompi_comm_split_type (module->comm, MPI_COMM_TYPE_SHARED, 0, NULL, &module->shared_comm); if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_ERROR, "failed to create a shared memory communicator. error code %d", ret); + opal_output_verbose(MCA_BASE_VERBOSE_ERROR, ompi_osc_base_framework.framework_output, + "failed to create a shared memory communicator. error code %d", ret); return ret; } @@ -1204,7 +1216,8 @@ static int ompi_osc_rdma_create_groups (ompi_osc_rdma_module_t *module) ret = ompi_comm_split (module->comm, (0 == local_rank) ? 0 : MPI_UNDEFINED, comm_rank, &module->local_leaders, false); if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_ERROR, "failed to create local leaders communicator. error code %d", ret); + opal_output_verbose(MCA_BASE_VERBOSE_ERROR, ompi_osc_base_framework.framework_output, + "failed to create local leaders communicator. error code %d", ret); return ret; } @@ -1217,7 +1230,8 @@ static int ompi_osc_rdma_create_groups (ompi_osc_rdma_module_t *module) ret = module->shared_comm->c_coll->coll_bcast (values, 2, MPI_INT, 0, module->shared_comm, module->shared_comm->c_coll->coll_bcast_module); if (OMPI_SUCCESS != ret) { - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_ERROR, "failed to broadcast local data. error code %d", ret); + opal_output_verbose(MCA_BASE_VERBOSE_ERROR, ompi_osc_base_framework.framework_output, + "failed to broadcast local data. error code %d", ret); return ret; } } @@ -1350,8 +1364,9 @@ static int ompi_osc_rdma_component_select (struct ompi_win_t *win, void **base, return ret; } - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "creating osc/rdma window of flavor %d with id %s", - flavor, ompi_comm_print_cid (module->comm)); + opal_output_verbose(MCA_BASE_VERBOSE_INFO, ompi_osc_base_framework.framework_output, + "creating osc/rdma window of flavor %d with id %s", + flavor, ompi_comm_print_cid (module->comm)); /* peer data */ if (world_size > init_limit) { @@ -1372,11 +1387,13 @@ static int ompi_osc_rdma_component_select (struct ompi_win_t *win, void **base, /* find rdma capable endpoints */ ret = ompi_osc_rdma_query_accelerated_btls (module->comm, module); if (OMPI_SUCCESS != ret) { - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_WARN, "could not find a suitable btl. falling back on " - "active-message BTLs"); + opal_output_verbose(MCA_BASE_VERBOSE_WARN, ompi_osc_base_framework.framework_output, + "could not find an accelerated btl. falling back on " + "active-message BTLs"); ret = ompi_osc_rdma_query_alternate_btls (module->comm, module); if (OMPI_SUCCESS != ret) { - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_WARN, "no BTL available for RMA window"); + opal_output_verbose(MCA_BASE_VERBOSE_WARN, ompi_osc_base_framework.framework_output, + "no BTL available for RMA window"); ompi_osc_rdma_free (win); return ret; } @@ -1428,7 +1445,8 @@ static int ompi_osc_rdma_component_select (struct ompi_win_t *win, void **base, /* notify all others if something went wrong */ ret = synchronize_errorcode(ret, module->comm); if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_ERROR, "failed to allocate internal state"); + opal_output_verbose(MCA_BASE_VERBOSE_ERROR, ompi_osc_base_framework.framework_output, + "failed to allocate internal state"); ompi_osc_rdma_free (win); return ret; } @@ -1479,14 +1497,16 @@ static int ompi_osc_rdma_component_select (struct ompi_win_t *win, void **base, ret = ompi_osc_rdma_share_data (module); if (OMPI_SUCCESS != ret) { - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_ERROR, "failed to share window data with peers"); + opal_output_verbose(MCA_BASE_VERBOSE_ERROR, ompi_osc_base_framework.framework_output, + "failed to share window data with peers"); ompi_osc_rdma_free (win); } else { /* for now the leader is always rank 0 in the communicator */ module->leader = ompi_osc_rdma_module_peer (module, 0); - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "finished creating osc/rdma window with id %s", - ompi_comm_print_cid(module->comm)); + opal_output_verbose(MCA_BASE_VERBOSE_INFO, ompi_osc_base_framework.framework_output, + "finished creating osc/rdma window with id %s", + ompi_comm_print_cid(module->comm)); } return ret; From 61aca9e2de7d35c9016f1dc2cf1750c7ab2b559f Mon Sep 17 00:00:00 2001 From: Brian Barrett Date: Tue, 4 Jan 2022 17:06:54 +0000 Subject: [PATCH 03/13] osc/rdma: Load all btls as alternate btls See lengthy comment in the change, but this patch removes the ability of users to specify a subset of available btls for use by the osc rdma component. The BTL interface was never designed for such usage (which is why there is no similar option for the OB1 PML) and it had clear places where it broke, so remove it. Signed-off-by: Brian Barrett --- ompi/mca/osc/rdma/osc_rdma_component.c | 129 ++++++++++++++----------- 1 file changed, 71 insertions(+), 58 deletions(-) diff --git a/ompi/mca/osc/rdma/osc_rdma_component.c b/ompi/mca/osc/rdma/osc_rdma_component.c index 31b47cb578f..55306f8af86 100644 --- a/ompi/mca/osc/rdma/osc_rdma_component.c +++ b/ompi/mca/osc/rdma/osc_rdma_component.c @@ -35,6 +35,7 @@ #include "ompi_config.h" #include +#include #include "osc_rdma.h" #include "osc_rdma_frag.h" @@ -84,7 +85,6 @@ static int ompi_osc_rdma_query_alternate_btls (ompi_communicator_t *comm, ompi_o static const char* ompi_osc_rdma_set_no_lock_info(opal_infosubscriber_t *obj, const char *key, const char *value); static char *ompi_osc_rdma_full_connectivity_btls; -static char *ompi_osc_rdma_btl_alternate_names; static const mca_base_var_enum_value_t ompi_osc_rdma_locking_modes[] = { {.value = OMPI_OSC_RDMA_LOCKING_TWO_LEVEL, .string = "two_level"}, @@ -257,14 +257,6 @@ static int ompi_osc_rdma_component_register (void) MCA_BASE_VAR_SCOPE_GROUP, &ompi_osc_rdma_full_connectivity_btls); free(description_str); - ompi_osc_rdma_btl_alternate_names = "sm,tcp"; - opal_asprintf(&description_str, "Comma-delimited list of alternate BTL component names to allow without verifying " - "connectivity (default: %s)", ompi_osc_rdma_btl_alternate_names); - (void) mca_base_component_var_register (&mca_osc_rdma_component.super.osc_version, "alternate_btls", description_str, - MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, OPAL_INFO_LVL_3, - MCA_BASE_VAR_SCOPE_GROUP, &ompi_osc_rdma_btl_alternate_names); - free(description_str); - if (0 == access ("/dev/shm", W_OK)) { mca_osc_rdma_component.backing_directory = "/dev/shm"; } else { @@ -875,76 +867,97 @@ static void ompi_osc_rdma_ensure_local_add_procs (void) free(procs); } + +/* + * qsort() sorting function for ompi_osc_rdma_query_alternate_btls(), + * using latency as the sorting metric. + */ +static int btl_latency_sort_fn(const void *a, const void *b) +{ + const struct mca_btl_base_module_t *btl_a = a; + const struct mca_btl_base_module_t *btl_b = b; + + if (btl_a->btl_latency < btl_b->btl_latency) { + return -1; + } else if (btl_a->btl_latency == btl_b->btl_latency) { + return 0; + } else { + return 1; + } +} + + /** * @brief query for alternate BTLs * * @in comm Communicator to query - * @out module OSC module to store BTLs/count to (optional) - * @out + * @inout module OSC module to store BTLs/count to (optional) * * @return OMPI_SUCCESS if BTLs can be found * @return OMPI_ERR_UNREACH if no BTLs can be found that match * - * In this case an "alternate" BTL is a BTL does not meet the - * requirements of a BTL outlined in ompi_osc_rdma_query_accelerated_btls(). - * Either it does not provide connectivity to all peers, provide - * remote completion, or natively support put/get/atomic.. Since more - * than one BTL may be needed for this support the OSC component will - * disable the use of registration-based RDMA (these BTLs will not be - * used) and will use any remaining BTL. By default the BTLs used will - * be tcp and sm but any single (or pair) of BTLs may be used. + * We directly use the active message rdma wrappers for alternate + * BTLs, in all cases. This greatly simplifies the alternate BTL + * impementation, at the expense of some performance. With the + * AM wrappers, we can always enforce remote completion and the lack + * of memory registration, at some performance cost. But we can use + * as many BTLs as we like. The module's btl list is sorted by + * latency, so that ompi_osc_rdma_peer_btl_endpoint() picks the lowest + * available latency btl to communicate with the peer. Unlike the OB1 + * PML, we only use one BTL per peer. + * + * Like the OB1 PML, there is no verification that there is at least + * one BTL that can communicate with every other peer in the window. */ static int ompi_osc_rdma_query_alternate_btls (ompi_communicator_t *comm, ompi_osc_rdma_module_t *module) { mca_btl_base_selected_module_t *item; - char **btls_to_use = opal_argv_split (ompi_osc_rdma_btl_alternate_names, ','); - int btls_found = 0; - - btls_to_use = opal_argv_split (ompi_osc_rdma_btl_alternate_names, ','); - if (NULL == btls_to_use) { - opal_output_verbose(MCA_BASE_VERBOSE_INFO, ompi_osc_base_framework.framework_output, - "no alternate BTLs requested: %s", ompi_osc_rdma_btl_alternate_names); - return OMPI_ERR_UNREACH; - } + int ret; - if (module) { - module->btls_in_use = 0; + /* shortcut the trivial query case */ + if (NULL == module) { + if (opal_list_is_empty(&mca_btl_base_modules_initialized)) { + return OMPI_ERR_UNREACH; + } + return OMPI_SUCCESS; } - /* rdma and atomics are only supported with BTLs at the moment */ - for (int i = 0 ; btls_to_use[i] ; ++i) { - opal_output_verbose(MCA_BASE_VERBOSE_INFO, "checking for btl %s", btls_to_use[i]); - OPAL_LIST_FOREACH(item, &mca_btl_base_modules_initialized, mca_btl_base_selected_module_t) { - if (NULL != item->btl_module->btl_register_mem) { - opal_output_verbose(MCA_BASE_VERBOSE_INFO, ompi_osc_base_framework.framework_output, - "skipping RDMA btl when searching for alternate BTL"); - continue; - } - - if (0 != strcmp (btls_to_use[i], item->btl_module->btl_component->btl_version.mca_component_name)) { - opal_output_verbose(MCA_BASE_VERBOSE_INFO, ompi_osc_base_framework.framework_output, - "skipping btl %s", - item->btl_module->btl_component->btl_version.mca_component_name); - continue; - } - - opal_output_verbose(MCA_BASE_VERBOSE_INFO, ompi_osc_base_framework.framework_output, - "found alternate btl %s", btls_to_use[i]); - - ++btls_found; - if (module) { - mca_btl_base_am_rdma_init(item->btl_module); - ompi_osc_rdma_selected_btl_insert(module, item->btl_module, module->btls_in_use++); - } - + module->btls_in_use = 0; + + /* add all alternate btls to the selected_btls list, not worrying + about ordering yet. We have to add all btls unless we want to + iterate over all endpoints to build the minimum set of btls + needed to communicate with all peers. An MCA parameter just + for osc rdma also wouldn't work, as the BML can decide not to + add an endpoint for a btl given the priority of another btl. + For example, it is not uncommon that the only endpoint created + to a peer on the same host is the sm btl's endpoint. If we + had an osc rdma specific parameter list, and the user + specified a combination not including sm, that would result in + an eventual failure, as no btl would be found to talk to ranks + on the same host.*/ + OPAL_LIST_FOREACH(item, &mca_btl_base_modules_initialized, mca_btl_base_selected_module_t) { + opal_output_verbose(MCA_BASE_VERBOSE_INFO, ompi_osc_base_framework.framework_output, + "found alternate btl %s", + item->btl_module->btl_component->btl_version.mca_component_name); + ret = mca_btl_base_am_rdma_init(item->btl_module); + if (OMPI_SUCCESS != ret) { + return ret; } + ompi_osc_rdma_selected_btl_insert(module, item->btl_module, module->btls_in_use++); } - opal_argv_free (btls_to_use); + /* sort based on latency, lowest first */ + qsort(module->selected_btls, module->btls_in_use, + sizeof(struct mca_btl_base_module_t*), btl_latency_sort_fn); - return btls_found > 0 ? OMPI_SUCCESS : OMPI_ERR_UNREACH; + /* osc/rdma always use active message RDMA/atomics on alternate btls, whic does not require explicit memory registration */ + module->use_memory_registration = false; + + return module->btls_in_use > 0 ? OMPI_SUCCESS : OMPI_ERR_UNREACH; } + /* Check for BTL requirements: * 1) RDMA (put/get) and ATOMIC operations. We only require cswap * and fetch and add and will emulate other opterations with those From e93f041aa291d19c2c482023ae88ffc149c01395 Mon Sep 17 00:00:00 2001 From: Brian Barrett Date: Mon, 10 Jan 2022 22:51:03 +0000 Subject: [PATCH 04/13] osc/rdma: Simplify query logic With the change to load all btls in the alternate case and the removal of the logic to change priority based on alternate or accelerated btls, there's no point in running the full btl query in osc_rdma_component_query(). Instead, just verify that there is some number of BTLs available, which is the extent of testing in the alternate case. Signed-off-by: Brian Barrett --- ompi/mca/osc/rdma/osc_rdma_component.c | 42 +++++++++----------------- 1 file changed, 15 insertions(+), 27 deletions(-) diff --git a/ompi/mca/osc/rdma/osc_rdma_component.c b/ompi/mca/osc/rdma/osc_rdma_component.c index 55306f8af86..eb2f0303650 100644 --- a/ompi/mca/osc/rdma/osc_rdma_component.c +++ b/ompi/mca/osc/rdma/osc_rdma_component.c @@ -387,15 +387,14 @@ static int ompi_osc_rdma_component_query (struct ompi_win_t *win, void **base, s } #endif /* OPAL_CUDA_SUPPORT */ - if (OMPI_SUCCESS == ompi_osc_rdma_query_accelerated_btls (comm, NULL)) { - return mca_osc_rdma_component.priority; - } - - if (OMPI_SUCCESS == ompi_osc_rdma_query_alternate_btls (comm, NULL)) { - return mca_osc_rdma_component.priority; + /* verify if we have any btls available. Since we do not verify + * connectivity across all btls in the alternate case, this is as + * good a test as we are going to have for success. */ + if (opal_list_is_empty(&mca_btl_base_modules_initialized)) { + return -1; } - return -1; + return OMPI_SUCCESS;; } static int ompi_osc_rdma_initialize_region (ompi_osc_rdma_module_t *module, void **base, size_t size) { @@ -914,13 +913,7 @@ static int ompi_osc_rdma_query_alternate_btls (ompi_communicator_t *comm, ompi_o mca_btl_base_selected_module_t *item; int ret; - /* shortcut the trivial query case */ - if (NULL == module) { - if (opal_list_is_empty(&mca_btl_base_modules_initialized)) { - return OMPI_ERR_UNREACH; - } - return OMPI_SUCCESS; - } + assert(NULL != module); module->btls_in_use = 0; @@ -988,9 +981,6 @@ static bool ompi_osc_rdma_check_accelerated_btl(struct mca_btl_base_module_t *bt * Testing (1) is expensive, so as an optimization, the * ompi_osc_rdma_full_connectivity_btls list contains the list of BTL * components we know can achieve (1) in almost all usage scenarios. - * - * If module is NULL, the code acts as a query mechanism to find any - * potential BTLs, and is used to implement osc_rdma_query(). */ static int ompi_osc_rdma_query_accelerated_btls (ompi_communicator_t *comm, ompi_osc_rdma_module_t *module) { @@ -999,11 +989,11 @@ static int ompi_osc_rdma_query_accelerated_btls (ompi_communicator_t *comm, ompi mca_bml_base_endpoint_t *base_endpoint; char **btls_to_use; - if (module) { - ompi_osc_rdma_selected_btl_insert(module, NULL, 0); - module->btls_in_use = 0; - module->use_memory_registration = false; - } + assert(NULL != module); + + ompi_osc_rdma_selected_btl_insert(module, NULL, 0); + module->btls_in_use = 0; + module->use_memory_registration = false; /* Check for BTLs in the list of BTLs we know can reach all peers in general usage. */ @@ -1116,11 +1106,9 @@ static int ompi_osc_rdma_query_accelerated_btls (ompi_communicator_t *comm, ompi } btl_selection_complete: - if (module) { - ompi_osc_rdma_selected_btl_insert(module, selected_btl, 0); - module->btls_in_use = 1; - module->use_memory_registration = selected_btl->btl_register_mem != NULL; - } + ompi_osc_rdma_selected_btl_insert(module, selected_btl, 0); + module->btls_in_use = 1; + module->use_memory_registration = selected_btl->btl_register_mem != NULL; opal_output_verbose(MCA_BASE_VERBOSE_INFO, ompi_osc_base_framework.framework_output, "accelerated_query: selected btl: %s", From 0394543786a55e90354829d80a5ba6eafa2c61b7 Mon Sep 17 00:00:00 2001 From: Brian Barrett Date: Tue, 4 Jan 2022 17:07:26 +0000 Subject: [PATCH 05/13] osc/rdma: Refactor btl wrapper code Refactor wrapper calls around the btl atomic code into their own header, removing them from the lock interface header. Clean up some resulting header dependency changes. Signed-off-by: Brian Barrett --- ompi/mca/osc/rdma/Makefile.am | 4 + ompi/mca/osc/rdma/osc_rdma_accumulate.c | 4 + ompi/mca/osc/rdma/osc_rdma_active_target.c | 27 --- ompi/mca/osc/rdma/osc_rdma_btl_comm.c | 61 ++++++ ompi/mca/osc/rdma/osc_rdma_btl_comm.h | 218 +++++++++++++++++++++ ompi/mca/osc/rdma/osc_rdma_comm.c | 3 + ompi/mca/osc/rdma/osc_rdma_comm.h | 3 +- ompi/mca/osc/rdma/osc_rdma_lock.h | 197 +------------------ 8 files changed, 295 insertions(+), 222 deletions(-) create mode 100644 ompi/mca/osc/rdma/osc_rdma_btl_comm.c create mode 100644 ompi/mca/osc/rdma/osc_rdma_btl_comm.h diff --git a/ompi/mca/osc/rdma/Makefile.am b/ompi/mca/osc/rdma/Makefile.am index e52d0087743..4757ce6aa93 100644 --- a/ompi/mca/osc/rdma/Makefile.am +++ b/ompi/mca/osc/rdma/Makefile.am @@ -11,6 +11,8 @@ # Copyright (c) 2014-2015 Los Alamos National Security, LLC. All rights # reserved. # Copyright (c) 2017 IBM Corporation. All rights reserved. +# Copyright (c) 2022 Amazon.com, Inc. or its affiliates. +# All Rights reserved. # $COPYRIGHT$ # # Additional copyrights may follow @@ -21,6 +23,8 @@ rdma_sources = \ osc_rdma.h \ osc_rdma_module.c \ + osc_rdma_btl_comm.h \ + osc_rdma_btl_comm.c \ osc_rdma_comm.h \ osc_rdma_comm.c \ osc_rdma_accumulate.c \ diff --git a/ompi/mca/osc/rdma/osc_rdma_accumulate.c b/ompi/mca/osc/rdma/osc_rdma_accumulate.c index ab0b21e539a..faa71692250 100644 --- a/ompi/mca/osc/rdma/osc_rdma_accumulate.c +++ b/ompi/mca/osc/rdma/osc_rdma_accumulate.c @@ -10,6 +10,8 @@ * Copyright (c) 2019-2021 Google, LLC. All rights reserved. * Copyright (c) 2021 IBM Corporation. All rights reserved. * Copyright (c) 2022 Cisco Systems, Inc. All rights reserved + * Copyright (c) 2022 Amazon.com, Inc. or its affiliates. + * All Rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -20,6 +22,8 @@ #include "osc_rdma_accumulate.h" #include "osc_rdma_request.h" #include "osc_rdma_comm.h" +#include "osc_rdma_lock.h" +#include "osc_rdma_btl_comm.h" #include "ompi/mca/osc/base/base.h" #include "ompi/mca/osc/base/osc_base_obj_convert.h" diff --git a/ompi/mca/osc/rdma/osc_rdma_active_target.c b/ompi/mca/osc/rdma/osc_rdma_active_target.c index fdd3dd5c832..3706a098f77 100644 --- a/ompi/mca/osc/rdma/osc_rdma_active_target.c +++ b/ompi/mca/osc/rdma/osc_rdma_active_target.c @@ -77,33 +77,6 @@ OBJ_CLASS_INSTANCE(ompi_osc_rdma_pending_op_t, opal_list_item_t, ompi_osc_rdma_pending_op_construct, ompi_osc_rdma_pending_op_destruct); -/** - * Dummy completion function for atomic operations - */ -void ompi_osc_rdma_atomic_complete (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, - void *local_address, mca_btl_base_registration_handle_t *local_handle, - void *context, void *data, int status) -{ - ompi_osc_rdma_pending_op_t *pending_op = (ompi_osc_rdma_pending_op_t *) context; - - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "pending atomic %p complete with status %d", (void*)pending_op, status); - - if (pending_op->op_result) { - memmove (pending_op->op_result, pending_op->op_buffer, pending_op->op_size); - } - - if (NULL != pending_op->cbfunc) { - pending_op->cbfunc (pending_op->cbdata, pending_op->cbcontext, status); - } - - if (NULL != pending_op->op_frag) { - ompi_osc_rdma_frag_complete (pending_op->op_frag); - pending_op->op_frag = NULL; - } - - pending_op->op_complete = true; - OBJ_RELEASE(pending_op); -} /** * compare_ranks: diff --git a/ompi/mca/osc/rdma/osc_rdma_btl_comm.c b/ompi/mca/osc/rdma/osc_rdma_btl_comm.c new file mode 100644 index 00000000000..cbc8a761593 --- /dev/null +++ b/ompi/mca/osc/rdma/osc_rdma_btl_comm.c @@ -0,0 +1,61 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2004-2005 The Trustees of Indiana University. + * All rights reserved. + * Copyright (c) 2004-2005 The Trustees of the University of Tennessee. + * All rights reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2007-2018 Los Alamos National Security, LLC. All rights + * reserved. + * Copyright (c) 2010 IBM Corporation. All rights reserved. + * Copyright (c) 2012-2013 Sandia National Laboratories. All rights reserved. + * Copyright (c) 2015 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2017 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2017-2018 Intel, Inc. All rights reserved. + * Copyright (c) 2021 Google, LLC. All rights reserved. + * Copyright (c) 2022 Amazon.com, Inc. or its affiliates. + * All Rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" + +#include "osc_rdma.h" +#include "osc_rdma_frag.h" +#include "osc_rdma_btl_comm.h" + +#include "opal/mca/btl/base/base.h" + +void ompi_osc_rdma_atomic_complete (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, + void *local_address, mca_btl_base_registration_handle_t *local_handle, + void *context, void *data, int status) +{ + ompi_osc_rdma_pending_op_t *pending_op = (ompi_osc_rdma_pending_op_t *) context; + + OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "pending atomic %p complete with status %d", (void*)pending_op, status); + + if (pending_op->op_result) { + memmove (pending_op->op_result, pending_op->op_buffer, pending_op->op_size); + } + + if (NULL != pending_op->cbfunc) { + pending_op->cbfunc (pending_op->cbdata, pending_op->cbcontext, status); + } + + if (NULL != pending_op->op_frag) { + ompi_osc_rdma_frag_complete (pending_op->op_frag); + pending_op->op_frag = NULL; + } + + pending_op->op_complete = true; + OBJ_RELEASE(pending_op); +} diff --git a/ompi/mca/osc/rdma/osc_rdma_btl_comm.h b/ompi/mca/osc/rdma/osc_rdma_btl_comm.h new file mode 100644 index 00000000000..3f175a20fb6 --- /dev/null +++ b/ompi/mca/osc/rdma/osc_rdma_btl_comm.h @@ -0,0 +1,218 @@ +/* + * Copyright (c) 2014-2018 Los Alamos National Security, LLC. All rights + * reserved. + * Copyright (c) 2019 Triad National Security, LLC. All rights + * reserved. + * Copyright (c) 2021 Google, LLC. All rights reserved. + * Copyright (c) 2022 Amazon.com, Inc. or its affiliates. + * All Rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef OSC_RDMA_BTL_COMM_H +#define OSC_RDMA_BTL_COMM_H + +#include "osc_rdma_frag.h" + +#include "opal/mca/btl/btl.h" + + +void ompi_osc_rdma_atomic_complete(mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, + void *local_address, mca_btl_base_registration_handle_t *local_handle, + void *context, void *data, int status); + + +static inline int +ompi_osc_rdma_btl_fop(ompi_osc_rdma_module_t *module, uint8_t btl_index, + struct mca_btl_base_endpoint_t *endpoint, uint64_t address, + mca_btl_base_registration_handle_t *address_handle, int op, + int64_t operand, int flags, int64_t *result, const bool wait_for_completion, + ompi_osc_rdma_pending_op_cb_fn_t cbfunc, void *cbdata, void *cbcontext) +{ + ompi_osc_rdma_pending_op_t *pending_op; + mca_btl_base_module_t *selected_btl = ompi_osc_rdma_selected_btl (module, btl_index); + int ret = OPAL_ERROR; + + pending_op = OBJ_NEW(ompi_osc_rdma_pending_op_t); + assert (NULL != pending_op); + + if (!wait_for_completion) { + /* NTH: need to keep track of pending ops to avoid a potential teardown problem */ + pending_op->module = module; + (void) opal_atomic_fetch_add_32 (&module->pending_ops, 1); + } + + pending_op->op_result = (void *) result; + pending_op->op_size = (MCA_BTL_ATOMIC_FLAG_32BIT & flags) ? 4 : 8; + OBJ_RETAIN(pending_op); + if (cbfunc) { + pending_op->cbfunc = cbfunc; + pending_op->cbdata = cbdata; + pending_op->cbcontext = cbcontext; + } + + /* spin until the btl has accepted the operation */ + do { + if (NULL == pending_op->op_frag) { + ret = ompi_osc_rdma_frag_alloc (module, 8, &pending_op->op_frag, (char **) &pending_op->op_buffer); + } + + if (NULL != pending_op->op_frag) { + ret = selected_btl->btl_atomic_fop (selected_btl, endpoint, pending_op->op_buffer, + (intptr_t) address, pending_op->op_frag->handle, address_handle, + op, operand, flags, MCA_BTL_NO_ORDER, ompi_osc_rdma_atomic_complete, + (void *) pending_op, NULL); + } + + if (OPAL_LIKELY(!ompi_osc_rdma_oor(ret))) { + break; + } + ompi_osc_rdma_progress (module); + } while (1); + + if (OPAL_SUCCESS != ret) { + if (OPAL_LIKELY(1 == ret)) { + *result = ((int64_t *) pending_op->op_buffer)[0]; + ret = OMPI_SUCCESS; + ompi_osc_rdma_atomic_complete (selected_btl, endpoint, pending_op->op_buffer, + pending_op->op_frag->handle, (void *) pending_op, NULL, OPAL_SUCCESS); + } else { + /* need to release here because ompi_osc_rdma_atomic_complete was not called */ + OBJ_RELEASE(pending_op); + } + } else if (wait_for_completion) { + while (!pending_op->op_complete) { + ompi_osc_rdma_progress (module); + } + } + + OBJ_RELEASE(pending_op); + + return ret; +} + + +static inline int +ompi_osc_rdma_btl_op(ompi_osc_rdma_module_t *module, uint8_t btl_index, + struct mca_btl_base_endpoint_t *endpoint, uint64_t address, + mca_btl_base_registration_handle_t *address_handle, + int op, int64_t operand, int flags, const bool wait_for_completion, + ompi_osc_rdma_pending_op_cb_fn_t cbfunc, void *cbdata, void *cbcontext) +{ + ompi_osc_rdma_pending_op_t *pending_op; + mca_btl_base_module_t *selected_btl = ompi_osc_rdma_selected_btl (module, btl_index); + int ret; + + if (!(selected_btl->btl_flags & MCA_BTL_FLAGS_ATOMIC_OPS)) { + return ompi_osc_rdma_btl_fop (module, btl_index, endpoint, address, address_handle, op, operand, flags, + NULL, wait_for_completion, cbfunc, cbdata, cbcontext); + } + + pending_op = OBJ_NEW(ompi_osc_rdma_pending_op_t); + assert (NULL != pending_op); + OBJ_RETAIN(pending_op); + if (cbfunc) { + pending_op->cbfunc = cbfunc; + pending_op->cbdata = cbdata; + pending_op->cbcontext = cbcontext; + } + + if (!wait_for_completion) { + /* NTH: need to keep track of pending ops to avoid a potential teardown problem */ + pending_op->module = module; + (void) opal_atomic_fetch_add_32 (&module->pending_ops, 1); + } + + /* spin until the btl has accepted the operation */ + do { + ret = selected_btl->btl_atomic_op (selected_btl, endpoint, (intptr_t) address, address_handle, + op, operand, flags, MCA_BTL_NO_ORDER, ompi_osc_rdma_atomic_complete, + (void *) pending_op, NULL); + + if (OPAL_LIKELY(!ompi_osc_rdma_oor(ret))) { + break; + } + ompi_osc_rdma_progress (module); + } while (1); + + if (OPAL_SUCCESS != ret) { + /* need to release here because ompi_osc_rdma_atomic_complete was not called */ + OBJ_RELEASE(pending_op); + if (OPAL_LIKELY(1 == ret)) { + if (cbfunc) { + cbfunc (cbdata, cbcontext, OMPI_SUCCESS); + } + ret = OMPI_SUCCESS; + } + } else if (wait_for_completion) { + while (!pending_op->op_complete) { + ompi_osc_rdma_progress (module); + } + } + + OBJ_RELEASE(pending_op); + + return ret; +} + + +static inline int +ompi_osc_rdma_btl_cswap(ompi_osc_rdma_module_t *module, uint8_t btl_index, + struct mca_btl_base_endpoint_t *endpoint, uint64_t address, + mca_btl_base_registration_handle_t *address_handle, + int64_t compare, int64_t value, int flags, int64_t *result) +{ + ompi_osc_rdma_pending_op_t *pending_op; + mca_btl_base_module_t *selected_btl = ompi_osc_rdma_selected_btl (module, btl_index); + int ret; + + pending_op = OBJ_NEW(ompi_osc_rdma_pending_op_t); + assert (NULL != pending_op); + + OBJ_RETAIN(pending_op); + + pending_op->op_result = (void *) result; + pending_op->op_size = (MCA_BTL_ATOMIC_FLAG_32BIT & flags) ? 4 : 8; + + /* spin until the btl has accepted the operation */ + do { + if (NULL == pending_op->op_frag) { + ret = ompi_osc_rdma_frag_alloc (module, 8, &pending_op->op_frag, (char **) &pending_op->op_buffer); + } + if (NULL != pending_op->op_frag) { + ret = selected_btl->btl_atomic_cswap (selected_btl, endpoint, pending_op->op_buffer, + address, pending_op->op_frag->handle, address_handle, compare, + value, flags, 0, ompi_osc_rdma_atomic_complete, (void *) pending_op, + NULL); + } + + if (OPAL_LIKELY(!ompi_osc_rdma_oor(ret))) { + break; + } + ompi_osc_rdma_progress (module); + } while (1); + + if (OPAL_SUCCESS != ret) { + if (OPAL_LIKELY(1 == ret)) { + *result = ((int64_t *) pending_op->op_buffer)[0]; + ret = OMPI_SUCCESS; + } + + /* need to release here because ompi_osc_rdma_atomic_complete was not called */ + OBJ_RELEASE(pending_op); + } else { + while (!pending_op->op_complete) { + ompi_osc_rdma_progress (module); + } + } + + OBJ_RELEASE(pending_op); + + return ret; +} + +#endif /* OSC_RDMA_BTL_COMM_H */ diff --git a/ompi/mca/osc/rdma/osc_rdma_comm.c b/ompi/mca/osc/rdma/osc_rdma_comm.c index 449bbea0641..0a7bf3285b2 100644 --- a/ompi/mca/osc/rdma/osc_rdma_comm.c +++ b/ompi/mca/osc/rdma/osc_rdma_comm.c @@ -6,6 +6,8 @@ * Copyright (c) 2017 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2017 IBM Corporation. All rights reserved. + * Copyright (c) 2022 Amazon.com, Inc. or its affiliates. + * All Rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -14,6 +16,7 @@ */ #include "osc_rdma_comm.h" +#include "osc_rdma_frag.h" #include "osc_rdma_sync.h" #include "osc_rdma_request.h" #include "osc_rdma_dynamic.h" diff --git a/ompi/mca/osc/rdma/osc_rdma_comm.h b/ompi/mca/osc/rdma/osc_rdma_comm.h index efb305a571e..7b722c4fe69 100644 --- a/ompi/mca/osc/rdma/osc_rdma_comm.h +++ b/ompi/mca/osc/rdma/osc_rdma_comm.h @@ -4,6 +4,8 @@ * reserved. * Copyright (c) 2017 Research Organization for Information Science * and Technology (RIST). All rights reserved. + * Copyright (c) 2022 Amazon.com, Inc. or its affiliates. + * All Rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -17,7 +19,6 @@ #include "osc_rdma_dynamic.h" #include "osc_rdma_request.h" #include "osc_rdma_sync.h" -#include "osc_rdma_lock.h" #define OMPI_OSC_RDMA_DECODE_MAX 64 diff --git a/ompi/mca/osc/rdma/osc_rdma_lock.h b/ompi/mca/osc/rdma/osc_rdma_lock.h index 36a30a1cc0b..19a3249bdbe 100644 --- a/ompi/mca/osc/rdma/osc_rdma_lock.h +++ b/ompi/mca/osc/rdma/osc_rdma_lock.h @@ -5,6 +5,8 @@ * Copyright (c) 2019 Triad National Security, LLC. All rights * reserved. * Copyright (c) 2021 Google, LLC. All rights reserved. + * Copyright (c) 2022 Amazon.com, Inc. or its affiliates. + * All Rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -17,6 +19,7 @@ #include "osc_rdma_types.h" #include "osc_rdma_frag.h" +#include "osc_rdma_btl_comm.h" static inline int ompi_osc_rdma_trylock_local (ompi_osc_rdma_atomic_lock_t *lock) { @@ -29,82 +32,6 @@ static inline void ompi_osc_rdma_unlock_local (ompi_osc_rdma_atomic_lock_t *lock (void) ompi_osc_rdma_lock_add (lock, -OMPI_OSC_RDMA_LOCK_EXCLUSIVE); } -/** - * Dummy completion function for atomic operations - */ -void ompi_osc_rdma_atomic_complete (mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, - void *local_address, mca_btl_base_registration_handle_t *local_handle, - void *context, void *data, int status); - -__opal_attribute_always_inline__ -static inline int ompi_osc_rdma_btl_fop (ompi_osc_rdma_module_t *module, uint8_t btl_index, - struct mca_btl_base_endpoint_t *endpoint, uint64_t address, - mca_btl_base_registration_handle_t *address_handle, int op, - int64_t operand, int flags, int64_t *result, const bool wait_for_completion, - ompi_osc_rdma_pending_op_cb_fn_t cbfunc, void *cbdata, void *cbcontext) -{ - ompi_osc_rdma_pending_op_t *pending_op; - mca_btl_base_module_t *selected_btl = ompi_osc_rdma_selected_btl (module, btl_index); - int ret = OPAL_ERROR; - - pending_op = OBJ_NEW(ompi_osc_rdma_pending_op_t); - assert (NULL != pending_op); - - if (!wait_for_completion) { - /* NTH: need to keep track of pending ops to avoid a potential teardown problem */ - pending_op->module = module; - (void) opal_atomic_fetch_add_32 (&module->pending_ops, 1); - } - - pending_op->op_result = (void *) result; - pending_op->op_size = (MCA_BTL_ATOMIC_FLAG_32BIT & flags) ? 4 : 8; - OBJ_RETAIN(pending_op); - if (cbfunc) { - pending_op->cbfunc = cbfunc; - pending_op->cbdata = cbdata; - pending_op->cbcontext = cbcontext; - } - - /* spin until the btl has accepted the operation */ - do { - if (NULL == pending_op->op_frag) { - ret = ompi_osc_rdma_frag_alloc (module, 8, &pending_op->op_frag, (char **) &pending_op->op_buffer); - } - - if (NULL != pending_op->op_frag) { - ret = selected_btl->btl_atomic_fop (selected_btl, endpoint, pending_op->op_buffer, - (intptr_t) address, pending_op->op_frag->handle, address_handle, - op, operand, flags, MCA_BTL_NO_ORDER, ompi_osc_rdma_atomic_complete, - (void *) pending_op, NULL); - } - - if (OPAL_LIKELY(!ompi_osc_rdma_oor(ret))) { - break; - } - ompi_osc_rdma_progress (module); - } while (1); - - if (OPAL_SUCCESS != ret) { - if (OPAL_LIKELY(1 == ret)) { - *result = ((int64_t *) pending_op->op_buffer)[0]; - ret = OMPI_SUCCESS; - ompi_osc_rdma_atomic_complete (selected_btl, endpoint, pending_op->op_buffer, - pending_op->op_frag->handle, (void *) pending_op, NULL, OPAL_SUCCESS); - } else { - /* need to release here because ompi_osc_rdma_atomic_complete was not called */ - OBJ_RELEASE(pending_op); - } - } else if (wait_for_completion) { - while (!pending_op->op_complete) { - ompi_osc_rdma_progress (module); - } - } - - OBJ_RELEASE(pending_op); - - return ret; -} - __opal_attribute_always_inline__ static inline int ompi_osc_rdma_lock_btl_fop (ompi_osc_rdma_module_t *module, ompi_osc_rdma_peer_t *peer, uint64_t address, int op, ompi_osc_rdma_lock_t operand, ompi_osc_rdma_lock_t *result, @@ -114,69 +41,6 @@ static inline int ompi_osc_rdma_lock_btl_fop (ompi_osc_rdma_module_t *module, om operand, 0, result, wait_for_completion, NULL, NULL, NULL); } -__opal_attribute_always_inline__ -static inline int ompi_osc_rdma_btl_op (ompi_osc_rdma_module_t *module, uint8_t btl_index, - struct mca_btl_base_endpoint_t *endpoint, uint64_t address, - mca_btl_base_registration_handle_t *address_handle, - int op, int64_t operand, int flags, const bool wait_for_completion, - ompi_osc_rdma_pending_op_cb_fn_t cbfunc, void *cbdata, void *cbcontext) -{ - ompi_osc_rdma_pending_op_t *pending_op; - mca_btl_base_module_t *selected_btl = ompi_osc_rdma_selected_btl (module, btl_index); - int ret; - - if (!(selected_btl->btl_flags & MCA_BTL_FLAGS_ATOMIC_OPS)) { - return ompi_osc_rdma_btl_fop (module, btl_index, endpoint, address, address_handle, op, operand, flags, - NULL, wait_for_completion, cbfunc, cbdata, cbcontext); - } - - pending_op = OBJ_NEW(ompi_osc_rdma_pending_op_t); - assert (NULL != pending_op); - OBJ_RETAIN(pending_op); - if (cbfunc) { - pending_op->cbfunc = cbfunc; - pending_op->cbdata = cbdata; - pending_op->cbcontext = cbcontext; - } - - if (!wait_for_completion) { - /* NTH: need to keep track of pending ops to avoid a potential teardown problem */ - pending_op->module = module; - (void) opal_atomic_fetch_add_32 (&module->pending_ops, 1); - } - - /* spin until the btl has accepted the operation */ - do { - ret = selected_btl->btl_atomic_op (selected_btl, endpoint, (intptr_t) address, address_handle, - op, operand, flags, MCA_BTL_NO_ORDER, ompi_osc_rdma_atomic_complete, - (void *) pending_op, NULL); - - if (OPAL_LIKELY(!ompi_osc_rdma_oor(ret))) { - break; - } - ompi_osc_rdma_progress (module); - } while (1); - - if (OPAL_SUCCESS != ret) { - /* need to release here because ompi_osc_rdma_atomic_complete was not called */ - OBJ_RELEASE(pending_op); - if (OPAL_LIKELY(1 == ret)) { - if (cbfunc) { - cbfunc (cbdata, cbcontext, OMPI_SUCCESS); - } - ret = OMPI_SUCCESS; - } - } else if (wait_for_completion) { - while (!pending_op->op_complete) { - ompi_osc_rdma_progress (module); - } - } - - OBJ_RELEASE(pending_op); - - return ret; -} - __opal_attribute_always_inline__ static inline int ompi_osc_rdma_lock_btl_op (ompi_osc_rdma_module_t *module, ompi_osc_rdma_peer_t *peer, uint64_t address, int op, ompi_osc_rdma_lock_t operand, const bool wait_for_completion) @@ -185,61 +49,6 @@ static inline int ompi_osc_rdma_lock_btl_op (ompi_osc_rdma_module_t *module, omp operand, 0, wait_for_completion, NULL, NULL, NULL); } -__opal_attribute_always_inline__ -static inline int ompi_osc_rdma_btl_cswap (ompi_osc_rdma_module_t *module, uint8_t btl_index, - struct mca_btl_base_endpoint_t *endpoint, uint64_t address, - mca_btl_base_registration_handle_t *address_handle, - int64_t compare, int64_t value, int flags, int64_t *result) -{ - ompi_osc_rdma_pending_op_t *pending_op; - mca_btl_base_module_t *selected_btl = ompi_osc_rdma_selected_btl (module, btl_index); - int ret; - - pending_op = OBJ_NEW(ompi_osc_rdma_pending_op_t); - assert (NULL != pending_op); - - OBJ_RETAIN(pending_op); - - pending_op->op_result = (void *) result; - pending_op->op_size = (MCA_BTL_ATOMIC_FLAG_32BIT & flags) ? 4 : 8; - - /* spin until the btl has accepted the operation */ - do { - if (NULL == pending_op->op_frag) { - ret = ompi_osc_rdma_frag_alloc (module, 8, &pending_op->op_frag, (char **) &pending_op->op_buffer); - } - if (NULL != pending_op->op_frag) { - ret = selected_btl->btl_atomic_cswap (selected_btl, endpoint, pending_op->op_buffer, - address, pending_op->op_frag->handle, address_handle, compare, - value, flags, 0, ompi_osc_rdma_atomic_complete, (void *) pending_op, - NULL); - } - - if (OPAL_LIKELY(!ompi_osc_rdma_oor(ret))) { - break; - } - ompi_osc_rdma_progress (module); - } while (1); - - if (OPAL_SUCCESS != ret) { - if (OPAL_LIKELY(1 == ret)) { - *result = ((int64_t *) pending_op->op_buffer)[0]; - ret = OMPI_SUCCESS; - } - - /* need to release here because ompi_osc_rdma_atomic_complete was not called */ - OBJ_RELEASE(pending_op); - } else { - while (!pending_op->op_complete) { - ompi_osc_rdma_progress (module); - } - } - - OBJ_RELEASE(pending_op); - - return ret; -} - __opal_attribute_always_inline__ static inline int ompi_osc_rdma_lock_btl_cswap (ompi_osc_rdma_module_t *module, ompi_osc_rdma_peer_t *peer, uint64_t address, ompi_osc_rdma_lock_t compare, ompi_osc_rdma_lock_t value, ompi_osc_rdma_lock_t *result) From 6a15883e372242205a1434a0a35b2c931dc3933d Mon Sep 17 00:00:00 2001 From: Brian Barrett Date: Tue, 4 Jan 2022 18:25:44 +0000 Subject: [PATCH 06/13] osc/rdma: Wrap calls to BTL RDMA functiions Add wrappers to call all BTL RDMA functions. This commit just adds another layer of indirection to the communication calls. However, a follow-on patch will add logic to either call the BTL RDMA functions directly (for accelerated mode) or call the BTL AM RDMA compatibility layer for alternate mode. Signed-off-by: Brian Barrett --- ompi/mca/osc/rdma/osc_rdma_accumulate.c | 7 +- ompi/mca/osc/rdma/osc_rdma_btl_comm.h | 105 ++++++++++++++++++++++-- ompi/mca/osc/rdma/osc_rdma_comm.c | 22 ++--- 3 files changed, 111 insertions(+), 23 deletions(-) diff --git a/ompi/mca/osc/rdma/osc_rdma_accumulate.c b/ompi/mca/osc/rdma/osc_rdma_accumulate.c index faa71692250..404546f2f08 100644 --- a/ompi/mca/osc/rdma/osc_rdma_accumulate.c +++ b/ompi/mca/osc/rdma/osc_rdma_accumulate.c @@ -762,9 +762,10 @@ static inline int cas_rdma (ompi_osc_rdma_sync_t *sync, const void *source_addr, OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "RDMA compare-and-swap initiating blocking btl put..."); do { - ret = btl->btl_put (btl, peer->data_endpoint, ptr, target_address, - local_handle, target_handle, len, 0, MCA_BTL_NO_ORDER, - ompi_osc_rdma_cas_put_complete, (void *) &complete, NULL); + ret = ompi_osc_rdma_btl_put(module, peer->data_btl_index, peer->data_endpoint, + ptr, target_address, local_handle, target_handle, + len, 0, MCA_BTL_NO_ORDER, + ompi_osc_rdma_cas_put_complete, (void *) &complete, NULL); if (OPAL_SUCCESS == ret || (OPAL_ERR_OUT_OF_RESOURCE != ret && OPAL_ERR_TEMP_OUT_OF_RESOURCE != ret)) { break; } diff --git a/ompi/mca/osc/rdma/osc_rdma_btl_comm.h b/ompi/mca/osc/rdma/osc_rdma_btl_comm.h index 3f175a20fb6..a666d77710b 100644 --- a/ompi/mca/osc/rdma/osc_rdma_btl_comm.h +++ b/ompi/mca/osc/rdma/osc_rdma_btl_comm.h @@ -26,6 +26,94 @@ void ompi_osc_rdma_atomic_complete(mca_btl_base_module_t *btl, struct mca_btl_ba void *context, void *data, int status); +static inline int +ompi_osc_rdma_btl_put(ompi_osc_rdma_module_t *module, uint8_t btl_index, + struct mca_btl_base_endpoint_t *endpoint, + void *local_address, uint64_t remote_address, + struct mca_btl_base_registration_handle_t *local_handle, + struct mca_btl_base_registration_handle_t *remote_handle, + size_t size, int flags, int order, + mca_btl_base_rdma_completion_fn_t cbfunc, + void *cbcontext, void *cbdata) +{ + mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl(module, btl_index); + + return btl->btl_put(btl, endpoint, local_address, remote_address, + local_handle, remote_handle, size, flags, order, + cbfunc, cbcontext, cbdata); +} + + +static inline int +ompi_osc_rdma_btl_get(ompi_osc_rdma_module_t *module, uint8_t btl_index, + struct mca_btl_base_endpoint_t *endpoint, + void *local_address, uint64_t remote_address, + struct mca_btl_base_registration_handle_t *local_handle, + struct mca_btl_base_registration_handle_t *remote_handle, + size_t size, int flags, int order, + mca_btl_base_rdma_completion_fn_t cbfunc, + void *cbcontext, void *cbdata) +{ + mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl(module, btl_index); + + return btl->btl_get(btl, endpoint, local_address, remote_address, + local_handle, remote_handle, size, flags, order, + cbfunc, cbcontext, cbdata); +} + + +static inline int +ompi_osc_rdma_btl_atomic_op(ompi_osc_rdma_module_t *module, uint8_t btl_index, + struct mca_btl_base_endpoint_t *endpoint, + uint64_t remote_address, struct mca_btl_base_registration_handle_t *remote_handle, + mca_btl_base_atomic_op_t op, uint64_t operand, int flags, int order, + mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata) +{ + mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl(module, btl_index); + + return btl->btl_atomic_op(btl, endpoint, remote_address, remote_handle, + op, operand, flags, order, + cbfunc, cbcontext, cbdata); +} + + +static inline int +ompi_osc_rdma_btl_atomic_fop(ompi_osc_rdma_module_t *module, uint8_t btl_index, + struct mca_btl_base_endpoint_t *endpoint, + void *local_address, uint64_t remote_address, + struct mca_btl_base_registration_handle_t *local_handle, + struct mca_btl_base_registration_handle_t *remote_handle, + mca_btl_base_atomic_op_t op, uint64_t operand, int flags, int order, + mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata) + +{ + mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl(module, btl_index); + + return btl->btl_atomic_fop(btl, endpoint, local_address, remote_address, + local_handle, remote_handle, + op, operand, flags, order, + cbfunc, cbcontext, cbdata); +} + + +static inline int +ompi_osc_rdma_btl_atomic_cswap(ompi_osc_rdma_module_t *module, uint8_t btl_index, + struct mca_btl_base_endpoint_t *endpoint, + void *local_address, uint64_t remote_address, + struct mca_btl_base_registration_handle_t *local_handle, + struct mca_btl_base_registration_handle_t *remote_handle, + uint64_t compare, uint64_t value, int flags, int order, + mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata) +{ + mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl(module, btl_index); + + return btl->btl_atomic_cswap(btl, endpoint, local_address, remote_address, + local_handle, remote_handle, + compare, value, flags, order, + cbfunc, cbcontext, cbdata); +} + + static inline int ompi_osc_rdma_btl_fop(ompi_osc_rdma_module_t *module, uint8_t btl_index, struct mca_btl_base_endpoint_t *endpoint, uint64_t address, @@ -62,10 +150,10 @@ ompi_osc_rdma_btl_fop(ompi_osc_rdma_module_t *module, uint8_t btl_index, } if (NULL != pending_op->op_frag) { - ret = selected_btl->btl_atomic_fop (selected_btl, endpoint, pending_op->op_buffer, - (intptr_t) address, pending_op->op_frag->handle, address_handle, - op, operand, flags, MCA_BTL_NO_ORDER, ompi_osc_rdma_atomic_complete, - (void *) pending_op, NULL); + ret = ompi_osc_rdma_btl_atomic_fop(module, btl_index, endpoint, pending_op->op_buffer, + (intptr_t) address, pending_op->op_frag->handle, address_handle, + op, operand, flags, MCA_BTL_NO_ORDER, ompi_osc_rdma_atomic_complete, + (void *) pending_op, NULL); } if (OPAL_LIKELY(!ompi_osc_rdma_oor(ret))) { @@ -129,9 +217,9 @@ ompi_osc_rdma_btl_op(ompi_osc_rdma_module_t *module, uint8_t btl_index, /* spin until the btl has accepted the operation */ do { - ret = selected_btl->btl_atomic_op (selected_btl, endpoint, (intptr_t) address, address_handle, - op, operand, flags, MCA_BTL_NO_ORDER, ompi_osc_rdma_atomic_complete, - (void *) pending_op, NULL); + ret = ompi_osc_rdma_btl_atomic_op(module, btl_index, endpoint, (intptr_t) address, address_handle, + op, operand, flags, MCA_BTL_NO_ORDER, ompi_osc_rdma_atomic_complete, + (void *) pending_op, NULL); if (OPAL_LIKELY(!ompi_osc_rdma_oor(ret))) { break; @@ -167,7 +255,6 @@ ompi_osc_rdma_btl_cswap(ompi_osc_rdma_module_t *module, uint8_t btl_index, int64_t compare, int64_t value, int flags, int64_t *result) { ompi_osc_rdma_pending_op_t *pending_op; - mca_btl_base_module_t *selected_btl = ompi_osc_rdma_selected_btl (module, btl_index); int ret; pending_op = OBJ_NEW(ompi_osc_rdma_pending_op_t); @@ -184,7 +271,7 @@ ompi_osc_rdma_btl_cswap(ompi_osc_rdma_module_t *module, uint8_t btl_index, ret = ompi_osc_rdma_frag_alloc (module, 8, &pending_op->op_frag, (char **) &pending_op->op_buffer); } if (NULL != pending_op->op_frag) { - ret = selected_btl->btl_atomic_cswap (selected_btl, endpoint, pending_op->op_buffer, + ret = ompi_osc_rdma_btl_atomic_cswap(module, btl_index, endpoint, pending_op->op_buffer, address, pending_op->op_frag->handle, address_handle, compare, value, flags, 0, ompi_osc_rdma_atomic_complete, (void *) pending_op, NULL); diff --git a/ompi/mca/osc/rdma/osc_rdma_comm.c b/ompi/mca/osc/rdma/osc_rdma_comm.c index 0a7bf3285b2..ea8665c465f 100644 --- a/ompi/mca/osc/rdma/osc_rdma_comm.c +++ b/ompi/mca/osc/rdma/osc_rdma_comm.c @@ -20,6 +20,7 @@ #include "osc_rdma_sync.h" #include "osc_rdma_request.h" #include "osc_rdma_dynamic.h" +#include "osc_rdma_btl_comm.h" #include "ompi/mca/osc/base/osc_base_obj_convert.h" #include "opal/align.h" @@ -99,9 +100,9 @@ int ompi_osc_get_data_blocking (ompi_osc_rdma_module_t *module, uint8_t btl_inde assert (!(source_address & ALIGNMENT_MASK(btl->btl_get_alignment))); do { - ret = btl->btl_get (btl, endpoint, ptr, aligned_addr, - local_handle, source_handle, aligned_len, 0, MCA_BTL_NO_ORDER, - ompi_osc_get_data_complete, (void *) &read_complete, NULL); + ret = ompi_osc_rdma_btl_get(module, btl_index, endpoint, ptr, aligned_addr, + local_handle, source_handle, aligned_len, 0, MCA_BTL_NO_ORDER, + ompi_osc_get_data_complete, (void *) &read_complete, NULL); if (!ompi_osc_rdma_oor (ret)) { break; } @@ -447,7 +448,6 @@ static int ompi_osc_rdma_put_real (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_pee mca_btl_base_registration_handle_t *local_handle, size_t size, mca_btl_base_rdma_completion_fn_t cb, void *context, void *cbdata) { ompi_osc_rdma_module_t *module = sync->module; - mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl (module, peer->data_btl_index); int ret; OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "initiating btl put of %lu bytes to remote address %" PRIx64 ", sync " @@ -457,9 +457,9 @@ static int ompi_osc_rdma_put_real (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_pee ompi_osc_rdma_sync_rdma_inc (sync); do { - ret = btl->btl_put (btl, peer->data_endpoint, ptr, target_address, - local_handle, target_handle, size, 0, MCA_BTL_NO_ORDER, - cb, context, cbdata); + ret = ompi_osc_rdma_btl_put(module, peer->data_btl_index, peer->data_endpoint, + ptr, target_address, local_handle, target_handle, + size, 0, MCA_BTL_NO_ORDER, cb, context, cbdata); if (OPAL_UNLIKELY(OMPI_SUCCESS == ret)) { return OMPI_SUCCESS; } @@ -706,10 +706,10 @@ static int ompi_osc_rdma_get_contig (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_p } do { - ret = btl->btl_get (btl, peer->data_endpoint, ptr, - aligned_source_base, local_handle, source_handle, - aligned_len, 0, MCA_BTL_NO_ORDER, ompi_osc_rdma_get_complete, - request, frag); + ret = ompi_osc_rdma_btl_get(module, peer->data_btl_index, peer->data_endpoint, + ptr, aligned_source_base, local_handle, source_handle, + aligned_len, 0, MCA_BTL_NO_ORDER, + ompi_osc_rdma_get_complete, request, frag); if (OPAL_LIKELY(OMPI_SUCCESS == ret)) { return OMPI_SUCCESS; } From 75e36e5e4dbfe0f211f1321224e9a1df92caf970 Mon Sep 17 00:00:00 2001 From: Brian Barrett Date: Mon, 10 Jan 2022 23:58:48 +0000 Subject: [PATCH 07/13] osc/rdma: Split btl storage based on mode Split the storage of btls based on whether the window is using accelerated or alternate btls. This makes it more obvious when the code has made assumptions about mode that may not be true (such as the memory registration calls throughout the code that assumed selected_btl[0] was the one true BTL). Signed-off-by: Brian Barrett --- ompi/mca/osc/rdma/osc_rdma.h | 63 ++++++++++--------- ompi/mca/osc/rdma/osc_rdma_component.c | 74 +++++++++++++--------- ompi/mca/osc/rdma/osc_rdma_dynamic.c | 3 +- ompi/mca/osc/rdma/osc_rdma_module.c | 4 +- ompi/mca/osc/rdma/osc_rdma_peer.c | 85 ++++++++++++++++++-------- 5 files changed, 144 insertions(+), 85 deletions(-) diff --git a/ompi/mca/osc/rdma/osc_rdma.h b/ompi/mca/osc/rdma/osc_rdma.h index 2a8aeae156d..c052ce11a23 100644 --- a/ompi/mca/osc/rdma/osc_rdma.h +++ b/ompi/mca/osc/rdma/osc_rdma.h @@ -57,8 +57,6 @@ #define RANK_ARRAY_COUNT(module) ((ompi_comm_size ((module)->comm) + (module)->node_count - 1) / (module)->node_count) -#define MCA_OSC_RDMA_BTLS_SIZE_INIT 4 - enum { OMPI_OSC_RDMA_LOCKING_TWO_LEVEL, OMPI_OSC_RDMA_LOCKING_ON_DEMAND, @@ -260,14 +258,23 @@ struct ompi_osc_rdma_module_t { /** lock for peer hash table/array */ opal_mutex_t peer_lock; - - /** BTL(s) in use. Currently this is only used to support RDMA emulation over - * non-RDMA BTLs. The typical usage is btl/sm + btl/tcp. In the future this - * could be used to support multiple RDMA-capable BTLs but the memory registration - * paths will need to be updated to pack/unpack multiple registration handles. */ - struct mca_btl_base_module_t **selected_btls; - uint8_t selected_btls_size; - uint8_t btls_in_use; + /* we currently support two modes of operation, a single + * accelerated btl (which can use memory registration and can use + * btl_flush() and one or more alternate btls, which cannot use + * flush() or rely on memory registration. Since it is an + * either/or situation, we use a union to simplify the code. + */ + bool use_accelerated_btl; + + union { + struct { + struct mca_btl_base_module_t *accelerated_btl; + }; + struct { + struct mca_btl_base_module_t **alternate_btls; + uint8_t alternate_btl_count; + }; + }; /** Only true if one BTL is in use. Memory registration is only supported when * using a single BTL. */ @@ -383,10 +390,11 @@ static inline int _ompi_osc_rdma_register (ompi_osc_rdma_module_t *module, struc size_t size, uint32_t flags, mca_btl_base_registration_handle_t **handle, int line, const char *file) { if (module->use_memory_registration) { + assert(module->use_accelerated_btl); OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "registering segment with btl. range: %p - %p (%lu bytes)", ptr, (void*)((char *) ptr + size), size); - *handle = module->selected_btls[0]->btl_register_mem (module->selected_btls[0], endpoint, ptr, size, flags); + *handle = module->accelerated_btl->btl_register_mem(module->accelerated_btl, endpoint, ptr, size, flags); if (OPAL_UNLIKELY(NULL == *handle)) { OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "failed to register pointer with selected BTL. base: %p, " "size: %lu. file: %s, line: %d", ptr, (unsigned long) size, file, line); @@ -404,7 +412,9 @@ static inline int _ompi_osc_rdma_register (ompi_osc_rdma_module_t *module, struc static inline void _ompi_osc_rdma_deregister (ompi_osc_rdma_module_t *module, mca_btl_base_registration_handle_t *handle, int line, const char *file) { if (handle) { - module->selected_btls[0]->btl_deregister_mem (module->selected_btls[0], handle); + assert(module->use_memory_registration); + assert(module->use_accelerated_btl); + module->accelerated_btl->btl_deregister_mem(module->accelerated_btl, handle); } } @@ -536,10 +546,11 @@ static inline ompi_osc_rdma_sync_t *ompi_osc_rdma_module_sync_lookup (ompi_osc_r static bool ompi_osc_rdma_use_btl_flush (ompi_osc_rdma_module_t *module) { #if defined(BTL_VERSION) && (BTL_VERSION >= 310) - return !!(module->selected_btls[0]->btl_flush); -#else - return false; + if (module->use_accelerated_btl) { + return (NULL != module->accelerated_btl->btl_flush); + } #endif + return false; } /** @@ -601,13 +612,13 @@ static inline void ompi_osc_rdma_sync_rdma_complete (ompi_osc_rdma_sync_t *sync) opal_progress (); } while (ompi_osc_rdma_sync_get_count (sync)); #else - mca_btl_base_module_t *btl_module = sync->module->selected_btls[0]; - do { if (!ompi_osc_rdma_use_btl_flush (sync->module)) { opal_progress (); } else { - btl_module->btl_flush (btl_module, NULL); + assert(sync->module->use_accelerated_btl); + mca_btl_base_module_t *btl_module = sync->module->accelerated_btl; + btl_module->btl_flush(btl_module, NULL); } } while (ompi_osc_rdma_sync_get_count (sync) || (sync->module->rdma_frag && (sync->module->rdma_frag->pending > 1))); #endif @@ -637,17 +648,13 @@ static inline bool ompi_osc_rdma_oor (int rc) __opal_attribute_always_inline__ static inline mca_btl_base_module_t *ompi_osc_rdma_selected_btl (ompi_osc_rdma_module_t *module, uint8_t btl_index) { - return module->selected_btls[btl_index]; -} - -__opal_attribute_always_inline__ -static inline void ompi_osc_rdma_selected_btl_insert (ompi_osc_rdma_module_t *module, struct mca_btl_base_module_t *btl, uint8_t btl_index) { - if(btl_index == module->selected_btls_size) { - module->selected_btls_size *= 2; - module->selected_btls = realloc(module->selected_btls, module->selected_btls_size * sizeof(struct mca_btl_base_module_t *)); - assert(NULL != module->selected_btls); + if (module->use_accelerated_btl) { + assert(0 == btl_index); + return module->accelerated_btl; + } else { + assert(btl_index < module->alternate_btl_count); + return module->alternate_btls[btl_index]; } - module->selected_btls[btl_index] = btl; } #endif /* OMPI_OSC_RDMA_H */ diff --git a/ompi/mca/osc/rdma/osc_rdma_component.c b/ompi/mca/osc/rdma/osc_rdma_component.c index eb2f0303650..8a87bab8640 100644 --- a/ompi/mca/osc/rdma/osc_rdma_component.c +++ b/ompi/mca/osc/rdma/osc_rdma_component.c @@ -410,6 +410,7 @@ static int ompi_osc_rdma_initialize_region (ompi_osc_rdma_module_t *module, void region->len = size; if (module->use_memory_registration && size) { + assert(module->use_accelerated_btl); if (MPI_WIN_FLAVOR_ALLOCATE != module->flavor || NULL == module->state_handle) { ret = ompi_osc_rdma_register (module, MCA_BTL_ENDPOINT_ANY, *base, size, MCA_BTL_REG_FLAG_ACCESS_ANY, &module->base_handle); @@ -417,9 +418,9 @@ static int ompi_osc_rdma_initialize_region (ompi_osc_rdma_module_t *module, void return OMPI_ERR_OUT_OF_RESOURCE; } - memcpy (region->btl_handle_data, module->base_handle, module->selected_btls[0]->btl_registration_handle_size); + memcpy (region->btl_handle_data, module->base_handle, module->accelerated_btl->btl_registration_handle_size); } else { - memcpy (region->btl_handle_data, module->state_handle, module->selected_btls[0]->btl_registration_handle_size); + memcpy (region->btl_handle_data, module->state_handle, module->accelerated_btl->btl_registration_handle_size); } } @@ -580,8 +581,12 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s module->use_cpu_atomics = module->single_node; if (!module->single_node) { - for (int i = 0 ; i < module->btls_in_use ; ++i) { - module->use_cpu_atomics = module->use_cpu_atomics && !!(module->selected_btls[i]->btl_flags & MCA_BTL_ATOMIC_SUPPORTS_GLOB); + if (module->use_accelerated_btl) { + module->use_cpu_atomics = !!(module->accelerated_btl->btl_flags & MCA_BTL_ATOMIC_SUPPORTS_GLOB); + } else { + for (int i = 0 ; i < module->alternate_btl_count ; ++i) { + module->use_cpu_atomics &= !!(module->alternate_btls[i]->btl_flags & MCA_BTL_ATOMIC_SUPPORTS_GLOB); + } } } @@ -703,14 +708,16 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s if (0 == local_rank) { /* unlink the shared memory backing file */ opal_shmem_unlink (&module->seg_ds); - /* just go ahead and register the whole segment */ - ret = ompi_osc_rdma_register (module, MCA_BTL_ENDPOINT_ANY, module->segment_base, total_size, - MCA_BTL_REG_FLAG_ACCESS_ANY, &module->state_handle); - if (OPAL_LIKELY(OMPI_SUCCESS == ret)) { - state_region->base = (intptr_t) module->segment_base; - if (module->state_handle) { - memcpy (state_region->btl_handle_data, module->state_handle, - module->selected_btls[0]->btl_registration_handle_size); + if (module->use_accelerated_btl) { + /* just go ahead and register the whole segment */ + ret = ompi_osc_rdma_register(module, MCA_BTL_ENDPOINT_ANY, module->segment_base, total_size, + MCA_BTL_REG_FLAG_ACCESS_ANY, &module->state_handle); + if (OPAL_LIKELY(OMPI_SUCCESS == ret)) { + state_region->base = (intptr_t) module->segment_base; + if (module->state_handle) { + memcpy(state_region->btl_handle_data, module->state_handle, + module->accelerated_btl->btl_registration_handle_size); + } } } } @@ -730,8 +737,9 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s region->base = state_region->base + my_base_offset; region->len = size; if (module->use_memory_registration) { - memcpy (region->btl_handle_data, state_region->btl_handle_data, - module->selected_btls[0]->btl_registration_handle_size); + assert(module->use_accelerated_btl); + memcpy(region->btl_handle_data, state_region->btl_handle_data, + module->accelerated_btl->btl_registration_handle_size); } } @@ -910,12 +918,23 @@ static int btl_latency_sort_fn(const void *a, const void *b) */ static int ompi_osc_rdma_query_alternate_btls (ompi_communicator_t *comm, ompi_osc_rdma_module_t *module) { + size_t btl_count; + size_t index = 0; mca_btl_base_selected_module_t *item; int ret; assert(NULL != module); - module->btls_in_use = 0; + btl_count = opal_list_get_size(&mca_btl_base_modules_initialized); + if (btl_count > UINT8_MAX) { + return OMPI_ERROR; + } + + module->alternate_btl_count = btl_count; + module->alternate_btls = malloc(sizeof(struct mca_btl_base_module_t *) * btl_count); + if (NULL == module->alternate_btls) { + return OMPI_ERR_TEMP_OUT_OF_RESOURCE; + } /* add all alternate btls to the selected_btls list, not worrying about ordering yet. We have to add all btls unless we want to @@ -937,17 +956,17 @@ static int ompi_osc_rdma_query_alternate_btls (ompi_communicator_t *comm, ompi_o if (OMPI_SUCCESS != ret) { return ret; } - ompi_osc_rdma_selected_btl_insert(module, item->btl_module, module->btls_in_use++); + module->alternate_btls[index++] = item->btl_module; } + assert(index == btl_count); /* sort based on latency, lowest first */ - qsort(module->selected_btls, module->btls_in_use, + qsort(module->alternate_btls, module->alternate_btl_count, sizeof(struct mca_btl_base_module_t*), btl_latency_sort_fn); - /* osc/rdma always use active message RDMA/atomics on alternate btls, whic does not require explicit memory registration */ module->use_memory_registration = false; - return module->btls_in_use > 0 ? OMPI_SUCCESS : OMPI_ERR_UNREACH; + return OMPI_SUCCESS; } @@ -991,8 +1010,7 @@ static int ompi_osc_rdma_query_accelerated_btls (ompi_communicator_t *comm, ompi assert(NULL != module); - ompi_osc_rdma_selected_btl_insert(module, NULL, 0); - module->btls_in_use = 0; + module->use_accelerated_btl = false; module->use_memory_registration = false; /* Check for BTLs in the list of BTLs we know can reach all peers @@ -1106,8 +1124,8 @@ static int ompi_osc_rdma_query_accelerated_btls (ompi_communicator_t *comm, ompi } btl_selection_complete: - ompi_osc_rdma_selected_btl_insert(module, selected_btl, 0); - module->btls_in_use = 1; + module->use_accelerated_btl = true; + module->accelerated_btl = selected_btl; module->use_memory_registration = selected_btl->btl_register_mem != NULL; opal_output_verbose(MCA_BASE_VERBOSE_INFO, ompi_osc_base_framework.framework_output, @@ -1152,7 +1170,8 @@ static int ompi_osc_rdma_share_data (ompi_osc_rdma_module_t *module) my_data->len = (osc_rdma_size_t) my_rank; if (module->use_memory_registration && module->state_handle) { - memcpy (my_data->btl_handle_data, module->state_handle, module->selected_btls[0]->btl_registration_handle_size); + assert(module->use_accelerated_btl); + memcpy (my_data->btl_handle_data, module->state_handle, module->accelerated_btl->btl_registration_handle_size); } /* gather state data at each node leader */ @@ -1326,9 +1345,6 @@ static int ompi_osc_rdma_component_select (struct ompi_win_t *win, void **base, module->acc_use_amo = mca_osc_rdma_component.acc_use_amo; module->network_amo_max_count = mca_osc_rdma_component.network_amo_max_count; - module->selected_btls_size = MCA_OSC_RDMA_BTLS_SIZE_INIT; - module->selected_btls = calloc(module->selected_btls_size, sizeof(struct mca_btl_base_module_t *)); - module->all_sync.module = module; module->flavor = flavor; @@ -1386,6 +1402,7 @@ static int ompi_osc_rdma_component_select (struct ompi_win_t *win, void **base, } /* find rdma capable endpoints */ + module->use_accelerated_btl = false; ret = ompi_osc_rdma_query_accelerated_btls (module->comm, module); if (OMPI_SUCCESS != ret) { opal_output_verbose(MCA_BASE_VERBOSE_WARN, ompi_osc_base_framework.framework_output, @@ -1404,7 +1421,8 @@ static int ompi_osc_rdma_component_select (struct ompi_win_t *win, void **base, module->region_size = sizeof (ompi_osc_rdma_region_t); if (module->use_memory_registration) { - module->region_size += module->selected_btls[0]->btl_registration_handle_size; + assert(module->use_accelerated_btl); + module->region_size += module->accelerated_btl->btl_registration_handle_size; } module->state_size = sizeof (ompi_osc_rdma_state_t); diff --git a/ompi/mca/osc/rdma/osc_rdma_dynamic.c b/ompi/mca/osc/rdma/osc_rdma_dynamic.c index 8adfa7f8159..61e14fea56c 100644 --- a/ompi/mca/osc/rdma/osc_rdma_dynamic.c +++ b/ompi/mca/osc/rdma/osc_rdma_dynamic.c @@ -252,7 +252,8 @@ int ompi_osc_rdma_attach (struct ompi_win_t *win, void *base, size_t len) return OMPI_ERR_RMA_ATTACH; } - memcpy (region->btl_handle_data, handle, module->selected_btls[0]->btl_registration_handle_size); + assert(module->use_accelerated_btl); + memcpy(region->btl_handle_data, handle, module->accelerated_btl->btl_registration_handle_size); rdma_region_handle->btl_handle = handle; } else { rdma_region_handle->btl_handle = NULL; diff --git a/ompi/mca/osc/rdma/osc_rdma_module.c b/ompi/mca/osc/rdma/osc_rdma_module.c index 933baf00694..648b61414e6 100644 --- a/ompi/mca/osc/rdma/osc_rdma_module.c +++ b/ompi/mca/osc/rdma/osc_rdma_module.c @@ -144,7 +144,9 @@ int ompi_osc_rdma_free(ompi_win_t *win) free (module->outstanding_lock_array); mca_mpool_base_default_module->mpool_free(mca_mpool_base_default_module, module->free_after); - free (module->selected_btls); + if (!module->use_accelerated_btl) { + free(module->alternate_btls); + } free (module); return OMPI_SUCCESS; diff --git a/ompi/mca/osc/rdma/osc_rdma_peer.c b/ompi/mca/osc/rdma/osc_rdma_peer.c index c6689d78812..6286fedeb69 100644 --- a/ompi/mca/osc/rdma/osc_rdma_peer.c +++ b/ompi/mca/osc/rdma/osc_rdma_peer.c @@ -40,43 +40,74 @@ static int ompi_osc_rdma_peer_btl_endpoint (struct ompi_osc_rdma_module_t *modul struct mca_btl_base_endpoint_t **endpoint) { ompi_proc_t *proc = ompi_comm_peer_lookup (module->comm, peer_id); - mca_bml_base_endpoint_t *bml_endpoint; - int num_btls; - - /* for now just use the bml to get the btl endpoint */ - bml_endpoint = mca_bml_base_get_endpoint (proc); - - num_btls = mca_bml_base_btl_array_get_size (&bml_endpoint->btl_rdma); - - for (int module_btl_index = 0 ; module_btl_index < module->btls_in_use ; ++module_btl_index) { - for (int btl_index = 0 ; btl_index < num_btls ; ++btl_index) { - if (bml_endpoint->btl_rdma.bml_btls[btl_index].btl == module->selected_btls[module_btl_index]) { - *btl_index_out = module_btl_index; - *endpoint = bml_endpoint->btl_rdma.bml_btls[btl_index].btl_endpoint; - return OMPI_SUCCESS; - } + mca_bml_base_endpoint_t *bml_endpoint = mca_bml_base_get_endpoint(proc); + + if (module->use_accelerated_btl) { + opal_output_verbose(MCA_BASE_VERBOSE_TRACE, ompi_osc_base_framework.framework_output, + "rank %d: accelerated btl search for peer %d", + ompi_comm_rank(module->comm), peer_id); + mca_bml_base_btl_t *bml_btl = mca_bml_base_btl_array_find(&bml_endpoint->btl_rdma, + module->accelerated_btl); + if (NULL != bml_btl) { + *btl_index_out = 0; + *endpoint = bml_btl->btl_endpoint; + + return OMPI_SUCCESS; } - } + } else { + mca_bml_base_btl_t *bml_btl; + opal_output_verbose(MCA_BASE_VERBOSE_TRACE, ompi_osc_base_framework.framework_output, + "rank %d: alternate btl search for peer %d", + ompi_comm_rank(module->comm), peer_id); + + /* the non accelerated case is a bit difficult compared to the + * accelerated case. The right BTL could be in either the + * rdma or eager endpoint list, because we're using the am + * rdma interface to provide RDMA semantics. The important + * part is that we search the alternate_btls list in order, + * since it is sorted by latency. + */ + for (int osc_btl_idx = 0 ; osc_btl_idx < module->alternate_btl_count ; ++osc_btl_idx) { + mca_btl_base_module_t *search_btl = ompi_osc_rdma_selected_btl(module, osc_btl_idx); + const char *source = NULL; + + opal_output_verbose(MCA_BASE_VERBOSE_TRACE, ompi_osc_base_framework.framework_output, + "rank %d comparing with btl %s, %d", + ompi_comm_rank(module->comm), + search_btl->btl_component->btl_version.mca_component_name, + osc_btl_idx); + + source = "rdma"; + bml_btl = mca_bml_base_btl_array_find(&bml_endpoint->btl_rdma, search_btl); + if (NULL == bml_btl) { + source = "eager"; + bml_btl = mca_bml_base_btl_array_find(&bml_endpoint->btl_eager, search_btl); + } + if (NULL != bml_btl) { + *btl_index_out = osc_btl_idx; + *endpoint = bml_btl->btl_endpoint; - /* if this is a non-RDMA btl then the endpoint may be listed under eager */ - num_btls = mca_bml_base_btl_array_get_size (&bml_endpoint->btl_eager); + opal_output_verbose(MCA_BASE_VERBOSE_TRACE, ompi_osc_base_framework.framework_output, + "rank %d found btl for peer %d (%s, %d, %s)", + ompi_comm_rank(module->comm), peer_id, + bml_btl->btl->btl_component->btl_version.mca_component_name, + osc_btl_idx, source); - for (int module_btl_index = 0 ; module_btl_index < module->btls_in_use ; ++module_btl_index) { - for (int btl_index = 0 ; btl_index < num_btls ; ++btl_index) { - if (bml_endpoint->btl_eager.bml_btls[btl_index].btl == module->selected_btls[module_btl_index]) { - *btl_index_out = module_btl_index; - *endpoint = bml_endpoint->btl_eager.bml_btls[btl_index].btl_endpoint; return OMPI_SUCCESS; } } } + opal_output_verbose(MCA_BASE_VERBOSE_ERROR, ompi_osc_base_framework.framework_output, + "rank %d: failed peer search for peer %d", + ompi_comm_rank(module->comm), peer_id); + /* unlikely but can happen when creating a peer for self */ return OMPI_ERR_UNREACH; } int ompi_osc_rdma_new_peer (struct ompi_osc_rdma_module_t *module, int peer_id, ompi_osc_rdma_peer_t **peer_out) { - struct mca_btl_base_endpoint_t *endpoint; + struct mca_btl_base_endpoint_t *endpoint = NULL; ompi_osc_rdma_peer_t *peer; uint8_t module_btl_index = UINT8_MAX; @@ -84,8 +115,7 @@ int ompi_osc_rdma_new_peer (struct ompi_osc_rdma_module_t *module, int peer_id, /* find a btl/endpoint to use for this peer */ int ret = ompi_osc_rdma_peer_btl_endpoint (module, peer_id, &module_btl_index, &endpoint); - if (OPAL_UNLIKELY(OMPI_SUCCESS != ret && !((module->selected_btls[0]->btl_atomic_flags & MCA_BTL_ATOMIC_SUPPORTS_GLOB) && - peer_id == ompi_comm_rank (module->comm)))) { + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { return ret; } @@ -134,7 +164,8 @@ static int ompi_osc_rdma_peer_setup (ompi_osc_rdma_module_t *module, ompi_osc_rd OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "configuring peer for rank %d", peer->rank); if (module->use_memory_registration) { - registration_handle_size = module->selected_btls[0]->btl_registration_handle_size; + assert(module->use_accelerated_btl); + registration_handle_size = module->accelerated_btl->btl_registration_handle_size; } /* each node is responsible for holding a part of the rank -> node/local rank mapping array. this code From 222b7f3f23202597ff925173f6140c1260e720c2 Mon Sep 17 00:00:00 2001 From: Brian Barrett Date: Tue, 11 Jan 2022 20:09:36 +0000 Subject: [PATCH 08/13] osc/rdma: Remove duplicate op emulation check ompi_osc_rdma_btl_op() already includes a check to emulate a remote op with a remote fetch-and-op, so there's no need for a second check in ompi_osc_rdma_acc_single_atomic(). Signed-off-by: Brian Barrett --- ompi/mca/osc/rdma/osc_rdma_accumulate.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/ompi/mca/osc/rdma/osc_rdma_accumulate.c b/ompi/mca/osc/rdma/osc_rdma_accumulate.c index 404546f2f08..120ffbf42d3 100644 --- a/ompi/mca/osc/rdma/osc_rdma_accumulate.c +++ b/ompi/mca/osc/rdma/osc_rdma_accumulate.c @@ -244,12 +244,6 @@ static int ompi_osc_rdma_acc_single_atomic (ompi_osc_rdma_sync_t *sync, const vo int btl_op, flags; int64_t origin; - if (!(selected_btl->btl_flags & MCA_BTL_FLAGS_ATOMIC_OPS)) { - /* btl put atomics not supported or disabled. fall back on fetch-and-op */ - return ompi_osc_rdma_fetch_and_op_atomic (sync, origin_addr, NULL, dt, extent, peer, target_address, target_handle, - op, req); - } - if ((8 != extent && !((MCA_BTL_ATOMIC_SUPPORTS_32BIT & atomic_flags) && 4 == extent)) || (!(OMPI_DATATYPE_FLAG_DATA_INT & dt->super.flags) && !(MCA_BTL_ATOMIC_SUPPORTS_FLOAT & atomic_flags)) || !ompi_op_is_intrinsic (op) || (0 == ompi_osc_rdma_op_mapping[op->op_type])) { From 6e2af1887ed899eabb60f23a6b8fa31666cc47fc Mon Sep 17 00:00:00 2001 From: Brian Barrett Date: Tue, 11 Jan 2022 20:25:41 +0000 Subject: [PATCH 09/13] osc/rdma: Remove usage of custom min() function Use the opal_min() function instead of a custom macro. Signed-off-by: Brian Barrett --- ompi/mca/osc/rdma/osc_rdma_accumulate.c | 7 +++++-- ompi/mca/osc/rdma/osc_rdma_comm.c | 7 +++++-- ompi/mca/osc/rdma/osc_rdma_comm.h | 1 - 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/ompi/mca/osc/rdma/osc_rdma_accumulate.c b/ompi/mca/osc/rdma/osc_rdma_accumulate.c index 120ffbf42d3..41d204ed059 100644 --- a/ompi/mca/osc/rdma/osc_rdma_accumulate.c +++ b/ompi/mca/osc/rdma/osc_rdma_accumulate.c @@ -19,12 +19,15 @@ * $HEADER$ */ +#include "ompi_config.h" + #include "osc_rdma_accumulate.h" #include "osc_rdma_request.h" #include "osc_rdma_comm.h" #include "osc_rdma_lock.h" #include "osc_rdma_btl_comm.h" +#include "opal/util/minmax.h" #include "ompi/mca/osc/base/base.h" #include "ompi/mca/osc/base/osc_base_obj_convert.h" @@ -583,9 +586,9 @@ static inline int ompi_osc_rdma_gacc_master (ompi_osc_rdma_sync_t *sync, const v /* determine how much to put in this operation */ if (source_count) { - acc_len = min(min(target_iovec[target_iov_index].iov_len, source_iovec[source_iov_index].iov_len), acc_limit); + acc_len = opal_min(opal_min(target_iovec[target_iov_index].iov_len, source_iovec[source_iov_index].iov_len), acc_limit); } else { - acc_len = min(target_iovec[target_iov_index].iov_len, acc_limit); + acc_len = opal_min(target_iovec[target_iov_index].iov_len, acc_limit); } if (0 != acc_len) { diff --git a/ompi/mca/osc/rdma/osc_rdma_comm.c b/ompi/mca/osc/rdma/osc_rdma_comm.c index ea8665c465f..17448892870 100644 --- a/ompi/mca/osc/rdma/osc_rdma_comm.c +++ b/ompi/mca/osc/rdma/osc_rdma_comm.c @@ -15,6 +15,8 @@ * $HEADER$ */ +#include "ompi_config.h" + #include "osc_rdma_comm.h" #include "osc_rdma_frag.h" #include "osc_rdma_sync.h" @@ -22,8 +24,9 @@ #include "osc_rdma_dynamic.h" #include "osc_rdma_btl_comm.h" -#include "ompi/mca/osc/base/osc_base_obj_convert.h" #include "opal/align.h" +#include "opal/util/minmax.h" +#include "ompi/mca/osc/base/osc_base_obj_convert.h" /* helper functions */ static inline void ompi_osc_rdma_cleanup_rdma (ompi_osc_rdma_sync_t *sync, bool dec_always, ompi_osc_rdma_frag_t *frag, @@ -246,7 +249,7 @@ static int ompi_osc_rdma_master_noncontig (ompi_osc_rdma_sync_t *sync, void *loc assert (0 != local_iov_count); /* determine how much to transfer in this operation */ - rdma_len = min(min(local_iovec[local_iov_index].iov_len, remote_iovec[remote_iov_index].iov_len), max_rdma_len); + rdma_len = opal_min(opal_min(local_iovec[local_iov_index].iov_len, remote_iovec[remote_iov_index].iov_len), max_rdma_len); /* execute the get */ if (!subreq && alloc_reqs) { diff --git a/ompi/mca/osc/rdma/osc_rdma_comm.h b/ompi/mca/osc/rdma/osc_rdma_comm.h index 7b722c4fe69..82c1e873263 100644 --- a/ompi/mca/osc/rdma/osc_rdma_comm.h +++ b/ompi/mca/osc/rdma/osc_rdma_comm.h @@ -22,7 +22,6 @@ #define OMPI_OSC_RDMA_DECODE_MAX 64 -#define min(a,b) ((a) < (b) ? (a) : (b)) #define ALIGNMENT_MASK(x) ((x) ? (x) - 1 : 0) /** From 94fed3b47278b5b540058f88a62931d3ac3f7d3b Mon Sep 17 00:00:00 2001 From: Brian Barrett Date: Tue, 11 Jan 2022 20:39:01 +0000 Subject: [PATCH 10/13] osc/rdma: Clean up use_cpu_atomics behaivor The use_cpu_atomics member of the rdma module was only used during initialization, so there was no reason for it to be a module member. Also clean up the initialization logic for the field (and therefore, whether or not the shared state optimization is used). Previously, it appeared to be possible to use with alternate btls across multiple nodes (ie, we tested the GLOB value on the btls), but the use_cpu_atomics field was initialized to false in the multi-node case and so they would never actually be enabled. Make that more obvious, rather than try to enable the optimization, to reduce the risk of introducing new datapath bugs. Signed-off-by: Brian Barrett --- ompi/mca/osc/rdma/osc_rdma.h | 3 -- ompi/mca/osc/rdma/osc_rdma_component.c | 40 +++++++++++++++----------- 2 files changed, 23 insertions(+), 20 deletions(-) diff --git a/ompi/mca/osc/rdma/osc_rdma.h b/ompi/mca/osc/rdma/osc_rdma.h index c052ce11a23..c70dacd4b0d 100644 --- a/ompi/mca/osc/rdma/osc_rdma.h +++ b/ompi/mca/osc/rdma/osc_rdma.h @@ -148,9 +148,6 @@ struct ompi_osc_rdma_module_t { /** value of same_size info key for this window */ bool same_size; - /** CPU atomics can be used */ - bool use_cpu_atomics; - /** passive-target synchronization will not be used in this window */ bool no_locks; diff --git a/ompi/mca/osc/rdma/osc_rdma_component.c b/ompi/mca/osc/rdma/osc_rdma_component.c index 8a87bab8640..25dbd7bb0fe 100644 --- a/ompi/mca/osc/rdma/osc_rdma_component.c +++ b/ompi/mca/osc/rdma/osc_rdma_component.c @@ -427,7 +427,7 @@ static int ompi_osc_rdma_initialize_region (ompi_osc_rdma_module_t *module, void return OMPI_SUCCESS; } -static int allocate_state_single (ompi_osc_rdma_module_t *module, void **base, size_t size) +static int allocate_state_single (ompi_osc_rdma_module_t *module, void **base, size_t size, bool use_cpu_atomics) { size_t total_size, local_rank_array_size, leader_peer_data_size, base_data_size; ompi_osc_rdma_peer_t *my_peer; @@ -507,7 +507,7 @@ static int allocate_state_single (ompi_osc_rdma_module_t *module, void **base, s my_peer->flags |= OMPI_OSC_RDMA_PEER_LOCAL_BASE; my_peer->state = (uint64_t) (uintptr_t) module->state; - if (module->use_cpu_atomics) { + if (use_cpu_atomics) { /* all peers are local or it is safe to mix cpu and nic atomics */ my_peer->flags |= OMPI_OSC_RDMA_PEER_LOCAL_STATE; } else { @@ -526,7 +526,7 @@ static int allocate_state_single (ompi_osc_rdma_module_t *module, void **base, s ex_peer->size = size; } - if (!module->use_cpu_atomics) { + if (!use_cpu_atomics) { if (MPI_WIN_FLAVOR_ALLOCATE == module->flavor) { /* base is local and cpu atomics are available */ ex_peer->super.base_handle = module->state_handle; @@ -570,6 +570,7 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s struct _local_data *temp; char *data_file; size_t memory_alignment = module->memory_alignment; + bool use_cpu_atomics; shared_comm = module->shared_comm; @@ -578,21 +579,26 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s /* CPU atomics can be used if every process is on the same node or the NIC allows mixing CPU and NIC atomics */ module->single_node = local_size == global_size; - module->use_cpu_atomics = module->single_node; - if (!module->single_node) { - if (module->use_accelerated_btl) { - module->use_cpu_atomics = !!(module->accelerated_btl->btl_flags & MCA_BTL_ATOMIC_SUPPORTS_GLOB); - } else { - for (int i = 0 ; i < module->alternate_btl_count ; ++i) { - module->use_cpu_atomics &= !!(module->alternate_btls[i]->btl_flags & MCA_BTL_ATOMIC_SUPPORTS_GLOB); - } - } + if (module->single_node) { + use_cpu_atomics = true; + } else if (module->use_accelerated_btl) { + use_cpu_atomics = !!(module->accelerated_btl->btl_flags & MCA_BTL_ATOMIC_SUPPORTS_GLOB); + } else { + /* using the shared state optimization that is enabled by + * being able to use cpu atomics was never enabled for + * alternate btls, due to a previous bug in the enablement + * logic when alternate btls were first supported. It is + * likely that this optimization could work with sufficient + * testing, but for now, always disable to not introduce new + * correctness risks. + */ + use_cpu_atomics = false; } if (1 == local_size) { /* no point using a shared segment if there are no other processes on this node */ - return allocate_state_single (module, base, size); + return allocate_state_single (module, base, size, use_cpu_atomics); } opal_output_verbose(MCA_BASE_VERBOSE_TRACE, ompi_osc_base_framework.framework_output, @@ -771,7 +777,7 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s ex_peer = (ompi_osc_rdma_peer_extended_t *) peer; /* set up peer state */ - if (module->use_cpu_atomics) { + if (use_cpu_atomics) { /* all peers are local or it is safe to mix cpu and nic atomics */ peer->flags |= OMPI_OSC_RDMA_PEER_LOCAL_STATE; peer->state = (osc_rdma_counter_t) peer_state; @@ -796,7 +802,7 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s } if (MPI_WIN_FLAVOR_DYNAMIC != module->flavor && MPI_WIN_FLAVOR_CREATE != module->flavor && - !module->use_cpu_atomics && temp[i].size && i > 0) { + !use_cpu_atomics && temp[i].size && i > 0) { /* use the local leader's endpoint */ peer->data_endpoint = local_leader->data_endpoint; peer->data_btl_index = local_leader->data_btl_index; @@ -805,7 +811,7 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s ompi_osc_module_add_peer (module, peer); if (MPI_WIN_FLAVOR_DYNAMIC == module->flavor) { - if (module->use_cpu_atomics && peer_rank == my_rank) { + if (use_cpu_atomics && peer_rank == my_rank) { peer->flags |= OMPI_OSC_RDMA_PEER_LOCAL_BASE; } /* nothing more to do */ @@ -821,7 +827,7 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s ex_peer->size = temp[i].size; } - if (module->use_cpu_atomics && (MPI_WIN_FLAVOR_ALLOCATE == module->flavor || peer_rank == my_rank)) { + if (use_cpu_atomics && (MPI_WIN_FLAVOR_ALLOCATE == module->flavor || peer_rank == my_rank)) { /* base is local and cpu atomics are available */ if (MPI_WIN_FLAVOR_ALLOCATE == module->flavor) { ex_peer->super.base = (uintptr_t) module->segment_base + offset; From 4080d5d29ff4e9ee225f7b4937db89bac6aa1f8f Mon Sep 17 00:00:00 2001 From: Brian Barrett Date: Tue, 11 Jan 2022 00:06:11 +0000 Subject: [PATCH 11/13] osc/rdma: Use BTL am-rdma explicit interface Switch from using the implicit BTL interface (where the am-rdma interface just extends missing functionality in the BTL) to the new explicit interface (where the OSC RDMA interface is the only maintainer of the BTL list. With this change, alternate BTLs do not have to support REMOTE_COMPLETION to be selected (because the AM RDMA interface always provides remote completion when we request it, as this patch does). Any BTL that supports Active Messages (ie, all of them) should be able to support the OSC RDMA required semantics, eliminating the problem of creating windows with no servicable BTLs. Signed-off-by: Brian Barrett --- ompi/mca/osc/rdma/osc_rdma.h | 29 +++- ompi/mca/osc/rdma/osc_rdma_accumulate.c | 40 +++--- ompi/mca/osc/rdma/osc_rdma_btl_comm.h | 77 ++++++++--- ompi/mca/osc/rdma/osc_rdma_comm.c | 174 ++++++++++++------------ ompi/mca/osc/rdma/osc_rdma_component.c | 61 +++++++-- ompi/mca/osc/rdma/osc_rdma_module.c | 5 +- 6 files changed, 239 insertions(+), 147 deletions(-) diff --git a/ompi/mca/osc/rdma/osc_rdma.h b/ompi/mca/osc/rdma/osc_rdma.h index c70dacd4b0d..94b6cb641c8 100644 --- a/ompi/mca/osc/rdma/osc_rdma.h +++ b/ompi/mca/osc/rdma/osc_rdma.h @@ -44,6 +44,7 @@ #include "ompi/mca/osc/osc.h" #include "ompi/mca/osc/base/base.h" #include "opal/mca/btl/btl.h" +#include "opal/mca/btl/base/btl_base_am_rdma.h" #include "ompi/memchecker.h" #include "ompi/op/op.h" #include "opal/align.h" @@ -255,6 +256,8 @@ struct ompi_osc_rdma_module_t { /** lock for peer hash table/array */ opal_mutex_t peer_lock; + /* ******************* communication *********************** */ + /* we currently support two modes of operation, a single * accelerated btl (which can use memory registration and can use * btl_flush() and one or more alternate btls, which cannot use @@ -265,18 +268,27 @@ struct ompi_osc_rdma_module_t { union { struct { - struct mca_btl_base_module_t *accelerated_btl; + mca_btl_base_module_t *accelerated_btl; }; struct { - struct mca_btl_base_module_t **alternate_btls; + mca_btl_base_am_rdma_module_t **alternate_am_rdmas; uint8_t alternate_btl_count; }; }; - /** Only true if one BTL is in use. Memory registration is only supported when - * using a single BTL. */ + /** Does the selected BTL require memory registration? This field + will be false when alternate BTLs are used, and the value + when an accelerated BTL is used depends on the registration + requirements of the underlying BTL. */ bool use_memory_registration; + size_t put_alignment; + size_t get_alignment; + size_t put_limit; + size_t get_limit; + + uint32_t atomic_flags; + /** registered fragment used for locally buffered RDMA transfers */ struct ompi_osc_rdma_frag_t *rdma_frag; @@ -650,8 +662,15 @@ static inline mca_btl_base_module_t *ompi_osc_rdma_selected_btl (ompi_osc_rdma_m return module->accelerated_btl; } else { assert(btl_index < module->alternate_btl_count); - return module->alternate_btls[btl_index]; + return module->alternate_am_rdmas[btl_index]->btl; } } + +static inline mca_btl_base_am_rdma_module_t *ompi_osc_rdma_selected_am_rdma(ompi_osc_rdma_module_t *module, uint8_t btl_index) { + assert(!module->use_accelerated_btl); + assert(btl_index < module->alternate_btl_count); + return module->alternate_am_rdmas[btl_index]; +} + #endif /* OMPI_OSC_RDMA_H */ diff --git a/ompi/mca/osc/rdma/osc_rdma_accumulate.c b/ompi/mca/osc/rdma/osc_rdma_accumulate.c index 41d204ed059..0cec49a1d80 100644 --- a/ompi/mca/osc/rdma/osc_rdma_accumulate.c +++ b/ompi/mca/osc/rdma/osc_rdma_accumulate.c @@ -164,13 +164,11 @@ static int ompi_osc_rdma_fetch_and_op_atomic (ompi_osc_rdma_sync_t *sync, const mca_btl_base_registration_handle_t *target_handle, ompi_op_t *op, ompi_osc_rdma_request_t *req) { ompi_osc_rdma_module_t *module = sync->module; - mca_btl_base_module_t *selected_btl = ompi_osc_rdma_selected_btl (module, peer->data_btl_index); - int32_t atomic_flags = selected_btl->btl_atomic_flags; int btl_op, flags; int64_t origin; - if ((8 != extent && !((MCA_BTL_ATOMIC_SUPPORTS_32BIT & atomic_flags) && 4 == extent)) || - (!(OMPI_DATATYPE_FLAG_DATA_INT & dt->super.flags) && !(MCA_BTL_ATOMIC_SUPPORTS_FLOAT & atomic_flags)) || + if ((8 != extent && !((MCA_BTL_ATOMIC_SUPPORTS_32BIT & module->atomic_flags) && 4 == extent)) || + (!(OMPI_DATATYPE_FLAG_DATA_INT & dt->super.flags) && !(MCA_BTL_ATOMIC_SUPPORTS_FLOAT & module->atomic_flags)) || !ompi_op_is_intrinsic (op) || (0 == ompi_osc_rdma_op_mapping[op->op_type])) { return OMPI_ERR_NOT_SUPPORTED; } @@ -242,13 +240,11 @@ static int ompi_osc_rdma_acc_single_atomic (ompi_osc_rdma_sync_t *sync, const vo ompi_op_t *op, ompi_osc_rdma_request_t *req) { ompi_osc_rdma_module_t *module = sync->module; - mca_btl_base_module_t *selected_btl = ompi_osc_rdma_selected_btl (module, peer->data_btl_index); - int32_t atomic_flags = selected_btl->btl_atomic_flags; int btl_op, flags; int64_t origin; - if ((8 != extent && !((MCA_BTL_ATOMIC_SUPPORTS_32BIT & atomic_flags) && 4 == extent)) || - (!(OMPI_DATATYPE_FLAG_DATA_INT & dt->super.flags) && !(MCA_BTL_ATOMIC_SUPPORTS_FLOAT & atomic_flags)) || + if ((8 != extent && !((MCA_BTL_ATOMIC_SUPPORTS_32BIT & module->atomic_flags) && 4 == extent)) || + (!(OMPI_DATATYPE_FLAG_DATA_INT & dt->super.flags) && !(MCA_BTL_ATOMIC_SUPPORTS_FLOAT & module->atomic_flags)) || !ompi_op_is_intrinsic (op) || (0 == ompi_osc_rdma_op_mapping[op->op_type])) { return OMPI_ERR_NOT_SUPPORTED; } @@ -663,13 +659,11 @@ static inline int ompi_osc_rdma_cas_atomic (ompi_osc_rdma_sync_t *sync, const vo bool lock_acquired) { ompi_osc_rdma_module_t *module = sync->module; - mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl (module, peer->data_btl_index); - int32_t atomic_flags = btl->btl_atomic_flags; const size_t size = datatype->super.size; int64_t compare, source; int flags, ret; - if (8 != size && !(4 == size && (MCA_BTL_ATOMIC_SUPPORTS_32BIT & atomic_flags))) { + if (8 != size && !(4 == size && (MCA_BTL_ATOMIC_SUPPORTS_32BIT & module->atomic_flags))) { return OMPI_ERR_NOT_SUPPORTED; } @@ -717,7 +711,6 @@ static inline int cas_rdma (ompi_osc_rdma_sync_t *sync, const void *source_addr, mca_btl_base_registration_handle_t *target_handle, bool lock_acquired) { ompi_osc_rdma_module_t *module = sync->module; - mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl (module, peer->data_btl_index); unsigned long len = datatype->super.size; mca_btl_base_registration_handle_t *local_handle = NULL; ompi_osc_rdma_frag_t *frag = NULL; @@ -742,18 +735,21 @@ static inline int cas_rdma (ompi_osc_rdma_sync_t *sync, const void *source_addr, return OMPI_SUCCESS; } - if (btl->btl_register_mem && len > btl->btl_put_local_registration_threshold) { - do { - ret = ompi_osc_rdma_frag_alloc (module, len, &frag, &ptr); - if (OPAL_UNLIKELY(OMPI_SUCCESS == ret)) { - break; - } + if (module->use_memory_registration) { + mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl (module, peer->data_btl_index); + if (len > btl->btl_put_local_registration_threshold) { + do { + ret = ompi_osc_rdma_frag_alloc(module, len, &frag, &ptr); + if (OPAL_UNLIKELY(OMPI_SUCCESS == ret)) { + break; + } - ompi_osc_rdma_progress (module); - } while (1); + ompi_osc_rdma_progress (module); + } while (1); - memcpy (ptr, source_addr, len); - local_handle = frag->handle; + memcpy(ptr, source_addr, len); + local_handle = frag->handle; + } } OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "RDMA compare-and-swap initiating blocking btl put..."); diff --git a/ompi/mca/osc/rdma/osc_rdma_btl_comm.h b/ompi/mca/osc/rdma/osc_rdma_btl_comm.h index a666d77710b..718036d0a76 100644 --- a/ompi/mca/osc/rdma/osc_rdma_btl_comm.h +++ b/ompi/mca/osc/rdma/osc_rdma_btl_comm.h @@ -36,11 +36,17 @@ ompi_osc_rdma_btl_put(ompi_osc_rdma_module_t *module, uint8_t btl_index, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata) { - mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl(module, btl_index); - - return btl->btl_put(btl, endpoint, local_address, remote_address, - local_handle, remote_handle, size, flags, order, - cbfunc, cbcontext, cbdata); + if (module->use_accelerated_btl) { + mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl(module, btl_index); + return btl->btl_put(btl, endpoint, local_address, remote_address, + local_handle, remote_handle, size, flags, order, + cbfunc, cbcontext, cbdata); + } else { + mca_btl_base_am_rdma_module_t *am_rdma = ompi_osc_rdma_selected_am_rdma(module, btl_index); + return am_rdma->am_btl_put(am_rdma, endpoint, local_address, remote_address, + local_handle, remote_handle, size, flags, order, + cbfunc, cbcontext, cbdata); + } } @@ -54,11 +60,18 @@ ompi_osc_rdma_btl_get(ompi_osc_rdma_module_t *module, uint8_t btl_index, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata) { - mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl(module, btl_index); - return btl->btl_get(btl, endpoint, local_address, remote_address, - local_handle, remote_handle, size, flags, order, - cbfunc, cbcontext, cbdata); + if (module->use_accelerated_btl) { + mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl(module, btl_index); + return btl->btl_get(btl, endpoint, local_address, remote_address, + local_handle, remote_handle, size, flags, order, + cbfunc, cbcontext, cbdata); + } else { + mca_btl_base_am_rdma_module_t *am_rdma = ompi_osc_rdma_selected_am_rdma(module, btl_index); + return am_rdma->am_btl_get(am_rdma, endpoint, local_address, remote_address, + local_handle, remote_handle, size, flags, order, + cbfunc, cbcontext, cbdata); + } } @@ -71,6 +84,9 @@ ompi_osc_rdma_btl_atomic_op(ompi_osc_rdma_module_t *module, uint8_t btl_index, { mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl(module, btl_index); + /* the AM BTL interface does not currently support op calls */ + assert(module->use_accelerated_btl); + return btl->btl_atomic_op(btl, endpoint, remote_address, remote_handle, op, operand, flags, order, cbfunc, cbcontext, cbdata); @@ -87,12 +103,19 @@ ompi_osc_rdma_btl_atomic_fop(ompi_osc_rdma_module_t *module, uint8_t btl_index, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata) { - mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl(module, btl_index); - - return btl->btl_atomic_fop(btl, endpoint, local_address, remote_address, - local_handle, remote_handle, - op, operand, flags, order, - cbfunc, cbcontext, cbdata); + if (module->use_accelerated_btl) { + mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl(module, btl_index); + return btl->btl_atomic_fop(btl, endpoint, local_address, remote_address, + local_handle, remote_handle, + op, operand, flags, order, + cbfunc, cbcontext, cbdata); + } else { + mca_btl_base_am_rdma_module_t *am_rdma = ompi_osc_rdma_selected_am_rdma(module, btl_index); + return am_rdma->am_btl_atomic_fop(am_rdma, endpoint, local_address, remote_address, + local_handle, remote_handle, + op, operand, flags, order, + cbfunc, cbcontext, cbdata); + } } @@ -105,12 +128,19 @@ ompi_osc_rdma_btl_atomic_cswap(ompi_osc_rdma_module_t *module, uint8_t btl_index uint64_t compare, uint64_t value, int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata) { - mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl(module, btl_index); - - return btl->btl_atomic_cswap(btl, endpoint, local_address, remote_address, - local_handle, remote_handle, - compare, value, flags, order, - cbfunc, cbcontext, cbdata); + if (module->use_accelerated_btl) { + mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl(module, btl_index); + return btl->btl_atomic_cswap(btl, endpoint, local_address, remote_address, + local_handle, remote_handle, + compare, value, flags, order, + cbfunc, cbcontext, cbdata); + } else { + mca_btl_base_am_rdma_module_t *am_rdma = ompi_osc_rdma_selected_am_rdma(module, btl_index); + return am_rdma->am_btl_atomic_cswap(am_rdma, endpoint, local_address, remote_address, + local_handle, remote_handle, + compare, value, flags, order, + cbfunc, cbcontext, cbdata); + } } @@ -195,7 +225,10 @@ ompi_osc_rdma_btl_op(ompi_osc_rdma_module_t *module, uint8_t btl_index, mca_btl_base_module_t *selected_btl = ompi_osc_rdma_selected_btl (module, btl_index); int ret; - if (!(selected_btl->btl_flags & MCA_BTL_FLAGS_ATOMIC_OPS)) { + /* if using the AM RDMA interface with alternate BTLs or if the + accelerated BTL does not support atomic ops, emulate the atomic + op over a fetch and atomic op */ + if (!module->use_accelerated_btl || !(selected_btl->btl_flags & MCA_BTL_FLAGS_ATOMIC_OPS)) { return ompi_osc_rdma_btl_fop (module, btl_index, endpoint, address, address_handle, op, operand, flags, NULL, wait_for_completion, cbfunc, cbdata, cbcontext); } diff --git a/ompi/mca/osc/rdma/osc_rdma_comm.c b/ompi/mca/osc/rdma/osc_rdma_comm.c index 17448892870..62f35d28d64 100644 --- a/ompi/mca/osc/rdma/osc_rdma_comm.c +++ b/ompi/mca/osc/rdma/osc_rdma_comm.c @@ -65,8 +65,7 @@ int ompi_osc_get_data_blocking (ompi_osc_rdma_module_t *module, uint8_t btl_inde struct mca_btl_base_endpoint_t *endpoint, uint64_t source_address, mca_btl_base_registration_handle_t *source_handle, void *data, size_t len) { - mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl (module, btl_index); - const size_t btl_alignment_mask = ALIGNMENT_MASK(btl->btl_get_alignment); + const size_t btl_alignment_mask = ALIGNMENT_MASK(module->get_alignment); mca_btl_base_registration_handle_t *local_handle = NULL; ompi_osc_rdma_frag_t *frag = NULL; volatile bool read_complete = false; @@ -82,25 +81,28 @@ int ompi_osc_get_data_blocking (ompi_osc_rdma_module_t *module, uint8_t btl_inde "), len: %lu (aligned: %lu)", (void *) endpoint, source_address, aligned_addr, (unsigned long) len, (unsigned long) aligned_len); - if (btl->btl_register_mem && len >= btl->btl_get_local_registration_threshold) { - do { - ret = ompi_osc_rdma_frag_alloc (module, aligned_len, &frag, &ptr); - if (OPAL_UNLIKELY(OMPI_ERR_OUT_OF_RESOURCE == ret)) { - ompi_osc_rdma_progress (module); + if (module->use_memory_registration) { + mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl(module, btl_index); + if (len >= btl->btl_get_local_registration_threshold) { + do { + ret = ompi_osc_rdma_frag_alloc(module, aligned_len, &frag, &ptr); + if (OPAL_UNLIKELY(OMPI_ERR_OUT_OF_RESOURCE == ret)) { + ompi_osc_rdma_progress(module); + } + } while (OMPI_ERR_OUT_OF_RESOURCE == ret); + + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_ERROR, "error allocating temporary buffer"); + return ret; } - } while (OMPI_ERR_OUT_OF_RESOURCE == ret); - if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_ERROR, "error allocating temporary buffer"); - return ret; + local_handle = frag->handle; + OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "allocated temporary buffer %p in fragment %p", (void*)ptr, + (void *) frag); } - - local_handle = frag->handle; - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "allocated temporary buffer %p in fragment %p", (void*)ptr, - (void *) frag); } - assert (!(source_address & ALIGNMENT_MASK(btl->btl_get_alignment))); + assert (!(source_address & ALIGNMENT_MASK(module->get_alignment))); do { ret = ompi_osc_rdma_btl_get(module, btl_index, endpoint, ptr, aligned_addr, @@ -487,7 +489,6 @@ int ompi_osc_rdma_put_contig (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_peer_t * ompi_osc_rdma_request_t *request) { ompi_osc_rdma_module_t *module = sync->module; - mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl (module, peer->data_btl_index); mca_btl_base_registration_handle_t *local_handle = NULL; mca_btl_base_rdma_completion_fn_t cbfunc = NULL; ompi_osc_rdma_frag_t *frag = NULL; @@ -495,16 +496,19 @@ int ompi_osc_rdma_put_contig (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_peer_t * void *cbcontext; int ret; - if (btl->btl_register_mem && size > btl->btl_put_local_registration_threshold) { - ret = ompi_osc_rdma_frag_alloc (module, size, &frag, &ptr); - if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { - ret = ompi_osc_rdma_register (module, peer->data_endpoint, source_buffer, size, 0, &local_handle); + if (module->use_memory_registration) { + mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl(module, peer->data_btl_index); + if (size > btl->btl_put_local_registration_threshold) { + ret = ompi_osc_rdma_frag_alloc(module, size, &frag, &ptr); if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { - return ret; + ret = ompi_osc_rdma_register(module, peer->data_endpoint, source_buffer, size, 0, &local_handle); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + return ret; + } + } else { + memcpy(ptr, source_buffer, size); + local_handle = frag->handle; } - } else { - memcpy (ptr, source_buffer, size); - local_handle = frag->handle; } } @@ -606,8 +610,7 @@ static int ompi_osc_rdma_get_contig (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_p ompi_osc_rdma_request_t *request) { ompi_osc_rdma_module_t *module = sync->module; - mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl (module, peer->data_btl_index); - const size_t btl_alignment_mask = ALIGNMENT_MASK(btl->btl_get_alignment); + const size_t btl_alignment_mask = ALIGNMENT_MASK(module->get_alignment); mca_btl_base_registration_handle_t *local_handle = NULL; ompi_osc_rdma_frag_t *frag = NULL; osc_rdma_size_t aligned_len; @@ -623,70 +626,73 @@ static int ompi_osc_rdma_get_contig (ompi_osc_rdma_sync_t *sync, ompi_osc_rdma_p OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "initiating get of %lu bytes from remote ptr %" PRIx64 " to local ptr %p", size, source_address, target_buffer); - if ((btl->btl_register_mem && size > btl->btl_get_local_registration_threshold) || - (((uint64_t) target_buffer | size | source_address) & btl_alignment_mask)) { - - ret = ompi_osc_rdma_frag_alloc (module, aligned_len, &frag, &ptr); - if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { - if (OMPI_ERR_VALUE_OUT_OF_BOUNDS == ret) { - /* region is too large for a buffered read */ - size_t subsize; + if (module->use_memory_registration) { + mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl(module, peer->data_btl_index); + if (size > btl->btl_get_local_registration_threshold || + (((uint64_t) target_buffer | size | source_address) & btl_alignment_mask)) { - if ((source_address & btl_alignment_mask) && (source_address & btl_alignment_mask) == ((intptr_t) target_buffer & btl_alignment_mask)) { - /* remote region has the same alignment but the base is not aligned. perform a small - * buffered get of the beginning of the remote region */ - aligned_source_base = OPAL_ALIGN(source_address, btl->btl_get_alignment, osc_rdma_base_t); - subsize = (size_t) (aligned_source_base - source_address); - - ret = ompi_osc_rdma_get_partial (sync, peer, source_address, source_handle, target_buffer, subsize, request); - if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { - return ret; + ret = ompi_osc_rdma_frag_alloc(module, aligned_len, &frag, &ptr); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + if (OMPI_ERR_VALUE_OUT_OF_BOUNDS == ret) { + /* region is too large for a buffered read */ + size_t subsize; + + if ((source_address & btl_alignment_mask) && (source_address & btl_alignment_mask) == ((intptr_t) target_buffer & btl_alignment_mask)) { + /* remote region has the same alignment but the base is not aligned. perform a small + * buffered get of the beginning of the remote region */ + aligned_source_base = OPAL_ALIGN(source_address, btl->btl_get_alignment, osc_rdma_base_t); + subsize = (size_t) (aligned_source_base - source_address); + + ret = ompi_osc_rdma_get_partial(sync, peer, source_address, source_handle, target_buffer, subsize, request); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + return ret; + } + + source_address += subsize; + target_buffer = (void *) ((intptr_t) target_buffer + subsize); + size -= subsize; + + aligned_len = aligned_source_bound - aligned_source_base; } - source_address += subsize; - target_buffer = (void *) ((intptr_t) target_buffer + subsize); - size -= subsize; + if (!(((uint64_t) target_buffer | source_address) & btl_alignment_mask) && + (size & btl_alignment_mask)) { + /* remote region bases are aligned but the bounds are not. perform a + * small buffered get of the end of the remote region */ + aligned_len = size & ~btl_alignment_mask; + subsize = size - aligned_len; + size = aligned_len; + ret = ompi_osc_rdma_get_partial(sync, peer, source_address + aligned_len, source_handle, + (void *) ((intptr_t) target_buffer + aligned_len), subsize, request); + if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { + return ret; + } + } + /* (remaining) user request is now correctly aligned */ + } - aligned_len = aligned_source_bound - aligned_source_base; + if ((((uint64_t) target_buffer | size | source_address) & btl_alignment_mask)) { + /* local and remote alignments differ */ + request->buffer = ptr = malloc(aligned_len); + } else { + ptr = target_buffer; } - if (!(((uint64_t) target_buffer | source_address) & btl_alignment_mask) && - (size & btl_alignment_mask)) { - /* remote region bases are aligned but the bounds are not. perform a - * small buffered get of the end of the remote region */ - aligned_len = size & ~btl_alignment_mask; - subsize = size - aligned_len; - size = aligned_len; - ret = ompi_osc_rdma_get_partial (sync, peer, source_address + aligned_len, source_handle, - (void *) ((intptr_t) target_buffer + aligned_len), subsize, request); - if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { - return ret; - } + if (NULL != ptr) { + (void)ompi_osc_rdma_register(module, peer->data_endpoint, ptr, aligned_len, MCA_BTL_REG_FLAG_LOCAL_WRITE, + &local_handle); } - /* (remaining) user request is now correctly aligned */ - } - if ((((uint64_t) target_buffer | size | source_address) & btl_alignment_mask)) { - /* local and remote alignments differ */ - request->buffer = ptr = malloc (aligned_len); + if (OPAL_UNLIKELY(NULL == local_handle)) { + free(request->buffer); + request->buffer = NULL; + return ret; + } } else { - ptr = target_buffer; - } - - if (NULL != ptr) { - (void) ompi_osc_rdma_register (module, peer->data_endpoint, ptr, aligned_len, MCA_BTL_REG_FLAG_LOCAL_WRITE, - &local_handle); + OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "using internal buffer %p in fragment %p for get of size %lu bytes, source address 0x%lx", + (void*)ptr, (void *) frag, (unsigned long) aligned_len, (unsigned long) aligned_source_base); + local_handle = frag->handle; } - - if (OPAL_UNLIKELY(NULL == local_handle)) { - free (request->buffer); - request->buffer = NULL; - return ret; - } - } else { - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "using internal buffer %p in fragment %p for get of size %lu bytes, source address 0x%lx", - (void*)ptr, (void *) frag, (unsigned long) aligned_len, (unsigned long) aligned_source_base); - local_handle = frag->handle; } } @@ -742,7 +748,6 @@ static inline int ompi_osc_rdma_put_w_req (ompi_osc_rdma_sync_t *sync, const voi ompi_datatype_t *target_datatype, ompi_osc_rdma_request_t *request) { ompi_osc_rdma_module_t *module = sync->module; - mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl (module, peer->data_btl_index); mca_btl_base_registration_handle_t *target_handle; uint64_t target_address; int ret; @@ -777,7 +782,7 @@ static inline int ompi_osc_rdma_put_w_req (ompi_osc_rdma_sync_t *sync, const voi return ompi_osc_rdma_master (sync, (void *) origin_addr, origin_count, origin_datatype, peer, target_address, target_handle, target_count, target_datatype, request, - btl->btl_put_limit, ompi_osc_rdma_put_contig, false); + module->put_limit, ompi_osc_rdma_put_contig, false); } static inline int ompi_osc_rdma_get_w_req (ompi_osc_rdma_sync_t *sync, void *origin_addr, int origin_count, ompi_datatype_t *origin_datatype, @@ -785,7 +790,6 @@ static inline int ompi_osc_rdma_get_w_req (ompi_osc_rdma_sync_t *sync, void *ori ompi_datatype_t *source_datatype, ompi_osc_rdma_request_t *request) { ompi_osc_rdma_module_t *module = sync->module; - mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl (module, peer->data_btl_index); mca_btl_base_registration_handle_t *source_handle; uint64_t source_address; ptrdiff_t source_span, source_lb; @@ -818,7 +822,7 @@ static inline int ompi_osc_rdma_get_w_req (ompi_osc_rdma_sync_t *sync, void *ori return ompi_osc_rdma_master (sync, origin_addr, origin_count, origin_datatype, peer, source_address, source_handle, source_count, source_datatype, request, - btl->btl_get_limit, ompi_osc_rdma_get_contig, true); + module->get_limit, ompi_osc_rdma_get_contig, true); } int ompi_osc_rdma_put (const void *origin_addr, int origin_count, ompi_datatype_t *origin_datatype, int target_rank, ptrdiff_t target_disp, int target_count, diff --git a/ompi/mca/osc/rdma/osc_rdma_component.c b/ompi/mca/osc/rdma/osc_rdma_component.c index 25dbd7bb0fe..93241cd72b5 100644 --- a/ompi/mca/osc/rdma/osc_rdma_component.c +++ b/ompi/mca/osc/rdma/osc_rdma_component.c @@ -51,6 +51,7 @@ #include "opal/util/argv.h" #include "opal/util/printf.h" #include "opal/util/sys_limits.h" +#include "opal/util/minmax.h" #if OPAL_CUDA_SUPPORT #include "opal/mca/common/cuda/common_cuda.h" #endif /* OPAL_CUDA_SUPPORT */ @@ -887,12 +888,14 @@ static void ompi_osc_rdma_ensure_local_add_procs (void) */ static int btl_latency_sort_fn(const void *a, const void *b) { - const struct mca_btl_base_module_t *btl_a = a; - const struct mca_btl_base_module_t *btl_b = b; + const mca_btl_base_am_rdma_module_t * const *am_rdma_a_p = a; + const mca_btl_base_am_rdma_module_t * const *am_rdma_b_p = b; + const mca_btl_base_am_rdma_module_t *am_rdma_a = *am_rdma_a_p; + const mca_btl_base_am_rdma_module_t *am_rdma_b = *am_rdma_b_p; - if (btl_a->btl_latency < btl_b->btl_latency) { + if (am_rdma_a->btl->btl_latency < am_rdma_b->btl->btl_latency) { return -1; - } else if (btl_a->btl_latency == btl_b->btl_latency) { + } else if (am_rdma_a->btl->btl_latency == am_rdma_b->btl->btl_latency) { return 0; } else { return 1; @@ -931,14 +934,19 @@ static int ompi_osc_rdma_query_alternate_btls (ompi_communicator_t *comm, ompi_o assert(NULL != module); + module->put_alignment = 1; + module->get_alignment = 1; + module->put_limit = SIZE_MAX; + module->get_limit = SIZE_MAX; + btl_count = opal_list_get_size(&mca_btl_base_modules_initialized); if (btl_count > UINT8_MAX) { return OMPI_ERROR; } module->alternate_btl_count = btl_count; - module->alternate_btls = malloc(sizeof(struct mca_btl_base_module_t *) * btl_count); - if (NULL == module->alternate_btls) { + module->alternate_am_rdmas = malloc(sizeof(struct mca_btl_base_am_rdma_module_t *) * module->alternate_btl_count); + if (NULL == module->alternate_am_rdmas) { return OMPI_ERR_TEMP_OUT_OF_RESOURCE; } @@ -958,19 +966,43 @@ static int ompi_osc_rdma_query_alternate_btls (ompi_communicator_t *comm, ompi_o opal_output_verbose(MCA_BASE_VERBOSE_INFO, ompi_osc_base_framework.framework_output, "found alternate btl %s", item->btl_module->btl_component->btl_version.mca_component_name); - ret = mca_btl_base_am_rdma_init(item->btl_module); + + ret = opal_btl_base_am_rdma_create(item->btl_module, + MCA_BTL_FLAGS_RDMA_REMOTE_COMPLETION, + true /* no_memory_registration */, + &(module->alternate_am_rdmas[index])); if (OMPI_SUCCESS != ret) { return ret; } - module->alternate_btls[index++] = item->btl_module; + + module->put_alignment = opal_max(module->put_alignment, + module->alternate_am_rdmas[index]->am_btl_put_alignment); + module->get_alignment = opal_max(module->get_alignment, + module->alternate_am_rdmas[index]->am_btl_get_alignment); + module->put_limit = opal_min(module->put_limit, + module->alternate_am_rdmas[index]->am_btl_put_limit); + module->get_limit = opal_min(module->get_limit, + module->alternate_am_rdmas[index]->am_btl_get_limit); + + index++; } - assert(index == btl_count); + assert(index == module->alternate_btl_count); /* sort based on latency, lowest first */ - qsort(module->alternate_btls, module->alternate_btl_count, - sizeof(struct mca_btl_base_module_t*), btl_latency_sort_fn); + qsort(module->alternate_am_rdmas, module->alternate_btl_count, + sizeof(module->alternate_am_rdmas[0]), btl_latency_sort_fn); module->use_memory_registration = false; + module->atomic_flags = MCA_BTL_ATOMIC_SUPPORTS_ADD | + MCA_BTL_ATOMIC_SUPPORTS_AND | + MCA_BTL_ATOMIC_SUPPORTS_OR | + MCA_BTL_ATOMIC_SUPPORTS_XOR | + MCA_BTL_ATOMIC_SUPPORTS_SWAP | + MCA_BTL_ATOMIC_SUPPORTS_MIN | + MCA_BTL_ATOMIC_SUPPORTS_MAX | + MCA_BTL_ATOMIC_SUPPORTS_32BIT | + MCA_BTL_ATOMIC_SUPPORTS_CSWAP | + MCA_BTL_ATOMIC_SUPPORTS_GLOB; return OMPI_SUCCESS; } @@ -1132,7 +1164,12 @@ static int ompi_osc_rdma_query_accelerated_btls (ompi_communicator_t *comm, ompi btl_selection_complete: module->use_accelerated_btl = true; module->accelerated_btl = selected_btl; - module->use_memory_registration = selected_btl->btl_register_mem != NULL; + module->use_memory_registration = (selected_btl->btl_register_mem != NULL); + module->put_alignment = selected_btl->btl_put_alignment; + module->get_alignment = selected_btl->btl_get_alignment; + module->put_limit = selected_btl->btl_put_limit; + module->get_limit = selected_btl->btl_get_limit; + module->atomic_flags = selected_btl->btl_atomic_flags; opal_output_verbose(MCA_BASE_VERBOSE_INFO, ompi_osc_base_framework.framework_output, "accelerated_query: selected btl: %s", diff --git a/ompi/mca/osc/rdma/osc_rdma_module.c b/ompi/mca/osc/rdma/osc_rdma_module.c index 648b61414e6..8a080e1e4eb 100644 --- a/ompi/mca/osc/rdma/osc_rdma_module.c +++ b/ompi/mca/osc/rdma/osc_rdma_module.c @@ -145,7 +145,10 @@ int ompi_osc_rdma_free(ompi_win_t *win) mca_mpool_base_default_module->mpool_free(mca_mpool_base_default_module, module->free_after); if (!module->use_accelerated_btl) { - free(module->alternate_btls); + for (int i = 0 ; i < module->alternate_btl_count ; ++i) { + OBJ_RELEASE(module->alternate_am_rdmas[i]); + } + free(module->alternate_am_rdmas); } free (module); From 5246b669c4debe424606fc6890fd6e37a6a2b96c Mon Sep 17 00:00:00 2001 From: Brian Barrett Date: Tue, 8 Feb 2022 16:27:06 +0000 Subject: [PATCH 12/13] osc/rdma: Lower priority to 20 The RDMA OSC component is the "general" component, similar to OB1. The OSC subsystem has gone through some priority inflation over the years, so try to clean that up. This patch lowers the RDMA OSC component priority from 101 to 20, leaving the priorities as: sm 100 ucx 60 (if API version > 1.5.0) portals 20 rdma 20 Signed-off-by: Brian Barrett --- ompi/mca/osc/rdma/osc_rdma_component.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ompi/mca/osc/rdma/osc_rdma_component.c b/ompi/mca/osc/rdma/osc_rdma_component.c index 93241cd72b5..5b3a6df2bc8 100644 --- a/ompi/mca/osc/rdma/osc_rdma_component.c +++ b/ompi/mca/osc/rdma/osc_rdma_component.c @@ -231,7 +231,7 @@ static int ompi_osc_rdma_component_register (void) MCA_BASE_VAR_SCOPE_GROUP, &mca_osc_rdma_component.max_attach); free(description_str); - mca_osc_rdma_component.priority = 101; + mca_osc_rdma_component.priority = 20; opal_asprintf(&description_str, "Priority of the osc/rdma component (default: %d)", mca_osc_rdma_component.priority); (void) mca_base_component_var_register (&mca_osc_rdma_component.super.osc_version, "priority", description_str, From ad9fae9b171da54f984275b5b9781dd4c3212501 Mon Sep 17 00:00:00 2001 From: Brian Barrett Date: Wed, 9 Feb 2022 18:04:08 +0000 Subject: [PATCH 13/13] osc/rdma: Fix double assignment typo Fix an assignment of request onto itself, which was a harmless typo, but was responsible for Coverity issues CID 1429865 and CID 1429864. Signed-off-by: Brian Barrett --- ompi/mca/osc/rdma/osc_rdma_comm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ompi/mca/osc/rdma/osc_rdma_comm.c b/ompi/mca/osc/rdma/osc_rdma_comm.c index 62f35d28d64..569cf669cae 100644 --- a/ompi/mca/osc/rdma/osc_rdma_comm.c +++ b/ompi/mca/osc/rdma/osc_rdma_comm.c @@ -398,7 +398,7 @@ static void ompi_osc_rdma_put_complete (struct mca_btl_base_module_t *btl, struc /* the lowest bit is used as a flag indicating this put operation has a request */ if ((intptr_t) context & 0x1) { - ompi_osc_rdma_request_t *request = request = (ompi_osc_rdma_request_t *) ((intptr_t) context & ~1); + ompi_osc_rdma_request_t *request = (ompi_osc_rdma_request_t *) ((intptr_t) context & ~1); sync = request->sync; if (0 == OPAL_THREAD_ADD_FETCH32 (&request->outstanding_requests, -1)) { @@ -429,7 +429,7 @@ static void ompi_osc_rdma_put_complete_flush (struct mca_btl_base_module_t *btl, /* the lowest bit is used as a flag indicating this put operation has a request */ if ((intptr_t) context & 0x1) { - ompi_osc_rdma_request_t *request = request = (ompi_osc_rdma_request_t *) ((intptr_t) context & ~1); + ompi_osc_rdma_request_t *request = (ompi_osc_rdma_request_t *) ((intptr_t) context & ~1); module = request->module; if (0 == OPAL_THREAD_ADD_FETCH32 (&request->outstanding_requests, -1)) {