Skip to content

Commit

Permalink
osc/rdma: add support for "alternate" btls
Browse files Browse the repository at this point in the history
This commit updates osc/rdma to support using alternate BTLs when
a primary BTL is not available. There may be at most two
alternate BTLs in use at any time. The default is selected to
cover shared memory (sm) and off-node (tcp).

The priority of osc/rdma is a bit lower when using a set of
alternate btls. This will allow another osc component to win if
there is an alternative.

Signed-off-by: Nathan Hjelm <hjelmn@google.com>
  • Loading branch information
hjelmn committed Mar 2, 2021
1 parent 0d8140c commit 1cc6b78
Show file tree
Hide file tree
Showing 10 changed files with 355 additions and 160 deletions.
32 changes: 25 additions & 7 deletions ompi/mca/osc/rdma/osc_rdma.h
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,8 @@

/* Integer ceiling of ompi_comm_size ((module)->comm) divided by (module)->node_count.
 * The module argument is expanded more than once, so pass an expression without
 * side effects. */
#define RANK_ARRAY_COUNT(module) ((ompi_comm_size ((module)->comm) + (module)->node_count - 1) / (module)->node_count)

/* Maximum number of BTLs a module can hold at once; used as the length of the
 * module's selected_btls array. */
#define MCA_OSC_RDMA_MAX_USED_BTLS 2

enum {
OMPI_OSC_RDMA_LOCKING_TWO_LEVEL,
OMPI_OSC_RDMA_LOCKING_ON_DEMAND,
Expand Down Expand Up @@ -106,6 +108,9 @@ struct ompi_osc_rdma_component_t {
/** Priority of the osc/rdma component */
unsigned int priority;

/** Priority of the osc/rdma component when using non-RDMA BTLs */
unsigned int alternate_priority;

/** directory where to place backing files */
char *backing_directory;

Expand Down Expand Up @@ -251,8 +256,16 @@ struct ompi_osc_rdma_module_t {
opal_mutex_t peer_lock;


/** BTL in use */
struct mca_btl_base_module_t *selected_btl;
/** BTL(s) in use. Currently this is only used to support RDMA emulation over
* non-RDMA BTLs. The typical usage is btl/sm + btl/tcp. In the future this
* could be used to support multiple RDMA-capable BTLs but the memory registration
* paths will need to be updated to pack/unpack multiple registration handles. */
struct mca_btl_base_module_t *selected_btls[MCA_OSC_RDMA_MAX_USED_BTLS];
uint8_t btls_in_use;

/** Only true if one BTL is in use. Memory registration is only supported when
* using a single BTL. */
bool use_memory_registration;

/** registered fragment used for locally buffered RDMA transfers */
struct ompi_osc_rdma_frag_t *rdma_frag;
Expand Down Expand Up @@ -363,11 +376,11 @@ static inline bool ompi_osc_rdma_in_passive_epoch (ompi_osc_rdma_module_t *modul
static inline int _ompi_osc_rdma_register (ompi_osc_rdma_module_t *module, struct mca_btl_base_endpoint_t *endpoint, void *ptr,
size_t size, uint32_t flags, mca_btl_base_registration_handle_t **handle, int line, const char *file)
{
if (module->selected_btl->btl_register_mem) {
if (module->use_memory_registration) {
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "registering segment with btl. range: %p - %p (%lu bytes)",
ptr, (void*)((char *) ptr + size), size);

*handle = module->selected_btl->btl_register_mem (module->selected_btl, endpoint, ptr, size, flags);
*handle = module->selected_btls[0]->btl_register_mem (module->selected_btls[0], endpoint, ptr, size, flags);
if (OPAL_UNLIKELY(NULL == *handle)) {
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "failed to register pointer with selected BTL. base: %p, "
"size: %lu. file: %s, line: %d", ptr, (unsigned long) size, file, line);
Expand All @@ -385,7 +398,7 @@ static inline int _ompi_osc_rdma_register (ompi_osc_rdma_module_t *module, struc
/**
 * Release a BTL memory registration handle.
 *
 * @param module  osc/rdma module whose BTL performs the deregistration
 * @param handle  registration handle to release; a NULL handle is ignored
 * @param line    caller's source line (not used in this body)
 * @param file    caller's source file (not used in this body)
 *
 * NOTE(review): this listing comes from a rendered commit diff, so the two calls
 * below appear to be the pre-change (selected_btl) and post-change (selected_btls[0])
 * versions of the same statement rather than two calls meant to run in sequence --
 * confirm against the actual header.
 */
static inline void _ompi_osc_rdma_deregister (ompi_osc_rdma_module_t *module, mca_btl_base_registration_handle_t *handle, int line, const char *file)
{
if (handle) {
module->selected_btl->btl_deregister_mem (module->selected_btl, handle);
/* deregistration always goes through the first selected BTL */
module->selected_btls[0]->btl_deregister_mem (module->selected_btls[0], handle);
}
}

Expand Down Expand Up @@ -517,7 +530,7 @@ static inline ompi_osc_rdma_sync_t *ompi_osc_rdma_module_sync_lookup (ompi_osc_r
static bool ompi_osc_rdma_use_btl_flush (ompi_osc_rdma_module_t *module)
{
#if defined(BTL_VERSION) && (BTL_VERSION >= 310)
return !!(module->selected_btl->btl_flush);
return !!(module->selected_btls[0]->btl_flush);
#else
return false;
#endif
Expand Down Expand Up @@ -582,7 +595,7 @@ static inline void ompi_osc_rdma_sync_rdma_complete (ompi_osc_rdma_sync_t *sync)
opal_progress ();
} while (ompi_osc_rdma_sync_get_count (sync));
#else
mca_btl_base_module_t *btl_module = sync->module->selected_btl;
mca_btl_base_module_t *btl_module = sync->module->selected_btls[0];

do {
if (!ompi_osc_rdma_use_btl_flush (sync->module)) {
Expand Down Expand Up @@ -616,4 +629,9 @@ static inline bool ompi_osc_rdma_oor (int rc)
return (OPAL_SUCCESS != rc && (OPAL_ERR_OUT_OF_RESOURCE == rc || OPAL_ERR_TEMP_OUT_OF_RESOURCE == rc));
}

/**
 * Fetch one of the BTL modules stored in an osc/rdma module.
 *
 * @param module     osc/rdma module whose selected_btls array is read
 * @param btl_index  position within module->selected_btls
 *
 * @return the BTL module pointer stored at that position. btl_index is not
 *         range-checked here.
 */
__opal_attribute_always_inline__
static inline mca_btl_base_module_t *ompi_osc_rdma_selected_btl (ompi_osc_rdma_module_t *module, uint8_t btl_index)
{
    mca_btl_base_module_t *btl = module->selected_btls[btl_index];

    return btl;
}

#endif /* OMPI_OSC_RDMA_H */
38 changes: 22 additions & 16 deletions ompi/mca/osc/rdma/osc_rdma_accumulate.c
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,8 @@ static int ompi_osc_rdma_fetch_and_op_atomic (ompi_osc_rdma_sync_t *sync, const
mca_btl_base_registration_handle_t *target_handle, ompi_op_t *op, ompi_osc_rdma_request_t *req)
{
ompi_osc_rdma_module_t *module = sync->module;
int32_t atomic_flags = module->selected_btl->btl_atomic_flags;
mca_btl_base_module_t *selected_btl = ompi_osc_rdma_selected_btl (module, peer->data_btl_index);
int32_t atomic_flags = selected_btl->btl_atomic_flags;
int btl_op, flags;
int64_t origin;

Expand All @@ -160,7 +161,7 @@ static int ompi_osc_rdma_fetch_and_op_atomic (ompi_osc_rdma_sync_t *sync, const

origin = (8 == extent) ? ((int64_t *) origin_addr)[0] : ((int32_t *) origin_addr)[0];

return ompi_osc_rdma_btl_fop (module, peer->data_endpoint, target_address, target_handle, btl_op, origin, flags,
return ompi_osc_rdma_btl_fop (module, peer->data_btl_index, peer->data_endpoint, target_address, target_handle, btl_op, origin, flags,
result_addr, true, NULL, NULL, NULL);
}

Expand All @@ -182,7 +183,7 @@ static int ompi_osc_rdma_fetch_and_op_cas (ompi_osc_rdma_sync_t *sync, const voi

OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "initiating fetch-and-op using compare-and-swap");

ret = ompi_osc_get_data_blocking (module, peer->data_endpoint, address, target_handle, &old_value, 8);
ret = ompi_osc_get_data_blocking (module, peer->data_btl_index, peer->data_endpoint, address, target_handle, &old_value, 8);
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
return ret;
}
Expand All @@ -197,7 +198,7 @@ static int ompi_osc_rdma_fetch_and_op_cas (ompi_osc_rdma_sync_t *sync, const voi
ompi_op_reduce (op, (void *) ((intptr_t) origin_addr + dt->super.true_lb), (void*)((intptr_t) &new_value + offset), 1, dt);
}

ret = ompi_osc_rdma_btl_cswap (module, peer->data_endpoint, address, target_handle,
ret = ompi_osc_rdma_btl_cswap (module, peer->data_btl_index, peer->data_endpoint, address, target_handle,
old_value, new_value, 0, (int64_t*)&new_value);
if (OPAL_SUCCESS != ret || new_value == old_value) {
break;
Expand All @@ -218,11 +219,12 @@ static int ompi_osc_rdma_acc_single_atomic (ompi_osc_rdma_sync_t *sync, const vo
ompi_op_t *op, ompi_osc_rdma_request_t *req)
{
ompi_osc_rdma_module_t *module = sync->module;
int32_t atomic_flags = module->selected_btl->btl_atomic_flags;
mca_btl_base_module_t *selected_btl = ompi_osc_rdma_selected_btl (module, peer->data_btl_index);
int32_t atomic_flags = selected_btl->btl_atomic_flags;
int btl_op, flags;
int64_t origin;

if (!(module->selected_btl->btl_flags & MCA_BTL_FLAGS_ATOMIC_OPS)) {
if (!(selected_btl->btl_flags & MCA_BTL_FLAGS_ATOMIC_OPS)) {
/* btl put atomics not supported or disabled. fall back on fetch-and-op */
return ompi_osc_rdma_fetch_and_op_atomic (sync, origin_addr, NULL, dt, extent, peer, target_address, target_handle,
op, req);
Expand All @@ -248,7 +250,7 @@ static int ompi_osc_rdma_acc_single_atomic (ompi_osc_rdma_sync_t *sync, const vo
*((int64_t *) origin_addr));

/* if we locked the peer it's best to wait for completion before returning */
return ompi_osc_rdma_btl_op (module, peer->data_endpoint, target_address, target_handle, btl_op, origin,
return ompi_osc_rdma_btl_op (module, peer->data_btl_index, peer->data_endpoint, target_address, target_handle, btl_op, origin,
flags, true, NULL, NULL, NULL);
}

Expand Down Expand Up @@ -359,7 +361,8 @@ static inline int ompi_osc_rdma_gacc_contig (ompi_osc_rdma_sync_t *sync, const v
/* set up the request */
request->to_free = ptr;

ret = ompi_osc_get_data_blocking (module, peer->data_endpoint, target_address, target_handle, ptr, len);
ret = ompi_osc_get_data_blocking (module, peer->data_btl_index, peer->data_endpoint,
target_address, target_handle, ptr, len);
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
return ret;
}
Expand Down Expand Up @@ -644,7 +647,8 @@ static inline int ompi_osc_rdma_cas_atomic (ompi_osc_rdma_sync_t *sync, const vo
bool lock_acquired)
{
ompi_osc_rdma_module_t *module = sync->module;
int32_t atomic_flags = module->selected_btl->btl_atomic_flags;
mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl (module, peer->data_btl_index);
int32_t atomic_flags = btl->btl_atomic_flags;
const size_t size = datatype->super.size;
int64_t compare, source;
int flags, ret;
Expand All @@ -660,8 +664,8 @@ static inline int ompi_osc_rdma_cas_atomic (ompi_osc_rdma_sync_t *sync, const vo
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "initiating compare-and-swap using %d-bit btl atomics. compare: 0x%"
PRIx64 ", origin: 0x%" PRIx64, (int) size * 8, *((int64_t *) compare_addr), *((int64_t *) source_addr));

ret = ompi_osc_rdma_btl_cswap (module, peer->data_endpoint, target_address, target_handle, compare, source, flags,
result_addr);
ret = ompi_osc_rdma_btl_cswap (module, peer->data_btl_index, peer->data_endpoint, target_address, target_handle,
compare, source, flags, result_addr);
if (OPAL_LIKELY(OMPI_SUCCESS == ret)) {
ompi_osc_rdma_peer_accumulate_cleanup (module, peer, lock_acquired);
}
Expand Down Expand Up @@ -696,6 +700,7 @@ static inline int cas_rdma (ompi_osc_rdma_sync_t *sync, const void *source_addr,
mca_btl_base_registration_handle_t *target_handle, bool lock_acquired)
{
ompi_osc_rdma_module_t *module = sync->module;
mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl (module, peer->data_btl_index);
unsigned long len = datatype->super.size;
mca_btl_base_registration_handle_t *local_handle = NULL;
ompi_osc_rdma_frag_t *frag = NULL;
Expand All @@ -708,7 +713,8 @@ static inline int cas_rdma (ompi_osc_rdma_sync_t *sync, const void *source_addr,
", sync %p", len, target_address, (void *) sync);

OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "RDMA compare-and-swap initiating blocking btl get...");
ret = ompi_osc_get_data_blocking (module, peer->data_endpoint, target_address, target_handle, result_addr, len);
ret = ompi_osc_get_data_blocking (module, peer->data_btl_index, peer->data_endpoint, target_address,
target_handle, result_addr, len);
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
return ret;
}
Expand All @@ -719,7 +725,7 @@ static inline int cas_rdma (ompi_osc_rdma_sync_t *sync, const void *source_addr,
return OMPI_SUCCESS;
}

if (module->selected_btl->btl_register_mem && len > module->selected_btl->btl_put_local_registration_threshold) {
if (btl->btl_register_mem && len > btl->btl_put_local_registration_threshold) {
do {
ret = ompi_osc_rdma_frag_alloc (module, len, &frag, &ptr);
if (OPAL_UNLIKELY(OMPI_SUCCESS == ret)) {
Expand All @@ -736,9 +742,9 @@ static inline int cas_rdma (ompi_osc_rdma_sync_t *sync, const void *source_addr,
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "RDMA compare-and-swap initiating blocking btl put...");

do {
ret = module->selected_btl->btl_put (module->selected_btl, peer->data_endpoint, ptr, target_address,
local_handle, target_handle, len, 0, MCA_BTL_NO_ORDER,
ompi_osc_rdma_cas_put_complete, (void *) &complete, NULL);
ret = btl->btl_put (btl, peer->data_endpoint, ptr, target_address,
local_handle, target_handle, len, 0, MCA_BTL_NO_ORDER,
ompi_osc_rdma_cas_put_complete, (void *) &complete, NULL);
if (OPAL_SUCCESS == ret || (OPAL_ERR_OUT_OF_RESOURCE != ret && OPAL_ERR_TEMP_OUT_OF_RESOURCE != ret)) {
break;
}
Expand Down
Loading

0 comments on commit 1cc6b78

Please sign in to comment.