Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

btl/base: add subsystem to support for active-message based RDMA/atomic emulation #8493

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 25 additions & 7 deletions ompi/mca/osc/rdma/osc_rdma.h
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,8 @@

#define RANK_ARRAY_COUNT(module) ((ompi_comm_size ((module)->comm) + (module)->node_count - 1) / (module)->node_count)

#define MCA_OSC_RDMA_MAX_USED_BTLS 2

enum {
OMPI_OSC_RDMA_LOCKING_TWO_LEVEL,
OMPI_OSC_RDMA_LOCKING_ON_DEMAND,
Expand Down Expand Up @@ -106,6 +108,9 @@ struct ompi_osc_rdma_component_t {
/** Priority of the osc/rdma component */
unsigned int priority;

/** Priority of the osc/rdma component when using non-RDMA BTLs */
unsigned int alternate_priority;

/** directory where to place backing files */
char *backing_directory;

Expand Down Expand Up @@ -251,8 +256,16 @@ struct ompi_osc_rdma_module_t {
opal_mutex_t peer_lock;


/** BTL in use */
struct mca_btl_base_module_t *selected_btl;
/** BTL(s) in use. Currently this is only used to support RDMA emulation over
* non-RDMA BTLs. The typical usage is btl/sm + btl/tcp. In the future this
* could be used to support multiple RDMA-capable BTLs but the memory registration
* paths will need to be updated to pack/unpack multiple registration handles. */
struct mca_btl_base_module_t *selected_btls[MCA_OSC_RDMA_MAX_USED_BTLS];
uint8_t btls_in_use;

/** Only true if one BTL is in use. Memory registration is only supported when
* using a single BTL. */
bool use_memory_registration;

/** registered fragment used for locally buffered RDMA transfers */
struct ompi_osc_rdma_frag_t *rdma_frag;
Expand Down Expand Up @@ -363,11 +376,11 @@ static inline bool ompi_osc_rdma_in_passive_epoch (ompi_osc_rdma_module_t *modul
static inline int _ompi_osc_rdma_register (ompi_osc_rdma_module_t *module, struct mca_btl_base_endpoint_t *endpoint, void *ptr,
size_t size, uint32_t flags, mca_btl_base_registration_handle_t **handle, int line, const char *file)
{
if (module->selected_btl->btl_register_mem) {
if (module->use_memory_registration) {
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "registering segment with btl. range: %p - %p (%lu bytes)",
ptr, (void*)((char *) ptr + size), size);

*handle = module->selected_btl->btl_register_mem (module->selected_btl, endpoint, ptr, size, flags);
*handle = module->selected_btls[0]->btl_register_mem (module->selected_btls[0], endpoint, ptr, size, flags);
if (OPAL_UNLIKELY(NULL == *handle)) {
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "failed to register pointer with selected BTL. base: %p, "
"size: %lu. file: %s, line: %d", ptr, (unsigned long) size, file, line);
Expand All @@ -385,7 +398,7 @@ static inline int _ompi_osc_rdma_register (ompi_osc_rdma_module_t *module, struc
static inline void _ompi_osc_rdma_deregister (ompi_osc_rdma_module_t *module, mca_btl_base_registration_handle_t *handle, int line, const char *file)
{
if (handle) {
module->selected_btl->btl_deregister_mem (module->selected_btl, handle);
module->selected_btls[0]->btl_deregister_mem (module->selected_btls[0], handle);
}
}

Expand Down Expand Up @@ -517,7 +530,7 @@ static inline ompi_osc_rdma_sync_t *ompi_osc_rdma_module_sync_lookup (ompi_osc_r
static bool ompi_osc_rdma_use_btl_flush (ompi_osc_rdma_module_t *module)
{
#if defined(BTL_VERSION) && (BTL_VERSION >= 310)
return !!(module->selected_btl->btl_flush);
return !!(module->selected_btls[0]->btl_flush);
#else
return false;
#endif
Expand Down Expand Up @@ -582,7 +595,7 @@ static inline void ompi_osc_rdma_sync_rdma_complete (ompi_osc_rdma_sync_t *sync)
opal_progress ();
} while (ompi_osc_rdma_sync_get_count (sync));
#else
mca_btl_base_module_t *btl_module = sync->module->selected_btl;
mca_btl_base_module_t *btl_module = sync->module->selected_btls[0];

do {
if (!ompi_osc_rdma_use_btl_flush (sync->module)) {
Expand Down Expand Up @@ -616,4 +629,9 @@ static inline bool ompi_osc_rdma_oor (int rc)
return (OPAL_SUCCESS != rc && (OPAL_ERR_OUT_OF_RESOURCE == rc || OPAL_ERR_TEMP_OUT_OF_RESOURCE == rc));
}

__opal_attribute_always_inline__
static inline mca_btl_base_module_t *ompi_osc_rdma_selected_btl (ompi_osc_rdma_module_t *module, uint8_t btl_index) {
return module->selected_btls[btl_index];
}

#endif /* OMPI_OSC_RDMA_H */
38 changes: 22 additions & 16 deletions ompi/mca/osc/rdma/osc_rdma_accumulate.c
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,8 @@ static int ompi_osc_rdma_fetch_and_op_atomic (ompi_osc_rdma_sync_t *sync, const
mca_btl_base_registration_handle_t *target_handle, ompi_op_t *op, ompi_osc_rdma_request_t *req)
{
ompi_osc_rdma_module_t *module = sync->module;
int32_t atomic_flags = module->selected_btl->btl_atomic_flags;
mca_btl_base_module_t *selected_btl = ompi_osc_rdma_selected_btl (module, peer->data_btl_index);
int32_t atomic_flags = selected_btl->btl_atomic_flags;
int btl_op, flags;
int64_t origin;

Expand All @@ -160,7 +161,7 @@ static int ompi_osc_rdma_fetch_and_op_atomic (ompi_osc_rdma_sync_t *sync, const

origin = (8 == extent) ? ((int64_t *) origin_addr)[0] : ((int32_t *) origin_addr)[0];

return ompi_osc_rdma_btl_fop (module, peer->data_endpoint, target_address, target_handle, btl_op, origin, flags,
return ompi_osc_rdma_btl_fop (module, peer->data_btl_index, peer->data_endpoint, target_address, target_handle, btl_op, origin, flags,
result_addr, true, NULL, NULL, NULL);
}

Expand All @@ -182,7 +183,7 @@ static int ompi_osc_rdma_fetch_and_op_cas (ompi_osc_rdma_sync_t *sync, const voi

OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "initiating fetch-and-op using compare-and-swap");

ret = ompi_osc_get_data_blocking (module, peer->data_endpoint, address, target_handle, &old_value, 8);
ret = ompi_osc_get_data_blocking (module, peer->data_btl_index, peer->data_endpoint, address, target_handle, &old_value, 8);
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
return ret;
}
Expand All @@ -197,7 +198,7 @@ static int ompi_osc_rdma_fetch_and_op_cas (ompi_osc_rdma_sync_t *sync, const voi
ompi_op_reduce (op, (void *) ((intptr_t) origin_addr + dt->super.true_lb), (void*)((intptr_t) &new_value + offset), 1, dt);
}

ret = ompi_osc_rdma_btl_cswap (module, peer->data_endpoint, address, target_handle,
ret = ompi_osc_rdma_btl_cswap (module, peer->data_btl_index, peer->data_endpoint, address, target_handle,
old_value, new_value, 0, (int64_t*)&new_value);
if (OPAL_SUCCESS != ret || new_value == old_value) {
break;
Expand All @@ -218,11 +219,12 @@ static int ompi_osc_rdma_acc_single_atomic (ompi_osc_rdma_sync_t *sync, const vo
ompi_op_t *op, ompi_osc_rdma_request_t *req)
{
ompi_osc_rdma_module_t *module = sync->module;
int32_t atomic_flags = module->selected_btl->btl_atomic_flags;
mca_btl_base_module_t *selected_btl = ompi_osc_rdma_selected_btl (module, peer->data_btl_index);
int32_t atomic_flags = selected_btl->btl_atomic_flags;
int btl_op, flags;
int64_t origin;

if (!(module->selected_btl->btl_flags & MCA_BTL_FLAGS_ATOMIC_OPS)) {
if (!(selected_btl->btl_flags & MCA_BTL_FLAGS_ATOMIC_OPS)) {
/* btl put atomics not supported or disabled. fall back on fetch-and-op */
return ompi_osc_rdma_fetch_and_op_atomic (sync, origin_addr, NULL, dt, extent, peer, target_address, target_handle,
op, req);
Expand All @@ -248,7 +250,7 @@ static int ompi_osc_rdma_acc_single_atomic (ompi_osc_rdma_sync_t *sync, const vo
*((int64_t *) origin_addr));

/* if we locked the peer its best to wait for completion before returning */
return ompi_osc_rdma_btl_op (module, peer->data_endpoint, target_address, target_handle, btl_op, origin,
return ompi_osc_rdma_btl_op (module, peer->data_btl_index, peer->data_endpoint, target_address, target_handle, btl_op, origin,
flags, true, NULL, NULL, NULL);
}

Expand Down Expand Up @@ -359,7 +361,8 @@ static inline int ompi_osc_rdma_gacc_contig (ompi_osc_rdma_sync_t *sync, const v
/* set up the request */
request->to_free = ptr;

ret = ompi_osc_get_data_blocking (module, peer->data_endpoint, target_address, target_handle, ptr, len);
ret = ompi_osc_get_data_blocking (module, peer->data_btl_index, peer->data_endpoint,
target_address, target_handle, ptr, len);
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
return ret;
}
Expand Down Expand Up @@ -644,7 +647,8 @@ static inline int ompi_osc_rdma_cas_atomic (ompi_osc_rdma_sync_t *sync, const vo
bool lock_acquired)
{
ompi_osc_rdma_module_t *module = sync->module;
int32_t atomic_flags = module->selected_btl->btl_atomic_flags;
mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl (module, peer->data_btl_index);
int32_t atomic_flags = btl->btl_atomic_flags;
const size_t size = datatype->super.size;
int64_t compare, source;
int flags, ret;
Expand All @@ -660,8 +664,8 @@ static inline int ompi_osc_rdma_cas_atomic (ompi_osc_rdma_sync_t *sync, const vo
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "initiating compare-and-swap using %d-bit btl atomics. compare: 0x%"
PRIx64 ", origin: 0x%" PRIx64, (int) size * 8, *((int64_t *) compare_addr), *((int64_t *) source_addr));

ret = ompi_osc_rdma_btl_cswap (module, peer->data_endpoint, target_address, target_handle, compare, source, flags,
result_addr);
ret = ompi_osc_rdma_btl_cswap (module, peer->data_btl_index, peer->data_endpoint, target_address, target_handle,
compare, source, flags, result_addr);
if (OPAL_LIKELY(OMPI_SUCCESS == ret)) {
ompi_osc_rdma_peer_accumulate_cleanup (module, peer, lock_acquired);
}
Expand Down Expand Up @@ -696,6 +700,7 @@ static inline int cas_rdma (ompi_osc_rdma_sync_t *sync, const void *source_addr,
mca_btl_base_registration_handle_t *target_handle, bool lock_acquired)
{
ompi_osc_rdma_module_t *module = sync->module;
mca_btl_base_module_t *btl = ompi_osc_rdma_selected_btl (module, peer->data_btl_index);
unsigned long len = datatype->super.size;
mca_btl_base_registration_handle_t *local_handle = NULL;
ompi_osc_rdma_frag_t *frag = NULL;
Expand All @@ -708,7 +713,8 @@ static inline int cas_rdma (ompi_osc_rdma_sync_t *sync, const void *source_addr,
", sync %p", len, target_address, (void *) sync);

OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "RDMA compare-and-swap initiating blocking btl get...");
ret = ompi_osc_get_data_blocking (module, peer->data_endpoint, target_address, target_handle, result_addr, len);
ret = ompi_osc_get_data_blocking (module, peer->data_btl_index, peer->data_endpoint, target_address,
target_handle, result_addr, len);
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
return ret;
}
Expand All @@ -719,7 +725,7 @@ static inline int cas_rdma (ompi_osc_rdma_sync_t *sync, const void *source_addr,
return OMPI_SUCCESS;
}

if (module->selected_btl->btl_register_mem && len > module->selected_btl->btl_put_local_registration_threshold) {
if (btl->btl_register_mem && len > btl->btl_put_local_registration_threshold) {
do {
ret = ompi_osc_rdma_frag_alloc (module, len, &frag, &ptr);
if (OPAL_UNLIKELY(OMPI_SUCCESS == ret)) {
Expand All @@ -736,9 +742,9 @@ static inline int cas_rdma (ompi_osc_rdma_sync_t *sync, const void *source_addr,
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "RDMA compare-and-swap initiating blocking btl put...");

do {
ret = module->selected_btl->btl_put (module->selected_btl, peer->data_endpoint, ptr, target_address,
local_handle, target_handle, len, 0, MCA_BTL_NO_ORDER,
ompi_osc_rdma_cas_put_complete, (void *) &complete, NULL);
ret = btl->btl_put (btl, peer->data_endpoint, ptr, target_address,
local_handle, target_handle, len, 0, MCA_BTL_NO_ORDER,
ompi_osc_rdma_cas_put_complete, (void *) &complete, NULL);
if (OPAL_SUCCESS == ret || (OPAL_ERR_OUT_OF_RESOURCE != ret && OPAL_ERR_TEMP_OUT_OF_RESOURCE != ret)) {
break;
}
Expand Down
Loading