diff --git a/ompi/mca/osc/rdma/osc_rdma.h b/ompi/mca/osc/rdma/osc_rdma.h index 6aed111ddf0..8e8ecd0acfd 100644 --- a/ompi/mca/osc/rdma/osc_rdma.h +++ b/ompi/mca/osc/rdma/osc_rdma.h @@ -13,6 +13,7 @@ * Copyright (c) 2010 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2012-2013 Sandia National Laboratories. All rights reserved. * Copyright (c) 2016-2018 Intel, Inc. All rights reserved. + * Copyright (c) 2020 Google, LLC. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -248,7 +249,7 @@ struct ompi_osc_rdma_module_t { /** registration handles for dynamically attached regions. These are not stored * in the state structure as it is entirely local. */ - ompi_osc_rdma_handle_t *dynamic_handles; + ompi_osc_rdma_handle_t **dynamic_handles; /** shared memory segment. this segment holds this node's portion of the rank -> node * mapping array, node communication data (node_comm_info), state for all local ranks, diff --git a/ompi/mca/osc/rdma/osc_rdma_component.c b/ompi/mca/osc/rdma/osc_rdma_component.c index d1e99b98dd1..31838b9f757 100644 --- a/ompi/mca/osc/rdma/osc_rdma_component.c +++ b/ompi/mca/osc/rdma/osc_rdma_component.c @@ -20,6 +20,7 @@ * Copyright (c) 2018 Cisco Systems, Inc. All rights reserved * Copyright (c) 2019 Research Organization for Information Science * and Technology (RIST). All rights reserved. + * Copyright (c) 2020 Google, LLC. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -217,11 +218,11 @@ static int ompi_osc_rdma_component_register (void) MCA_BASE_VAR_SCOPE_LOCAL, &mca_osc_rdma_component.buffer_size); free(description_str); - mca_osc_rdma_component.max_attach = 32; + mca_osc_rdma_component.max_attach = 64; asprintf(&description_str, "Maximum number of buffers that can be attached to a dynamic window. " "Keep in mind that each attached buffer will use a potentially limited " "resource (default: %d)", mca_osc_rdma_component.max_attach); - (void) mca_base_component_var_register (&mca_osc_rdma_component.super.osc_version, "max_attach", description_str, + (void) mca_base_component_var_register (&mca_osc_rdma_component.super.osc_version, "max_attach", description_str, MCA_BASE_VAR_TYPE_UNSIGNED_INT, NULL, 0, 0, OPAL_INFO_LVL_3, MCA_BASE_VAR_SCOPE_GROUP, &mca_osc_rdma_component.max_attach); free(description_str); @@ -1257,8 +1258,8 @@ static int ompi_osc_rdma_component_select (struct ompi_win_t *win, void **base, if (MPI_WIN_FLAVOR_DYNAMIC == flavor) { /* allocate space to store local btl handles for attached regions */ - module->dynamic_handles = (ompi_osc_rdma_handle_t *) calloc (mca_osc_rdma_component.max_attach, - sizeof (module->dynamic_handles[0])); + module->dynamic_handles = (ompi_osc_rdma_handle_t **) calloc (mca_osc_rdma_component.max_attach, + sizeof (module->dynamic_handles[0])); if (NULL == module->dynamic_handles) { ompi_osc_rdma_free (win); return OMPI_ERR_OUT_OF_RESOURCE; diff --git a/ompi/mca/osc/rdma/osc_rdma_dynamic.c b/ompi/mca/osc/rdma/osc_rdma_dynamic.c index c1c21581e94..50a18a66048 100644 --- a/ompi/mca/osc/rdma/osc_rdma_dynamic.c +++ b/ompi/mca/osc/rdma/osc_rdma_dynamic.c @@ -2,6 +2,7 @@ /* * Copyright (c) 2014-2016 Los Alamos National Security, LLC. All rights * reserved. + * Copyright (c) 2020 Google, LLC. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -16,6 +17,22 @@ #include "opal/util/sys_limits.h" +static void ompi_osc_rdma_handle_init (ompi_osc_rdma_handle_t *rdma_handle) +{ + rdma_handle->btl_handle = NULL; + OBJ_CONSTRUCT(&rdma_handle->attachments, opal_list_t); +} + +static void ompi_osc_rdma_handle_fini (ompi_osc_rdma_handle_t *rdma_handle) +{ + OPAL_LIST_DESTRUCT(&rdma_handle->attachments); +} + +OBJ_CLASS_INSTANCE(ompi_osc_rdma_handle_t, opal_object_t, ompi_osc_rdma_handle_init, + ompi_osc_rdma_handle_fini); + +OBJ_CLASS_INSTANCE(ompi_osc_rdma_attachment_t, opal_list_item_t, NULL, NULL); + /** * ompi_osc_rdma_find_region_containing: * @@ -48,13 +65,16 @@ static inline ompi_osc_rdma_region_t *ompi_osc_rdma_find_region_containing (ompi region_bound = (intptr_t) (region->base + region->len); - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "checking memory region %p-%p against %p-%p (index %d) (min_index = %d, max_index = %d)", - (void *) base, (void *) bound, (void *) region->base, (void *)(region->base + region->len), mid_index, - min_index, max_index); + OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "checking memory region %p-%p against %p-%p (index %d) (min_index = %d, " + "max_index = %d)", (void *) base, (void *) bound, (void *) region->base, + (void *)(region->base + region->len), mid_index, min_index, max_index); if (region->base > base) { - return ompi_osc_rdma_find_region_containing (regions, min_index, mid_index-1, base, bound, region_size, region_index); - } else if (bound <= region_bound) { + return ompi_osc_rdma_find_region_containing (regions, min_index, mid_index-1, base, bound, region_size, + region_index); + } + + if (bound <= region_bound) { if (region_index) { *region_index = mid_index; } @@ -66,24 +86,76 @@ static inline ompi_osc_rdma_region_t *ompi_osc_rdma_find_region_containing (ompi } /* binary search for insertion point */ -static ompi_osc_rdma_region_t *find_insertion_point (ompi_osc_rdma_region_t *regions, int min_index, int max_index, intptr_t base, - size_t region_size, int *region_index) +static ompi_osc_rdma_region_t *find_insertion_point (ompi_osc_rdma_region_t *regions, int min_index, int max_index, + intptr_t base, size_t region_size, int *region_index) { int mid_index = (max_index + min_index) >> 1; ompi_osc_rdma_region_t *region = (ompi_osc_rdma_region_t *)((intptr_t) regions + mid_index * region_size); - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "find_insertion_point (%d, %d, %lx, %lu)\n", min_index, max_index, base, region_size); + OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "find_insertion_point (%d, %d, %lx, %lu)\n", min_index, max_index, base, + region_size); if (max_index < min_index) { *region_index = min_index; return (ompi_osc_rdma_region_t *)((intptr_t) regions + min_index * region_size); } - if (region->base > base) { + if (region->base > base || (region->base == base && region->len > region_size)) { return find_insertion_point (regions, min_index, mid_index-1, base, region_size, region_index); - } else { - return find_insertion_point (regions, mid_index+1, max_index, base, region_size, region_index); } + + return find_insertion_point (regions, mid_index+1, max_index, base, region_size, region_index); +} + +static bool ompi_osc_rdma_find_conflicting_attachment (ompi_osc_rdma_handle_t *handle, intptr_t base, intptr_t bound) +{ + ompi_osc_rdma_attachment_t *attachment; + + OPAL_LIST_FOREACH(attachment, &handle->attachments, ompi_osc_rdma_attachment_t) { + intptr_t region_bound = attachment->base + attachment->len; + if (base >= attachment->base && base < region_bound || + bound > attachment->base && bound <= region_bound) { + OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "existing region {%p, %p} overlaps region {%p, %p}", + (void *) attachment->base, (void *) region_bound, (void *) base, (void *) bound); + return true; + } + } + + return false; +} + +static int ompi_osc_rdma_add_attachment (ompi_osc_rdma_handle_t *handle, intptr_t base, size_t len) +{ + ompi_osc_rdma_attachment_t *attachment = OBJ_NEW(ompi_osc_rdma_attachment_t); + assert (NULL != attachment); + + if (ompi_osc_rdma_find_conflicting_attachment(handle, base, base + len)) { + return OMPI_ERR_RMA_ATTACH; + } + + attachment->base = base; + attachment->len = len; + + opal_list_append (&handle->attachments, &attachment->super); + + return OMPI_SUCCESS; +} + +static int ompi_osc_rdma_remove_attachment (ompi_osc_rdma_handle_t *handle, intptr_t base) +{ + ompi_osc_rdma_attachment_t *attachment; + + OPAL_LIST_FOREACH(attachment, &handle->attachments, ompi_osc_rdma_attachment_t) { + OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "checking attachment %p against %p", + (void *) attachment->base, (void *) base); + if (attachment->base == (intptr_t) base) { + opal_list_remove_item (&handle->attachments, &attachment->super); + OBJ_RELEASE(attachment); + return OMPI_SUCCESS; + } + } + + return OMPI_ERR_NOT_FOUND; } int ompi_osc_rdma_attach (struct ompi_win_t *win, void *base, size_t len) @@ -92,12 +164,13 @@ int ompi_osc_rdma_attach (struct ompi_win_t *win, void *base, size_t len) const int my_rank = ompi_comm_rank (module->comm); ompi_osc_rdma_peer_t *my_peer = ompi_osc_rdma_module_peer (module, my_rank); ompi_osc_rdma_region_t *region; + ompi_osc_rdma_handle_t *rdma_region_handle; osc_rdma_counter_t region_count; osc_rdma_counter_t region_id; - void *bound; + intptr_t bound, aligned_base, aligned_bound; intptr_t page_size = opal_getpagesize (); - int region_index; - int ret; + int region_index, ret; + size_t aligned_len; if (module->flavor != MPI_WIN_FLAVOR_DYNAMIC) { return OMPI_ERR_RMA_FLAVOR; @@ -117,23 +190,26 @@ int ompi_osc_rdma_attach (struct ompi_win_t *win, void *base, size_t len) if (region_count == mca_osc_rdma_component.max_attach) { OPAL_THREAD_UNLOCK(&module->lock); + OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "attach: could not attach. max attachment count reached."); return OMPI_ERR_RMA_ATTACH; } /* it is wasteful to register less than a page. this may allow the remote side to access more * memory but the MPI standard covers this with calling the calling behavior erroneous */ - bound = (void *)OPAL_ALIGN((intptr_t) base + len, page_size, intptr_t); - base = (void *)((intptr_t) base & ~(page_size - 1)); - len = (size_t)((intptr_t) bound - (intptr_t) base); - - /* see if a matching region already exists */ - region = ompi_osc_rdma_find_region_containing ((ompi_osc_rdma_region_t *) module->state->regions, 0, region_count - 1, (intptr_t) base, - (intptr_t) bound, module->region_size, ®ion_index); + bound = (intptr_t) base + len; + aligned_bound = OPAL_ALIGN((intptr_t) base + len, page_size, intptr_t); + aligned_base = (intptr_t) base & ~(page_size - 1); + aligned_len = (size_t)(aligned_bound - aligned_base); + + /* see if a registered region already exists */ + region = ompi_osc_rdma_find_region_containing ((ompi_osc_rdma_region_t *) module->state->regions, 0, region_count - 1, + aligned_base, aligned_bound, module->region_size, ®ion_index); if (NULL != region) { - ++module->dynamic_handles[region_index].refcnt; + /* validates that the region does not overlap with an existing region even if they are on the same page */ + ret = ompi_osc_rdma_add_attachment (module->dynamic_handles[region_index], (intptr_t) base, len); OPAL_THREAD_UNLOCK(&module->lock); /* no need to invalidate remote caches */ - return OMPI_SUCCESS; + return ret; } /* region is in flux */ @@ -144,45 +220,50 @@ int ompi_osc_rdma_attach (struct ompi_win_t *win, void *base, size_t len) /* do a binary seach for where the region should be inserted */ if (region_count) { - region = find_insertion_point ((ompi_osc_rdma_region_t *) module->state->regions, 0, region_count - 1, (intptr_t) base, - module->region_size, ®ion_index); + region = find_insertion_point ((ompi_osc_rdma_region_t *) module->state->regions, 0, region_count - 1, + (intptr_t) base, module->region_size, ®ion_index); if (region_index < region_count) { - memmove ((void *) ((intptr_t) region + module->region_size), region, (region_count - region_index) * module->region_size); - - if (module->selected_btl->btl_register_mem) { - memmove (module->dynamic_handles + region_index + 1, module->dynamic_handles + region_index, - (region_count - region_index) * sizeof (module->dynamic_handles[0])); - } + memmove ((void *) ((intptr_t) region + module->region_size), region, + (region_count - region_index) * module->region_size); + memmove (module->dynamic_handles + region_index + 1, module->dynamic_handles + region_index, + (region_count - region_index) * sizeof (module->dynamic_handles[0])); } } else { region_index = 0; region = (ompi_osc_rdma_region_t *) module->state->regions; } - region->base = (intptr_t) base; - region->len = len; + region->base = aligned_base; + region->len = aligned_len; + + OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "attaching dynamic memory region {%p, %p} aligned {%p, %p}, at index %d", + base, (void *) bound, (void *) aligned_base, (void *) aligned_bound, region_index); - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "attaching dynamic memory region {%p, %p} at index %d", - base, (void *)((intptr_t) base + len), region_index); + /* add RDMA region handle to track this region */ + rdma_region_handle = OBJ_NEW(ompi_osc_rdma_handle_t); + assert (NULL != rdma_region_handle); if (module->selected_btl->btl_register_mem) { mca_btl_base_registration_handle_t *handle; - ret = ompi_osc_rdma_register (module, MCA_BTL_ENDPOINT_ANY, (void *) region->base, region->len, MCA_BTL_REG_FLAG_ACCESS_ANY, - &handle); + ret = ompi_osc_rdma_register (module, MCA_BTL_ENDPOINT_ANY, (void *) region->base, region->len, + MCA_BTL_REG_FLAG_ACCESS_ANY, &handle); if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { OPAL_THREAD_UNLOCK(&module->lock); + OBJ_RELEASE(rdma_region_handle); return OMPI_ERR_RMA_ATTACH; } memcpy (region->btl_handle_data, handle, module->selected_btl->btl_registration_handle_size); - module->dynamic_handles[region_index].btl_handle = handle; + rdma_region_handle->btl_handle = handle; } else { - module->dynamic_handles[region_index].btl_handle = NULL; + rdma_region_handle->btl_handle = NULL; } - module->dynamic_handles[region_index].refcnt = 1; + assert(OMPI_SUCCESS == ompi_osc_rdma_add_attachment (rdma_region_handle, (intptr_t) base, len)); + + module->dynamic_handles[region_index] = rdma_region_handle; #if OPAL_ENABLE_DEBUG for (int i = 0 ; i < region_count + 1 ; ++i) { @@ -211,9 +292,11 @@ int ompi_osc_rdma_detach (struct ompi_win_t *win, const void *base) ompi_osc_rdma_module_t *module = GET_MODULE(win); const int my_rank = ompi_comm_rank (module->comm); ompi_osc_rdma_peer_dynamic_t *my_peer = (ompi_osc_rdma_peer_dynamic_t *) ompi_osc_rdma_module_peer (module, my_rank); + ompi_osc_rdma_handle_t *rdma_region_handle; osc_rdma_counter_t region_count, region_id; ompi_osc_rdma_region_t *region; - int region_index; + void *bound; + int start_index = INT_MAX, region_index; if (module->flavor != MPI_WIN_FLAVOR_DYNAMIC) { return OMPI_ERR_WIN; @@ -221,24 +304,34 @@ int ompi_osc_rdma_detach (struct ompi_win_t *win, const void *base) OPAL_THREAD_LOCK(&module->lock); - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "detach: %s, %p", win->w_name, base); - /* the upper 4 bytes of the region count are an instance counter */ region_count = module->state->region_count & 0xffffffffL; region_id = module->state->region_count >> 32; - region = ompi_osc_rdma_find_region_containing ((ompi_osc_rdma_region_t *) module->state->regions, 0, - region_count - 1, (intptr_t) base, (intptr_t) base + 1, - module->region_size, ®ion_index); - if (NULL == region) { - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "could not find dynamic memory region starting at %p", base); - OPAL_THREAD_UNLOCK(&module->lock); - return OMPI_ERROR; + /* look up the associated region */ + for (region_index = 0 ; region_index < region_count ; ++region_index) { + rdma_region_handle = module->dynamic_handles[region_index]; + region = (ompi_osc_rdma_region_t *) ((intptr_t) module->state->regions + region_index * module->region_size); + OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "checking attachments at index %d {.base=%p, len=%lu} for attachment %p" + ", region handle=%p", region_index, (void *) region->base, region->len, base, rdma_region_handle); + + if (region->base > (uintptr_t) base || (region->base + region->len) < (uintptr_t) base) { + continue; + } + + if (OPAL_SUCCESS == ompi_osc_rdma_remove_attachment (rdma_region_handle, (intptr_t) base)) { + break; + } } - if (--module->dynamic_handles[region_index].refcnt > 0) { + if (region_index == region_count) { + OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "could not find dynamic memory attachment for %p", base); OPAL_THREAD_UNLOCK(&module->lock); - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "detach complete"); + return OMPI_ERR_BASE; + } + + if (!opal_list_is_empty (&rdma_region_handle->attachments)) { + /* another region is referencing this attachment */ return OMPI_SUCCESS; } @@ -249,21 +342,21 @@ int ompi_osc_rdma_detach (struct ompi_win_t *win, const void *base) base, (void *)((intptr_t) base + region->len), region_index); if (module->selected_btl->btl_register_mem) { - ompi_osc_rdma_deregister (module, module->dynamic_handles[region_index].btl_handle); + ompi_osc_rdma_deregister (module, rdma_region_handle->btl_handle); - if (region_index < region_count - 1) { - memmove (module->dynamic_handles + region_index, module->dynamic_handles + region_index + 1, - (region_count - region_index - 1) * sizeof (void *)); - } - - memset (module->dynamic_handles + region_count - 1, 0, sizeof (module->dynamic_handles[0])); } if (region_index < region_count - 1) { + size_t end_count = region_count - region_index - 1; + memmove (module->dynamic_handles + region_index, module->dynamic_handles + region_index + 1, + end_count * sizeof (module->dynamic_handles[0])); memmove (region, (void *)((intptr_t) region + module->region_size), - (region_count - region_index - 1) * module->region_size);; + end_count * module->region_size); } + OBJ_RELEASE(rdma_region_handle); + module->dynamic_handles[region_count - 1] = NULL; + module->state->region_count = ((region_id + 1) << 32) | (region_count - 1); ompi_osc_rdma_lock_release_exclusive (module, &my_peer->super, offsetof (ompi_osc_rdma_state_t, regions_lock)); diff --git a/ompi/mca/osc/rdma/osc_rdma_module.c b/ompi/mca/osc/rdma/osc_rdma_module.c index 40765cb2d23..90e8b439c78 100644 --- a/ompi/mca/osc/rdma/osc_rdma_module.c +++ b/ompi/mca/osc/rdma/osc_rdma_module.c @@ -14,6 +14,7 @@ * Copyright (c) 2017 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. + * Copyright (c) 2020 Google, LLC. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -79,7 +80,9 @@ int ompi_osc_rdma_free(ompi_win_t *win) int region_count = module->state->region_count & 0xffffffffL; if (NULL != module->dynamic_handles) { for (int i = 0 ; i < region_count ; ++i) { - ompi_osc_rdma_deregister (module, module->dynamic_handles[i].btl_handle); + ompi_osc_rdma_handle_t *region_handle = module->dynamic_handles[i]; + ompi_osc_rdma_deregister (module, region_handle->btl_handle); + OBJ_RELEASE(region_handle); } free (module->dynamic_handles); diff --git a/ompi/mca/osc/rdma/osc_rdma_types.h b/ompi/mca/osc/rdma/osc_rdma_types.h index 4acb40154de..f4f96b8ff38 100644 --- a/ompi/mca/osc/rdma/osc_rdma_types.h +++ b/ompi/mca/osc/rdma/osc_rdma_types.h @@ -2,6 +2,7 @@ /* * Copyright (c) 2014-2018 Los Alamos National Security, LLC. All rights * reserved. + * Copyright (c) 2020 Google, LLC. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -109,6 +110,21 @@ struct ompi_osc_rdma_region_t { }; typedef struct ompi_osc_rdma_region_t ompi_osc_rdma_region_t; +/** + * @brief data handle for attached memory region + * + * This structure describes an attached memory region. It is used + * to track the exact parameters passed to MPI_Win_attach to + * validate a new attachment as well as handle detach. + */ +struct ompi_osc_rdma_attachment_t { + opal_list_item_t super; + intptr_t base; + size_t len; +}; +typedef struct ompi_osc_rdma_attachment_t ompi_osc_rdma_attachment_t; +OBJ_CLASS_DECLARATION(ompi_osc_rdma_attachment_t); + /** * @brief data handle for dynamic memory regions * @@ -118,12 +134,14 @@ typedef struct ompi_osc_rdma_region_t ompi_osc_rdma_region_t; * region associated with a page (or set of pages) has been attached. */ struct ompi_osc_rdma_handle_t { + opal_object_t super; /** btl handle for the memory region */ mca_btl_base_registration_handle_t *btl_handle; - /** number of attaches assocated with this region */ - int refcnt; + /** attached regions associated with this registration */ + opal_list_t attachments; }; typedef struct ompi_osc_rdma_handle_t ompi_osc_rdma_handle_t; +OBJ_CLASS_DECLARATION(ompi_osc_rdma_handle_t); /** * @brief number of state buffers that can be used for storing