Skip to content

Commit

Permalink
osc/rdma: modify attach to check for region overlap
Browse files Browse the repository at this point in the history
This commit addresses two issues in osc/rdma:

 1) It is erroneous to attach regions that overlap. This was being
    allowed but the standard does not allow overlapping attachments.

 2) Overlapping registration regions (4k alignment of attachments)
    appear to be allowed. Add attachment bases to the bookeeping
    structure so we can keep better track of what can be detached.

It is possible that the standard did not intend to allow #2. If that
is the case then #2 should fail in the same way as #1. There should
be no technical reason to disallow #2 at this time.

References #7384

Signed-off-by: Nathan Hjelm <hjelmn@google.com>
  • Loading branch information
hjelmn committed Feb 14, 2020
1 parent 5025628 commit 96ed630
Show file tree
Hide file tree
Showing 5 changed files with 186 additions and 67 deletions.
3 changes: 2 additions & 1 deletion ompi/mca/osc/rdma/osc_rdma.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2012-2013 Sandia National Laboratories. All rights reserved.
* Copyright (c) 2016-2018 Intel, Inc. All rights reserved.
* Copyright (c) 2020 Google, LLC. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
Expand Down Expand Up @@ -250,7 +251,7 @@ struct ompi_osc_rdma_module_t {

/** registration handles for dynamically attached regions. These are not stored
* in the state structure as it is entirely local. */
ompi_osc_rdma_handle_t *dynamic_handles;
ompi_osc_rdma_handle_t **dynamic_handles;

/** shared memory segment. this segment holds this node's portion of the rank -> node
* mapping array, node communication data (node_comm_info), state for all local ranks,
Expand Down
7 changes: 4 additions & 3 deletions ompi/mca/osc/rdma/osc_rdma_component.c
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
* Copyright (c) 2018 Amazon.com, Inc. or its affiliates. All Rights reserved.
* Copyright (c) 2019 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2020 Google, LLC. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
Expand Down Expand Up @@ -218,7 +219,7 @@ static int ompi_osc_rdma_component_register (void)
MCA_BASE_VAR_SCOPE_LOCAL, &mca_osc_rdma_component.buffer_size);
free(description_str);

mca_osc_rdma_component.max_attach = 32;
mca_osc_rdma_component.max_attach = 64;
opal_asprintf(&description_str, "Maximum number of buffers that can be attached to a dynamic window. "
"Keep in mind that each attached buffer will use a potentially limited "
"resource (default: %d)", mca_osc_rdma_component.max_attach);
Expand Down Expand Up @@ -1267,8 +1268,8 @@ static int ompi_osc_rdma_component_select (struct ompi_win_t *win, void **base,

if (MPI_WIN_FLAVOR_DYNAMIC == flavor) {
/* allocate space to store local btl handles for attached regions */
module->dynamic_handles = (ompi_osc_rdma_handle_t *) calloc (mca_osc_rdma_component.max_attach,
sizeof (module->dynamic_handles[0]));
module->dynamic_handles = (ompi_osc_rdma_handle_t **) calloc (mca_osc_rdma_component.max_attach,
sizeof (module->dynamic_handles[0]));
if (NULL == module->dynamic_handles) {
ompi_osc_rdma_free (win);
return OMPI_ERR_OUT_OF_RESOURCE;
Expand Down
216 changes: 156 additions & 60 deletions ompi/mca/osc/rdma/osc_rdma_dynamic.c
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
/*
* Copyright (c) 2014-2016 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2020 Google, LLC. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
Expand All @@ -16,6 +17,22 @@

#include "opal/util/sys_limits.h"

static void ompi_osc_rdma_handle_init (ompi_osc_rdma_handle_t *rdma_handle)
{
rdma_handle->btl_handle = NULL;
OBJ_CONSTRUCT(&rdma_handle->attachments, opal_list_t);
}

static void ompi_osc_rdma_handle_fini (ompi_osc_rdma_handle_t *rdma_handle)
{
OPAL_LIST_DESTRUCT(&rdma_handle->attachments);
}

OBJ_CLASS_INSTANCE(ompi_osc_rdma_handle_t, opal_object_t, ompi_osc_rdma_handle_init,
ompi_osc_rdma_handle_fini);

OBJ_CLASS_INSTANCE(ompi_osc_rdma_attachment_t, opal_list_item_t, NULL, NULL);

/**
* ompi_osc_rdma_find_region_containing:
*
Expand Down Expand Up @@ -48,13 +65,16 @@ static inline ompi_osc_rdma_region_t *ompi_osc_rdma_find_region_containing (ompi

region_bound = (intptr_t) (region->base + region->len);

OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "checking memory region %p-%p against %p-%p (index %d) (min_index = %d, max_index = %d)",
(void *) base, (void *) bound, (void *) region->base, (void *)(region->base + region->len), mid_index,
min_index, max_index);
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "checking memory region %p-%p against %p-%p (index %d) (min_index = %d, "
"max_index = %d)", (void *) base, (void *) bound, (void *) region->base,
(void *)(region->base + region->len), mid_index, min_index, max_index);

if (region->base > base) {
return ompi_osc_rdma_find_region_containing (regions, min_index, mid_index-1, base, bound, region_size, region_index);
} else if (bound <= region_bound) {
return ompi_osc_rdma_find_region_containing (regions, min_index, mid_index-1, base, bound, region_size,
region_index);
}

if (bound <= region_bound) {
if (region_index) {
*region_index = mid_index;
}
Expand All @@ -66,24 +86,76 @@ static inline ompi_osc_rdma_region_t *ompi_osc_rdma_find_region_containing (ompi
}

/* binary search for insertion point */
static ompi_osc_rdma_region_t *find_insertion_point (ompi_osc_rdma_region_t *regions, int min_index, int max_index, intptr_t base,
size_t region_size, int *region_index)
static ompi_osc_rdma_region_t *find_insertion_point (ompi_osc_rdma_region_t *regions, int min_index, int max_index,
intptr_t base, size_t region_size, int *region_index)
{
int mid_index = (max_index + min_index) >> 1;
ompi_osc_rdma_region_t *region = (ompi_osc_rdma_region_t *)((intptr_t) regions + mid_index * region_size);

OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "find_insertion_point (%d, %d, %lx, %lu)\n", min_index, max_index, base, region_size);
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "find_insertion_point (%d, %d, %lx, %lu)\n", min_index, max_index, base,
region_size);

if (max_index < min_index) {
*region_index = min_index;
return (ompi_osc_rdma_region_t *)((intptr_t) regions + min_index * region_size);
}

if (region->base > base) {
if (region->base > base || (region->base == base && region->len > region_size)) {
return find_insertion_point (regions, min_index, mid_index-1, base, region_size, region_index);
} else {
return find_insertion_point (regions, mid_index+1, max_index, base, region_size, region_index);
}

return find_insertion_point (regions, mid_index+1, max_index, base, region_size, region_index);
}

static bool ompi_osc_rdma_find_conflicting_attachment (ompi_osc_rdma_handle_t *handle, intptr_t base, intptr_t bound)
{
ompi_osc_rdma_attachment_t *attachment;

OPAL_LIST_FOREACH(attachment, &handle->attachments, ompi_osc_rdma_attachment_t) {
intptr_t region_bound = attachment->base + attachment->len;
if (base >= attachment->base && base < region_bound ||
bound > attachment->base && bound <= region_bound) {
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "existing region {%p, %p} overlaps region {%p, %p}",
(void *) attachment->base, (void *) region_bound, (void *) base, (void *) bound);
return true;
}
}

return false;
}

static int ompi_osc_rdma_add_attachment (ompi_osc_rdma_handle_t *handle, intptr_t base, size_t len)
{
ompi_osc_rdma_attachment_t *attachment = OBJ_NEW(ompi_osc_rdma_attachment_t);
assert (NULL != attachment);

if (ompi_osc_rdma_find_conflicting_attachment(handle, base, base + len)) {
return OMPI_ERR_RMA_ATTACH;
}

attachment->base = base;
attachment->len = len;

opal_list_append (&handle->attachments, &attachment->super);

return OMPI_SUCCESS;
}

static int ompi_osc_rdma_remove_attachment (ompi_osc_rdma_handle_t *handle, intptr_t base)
{
ompi_osc_rdma_attachment_t *attachment;

OPAL_LIST_FOREACH(attachment, &handle->attachments, ompi_osc_rdma_attachment_t) {
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "checking attachment %p against %p",
(void *) attachment->base, (void *) base);
if (attachment->base == (intptr_t) base) {
opal_list_remove_item (&handle->attachments, &attachment->super);
OBJ_RELEASE(attachment);
return OMPI_SUCCESS;
}
}

return OMPI_ERR_NOT_FOUND;
}

int ompi_osc_rdma_attach (struct ompi_win_t *win, void *base, size_t len)
Expand All @@ -92,12 +164,13 @@ int ompi_osc_rdma_attach (struct ompi_win_t *win, void *base, size_t len)
const int my_rank = ompi_comm_rank (module->comm);
ompi_osc_rdma_peer_t *my_peer = ompi_osc_rdma_module_peer (module, my_rank);
ompi_osc_rdma_region_t *region;
ompi_osc_rdma_handle_t *rdma_region_handle;
osc_rdma_counter_t region_count;
osc_rdma_counter_t region_id;
void *bound;
intptr_t bound, aligned_base, aligned_bound;
intptr_t page_size = opal_getpagesize ();
int region_index;
int ret;
int region_index, ret;
size_t aligned_len;

if (module->flavor != MPI_WIN_FLAVOR_DYNAMIC) {
return OMPI_ERR_RMA_FLAVOR;
Expand All @@ -117,23 +190,29 @@ int ompi_osc_rdma_attach (struct ompi_win_t *win, void *base, size_t len)

if (region_count == mca_osc_rdma_component.max_attach) {
OPAL_THREAD_UNLOCK(&module->lock);
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "attach: could not attach. max attachment count reached.");
return OMPI_ERR_RMA_ATTACH;
}

/* it is wasteful to register less than a page. this may allow the remote side to access more
* memory but the MPI standard covers this with calling the calling behavior erroneous */
bound = (void *)OPAL_ALIGN((intptr_t) base + len, page_size, intptr_t);
base = (void *)((intptr_t) base & ~(page_size - 1));
len = (size_t)((intptr_t) bound - (intptr_t) base);

/* see if a matching region already exists */
region = ompi_osc_rdma_find_region_containing ((ompi_osc_rdma_region_t *) module->state->regions, 0, region_count - 1, (intptr_t) base,
(intptr_t) bound, module->region_size, &region_index);
bound = (intptr_t) base + len;
aligned_bound = OPAL_ALIGN((intptr_t) base + len, page_size, intptr_t);
aligned_base = (intptr_t) base & ~(page_size - 1);
aligned_len = (size_t)(aligned_bound - aligned_base);

/* see if a registered region already exists */
region = ompi_osc_rdma_find_region_containing ((ompi_osc_rdma_region_t *) module->state->regions, 0, region_count - 1,
aligned_base, aligned_bound, module->region_size, &region_index);
if (NULL != region) {
++module->dynamic_handles[region_index].refcnt;
/* validates that the region does not overlap with an existing region even if they are on the same page */
ret = ompi_osc_rdma_add_attachment (module->dynamic_handles[region_index], (intptr_t) base, len);
OPAL_THREAD_UNLOCK(&module->lock);
if (OMPI_SUCCESS != ret) {
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "Add attachment failed with code %d", ret);
}
/* no need to invalidate remote caches */
return OMPI_SUCCESS;
return ret;
}

/* region is in flux */
Expand All @@ -144,45 +223,50 @@ int ompi_osc_rdma_attach (struct ompi_win_t *win, void *base, size_t len)

/* do a binary seach for where the region should be inserted */
if (region_count) {
region = find_insertion_point ((ompi_osc_rdma_region_t *) module->state->regions, 0, region_count - 1, (intptr_t) base,
module->region_size, &region_index);
region = find_insertion_point ((ompi_osc_rdma_region_t *) module->state->regions, 0, region_count - 1,
(intptr_t) base, module->region_size, &region_index);

if (region_index < region_count) {
memmove ((void *) ((intptr_t) region + module->region_size), region, (region_count - region_index) * module->region_size);

if (module->selected_btl->btl_register_mem) {
memmove (module->dynamic_handles + region_index + 1, module->dynamic_handles + region_index,
(region_count - region_index) * sizeof (module->dynamic_handles[0]));
}
memmove ((void *) ((intptr_t) region + module->region_size), region,
(region_count - region_index) * module->region_size);
memmove (module->dynamic_handles + region_index + 1, module->dynamic_handles + region_index,
(region_count - region_index) * sizeof (module->dynamic_handles[0]));
}
} else {
region_index = 0;
region = (ompi_osc_rdma_region_t *) module->state->regions;
}

region->base = (intptr_t) base;
region->len = len;
region->base = aligned_base;
region->len = aligned_len;

OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "attaching dynamic memory region {%p, %p} at index %d",
base, (void *)((intptr_t) base + len), region_index);
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "attaching dynamic memory region {%p, %p} aligned {%p, %p}, at index %d",
base, (void *) bound, (void *) aligned_base, (void *) aligned_bound, region_index);

/* add RDMA region handle to track this region */
rdma_region_handle = OBJ_NEW(ompi_osc_rdma_handle_t);
assert (NULL != rdma_region_handle);

if (module->selected_btl->btl_register_mem) {
mca_btl_base_registration_handle_t *handle;

ret = ompi_osc_rdma_register (module, MCA_BTL_ENDPOINT_ANY, (void *) region->base, region->len, MCA_BTL_REG_FLAG_ACCESS_ANY,
&handle);
ret = ompi_osc_rdma_register (module, MCA_BTL_ENDPOINT_ANY, (void *) region->base, region->len,
MCA_BTL_REG_FLAG_ACCESS_ANY, &handle);
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
OPAL_THREAD_UNLOCK(&module->lock);
OBJ_RELEASE(rdma_region_handle);
return OMPI_ERR_RMA_ATTACH;
}

memcpy (region->btl_handle_data, handle, module->selected_btl->btl_registration_handle_size);
module->dynamic_handles[region_index].btl_handle = handle;
rdma_region_handle->btl_handle = handle;
} else {
module->dynamic_handles[region_index].btl_handle = NULL;
rdma_region_handle->btl_handle = NULL;
}

module->dynamic_handles[region_index].refcnt = 1;
assert(OMPI_SUCCESS == ompi_osc_rdma_add_attachment (rdma_region_handle, (intptr_t) base, len));

module->dynamic_handles[region_index] = rdma_region_handle;

#if OPAL_ENABLE_DEBUG
for (int i = 0 ; i < region_count + 1 ; ++i) {
Expand Down Expand Up @@ -211,34 +295,46 @@ int ompi_osc_rdma_detach (struct ompi_win_t *win, const void *base)
ompi_osc_rdma_module_t *module = GET_MODULE(win);
const int my_rank = ompi_comm_rank (module->comm);
ompi_osc_rdma_peer_dynamic_t *my_peer = (ompi_osc_rdma_peer_dynamic_t *) ompi_osc_rdma_module_peer (module, my_rank);
ompi_osc_rdma_handle_t *rdma_region_handle;
osc_rdma_counter_t region_count, region_id;
ompi_osc_rdma_region_t *region;
int region_index;
void *bound;
int start_index = INT_MAX, region_index;

if (module->flavor != MPI_WIN_FLAVOR_DYNAMIC) {
return OMPI_ERR_WIN;
}

OPAL_THREAD_LOCK(&module->lock);

OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "detach: %s, %p", win->w_name, base);

/* the upper 4 bytes of the region count are an instance counter */
region_count = module->state->region_count & 0xffffffffL;
region_id = module->state->region_count >> 32;

region = ompi_osc_rdma_find_region_containing ((ompi_osc_rdma_region_t *) module->state->regions, 0,
region_count - 1, (intptr_t) base, (intptr_t) base + 1,
module->region_size, &region_index);
if (NULL == region) {
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "could not find dynamic memory region starting at %p", base);
OPAL_THREAD_UNLOCK(&module->lock);
return OMPI_ERROR;
/* look up the associated region */
for (region_index = 0 ; region_index < region_count ; ++region_index) {
rdma_region_handle = module->dynamic_handles[region_index];
region = (ompi_osc_rdma_region_t *) ((intptr_t) module->state->regions + region_index * module->region_size);
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "checking attachments at index %d {.base=%p, len=%lu} for attachment %p"
", region handle=%p", region_index, (void *) region->base, region->len, base, rdma_region_handle);

if (region->base > (uintptr_t) base || (region->base + region->len) < (uintptr_t) base) {
continue;
}

if (OPAL_SUCCESS == ompi_osc_rdma_remove_attachment (rdma_region_handle, (intptr_t) base)) {
break;
}
}

if (--module->dynamic_handles[region_index].refcnt > 0) {
if (region_index == region_count) {
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "could not find dynamic memory attachment for %p", base);
OPAL_THREAD_UNLOCK(&module->lock);
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "detach complete");
return OMPI_ERR_BASE;
}

if (!opal_list_is_empty (&rdma_region_handle->attachments)) {
/* another region is referencing this attachment */
return OMPI_SUCCESS;
}

Expand All @@ -249,21 +345,21 @@ int ompi_osc_rdma_detach (struct ompi_win_t *win, const void *base)
base, (void *)((intptr_t) base + region->len), region_index);

if (module->selected_btl->btl_register_mem) {
ompi_osc_rdma_deregister (module, module->dynamic_handles[region_index].btl_handle);
ompi_osc_rdma_deregister (module, rdma_region_handle->btl_handle);

if (region_index < region_count - 1) {
memmove (module->dynamic_handles + region_index, module->dynamic_handles + region_index + 1,
(region_count - region_index - 1) * sizeof (void *));
}

memset (module->dynamic_handles + region_count - 1, 0, sizeof (module->dynamic_handles[0]));
}

if (region_index < region_count - 1) {
size_t end_count = region_count - region_index - 1;
memmove (module->dynamic_handles + region_index, module->dynamic_handles + region_index + 1,
end_count * sizeof (module->dynamic_handles[0]));
memmove (region, (void *)((intptr_t) region + module->region_size),
(region_count - region_index - 1) * module->region_size);;
end_count * module->region_size);
}

OBJ_RELEASE(rdma_region_handle);
module->dynamic_handles[region_count - 1] = NULL;

module->state->region_count = ((region_id + 1) << 32) | (region_count - 1);

ompi_osc_rdma_lock_release_exclusive (module, &my_peer->super, offsetof (ompi_osc_rdma_state_t, regions_lock));
Expand Down
Loading

0 comments on commit 96ed630

Please sign in to comment.