Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

osc/rdma: modify attach to check for region overlap #7387

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion ompi/mca/osc/rdma/osc_rdma.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2012-2013 Sandia National Laboratories. All rights reserved.
* Copyright (c) 2016-2018 Intel, Inc. All rights reserved.
* Copyright (c) 2020 Google, LLC. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
Expand Down Expand Up @@ -250,7 +251,7 @@ struct ompi_osc_rdma_module_t {

/** registration handles for dynamically attached regions. These are not stored
* in the state structure as it is entirely local. */
ompi_osc_rdma_handle_t *dynamic_handles;
ompi_osc_rdma_handle_t **dynamic_handles;

/** shared memory segment. this segment holds this node's portion of the rank -> node
* mapping array, node communication data (node_comm_info), state for all local ranks,
Expand Down
9 changes: 5 additions & 4 deletions ompi/mca/osc/rdma/osc_rdma_component.c
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
* Copyright (c) 2018 Amazon.com, Inc. or its affiliates. All Rights reserved.
* Copyright (c) 2019 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2020 Google, LLC. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
Expand Down Expand Up @@ -218,11 +219,11 @@ static int ompi_osc_rdma_component_register (void)
MCA_BASE_VAR_SCOPE_LOCAL, &mca_osc_rdma_component.buffer_size);
free(description_str);

mca_osc_rdma_component.max_attach = 32;
mca_osc_rdma_component.max_attach = 64;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Call out this change in the commit log? :)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I should probably break it out into a different commit. I realized that 32, while safer for Cray systems, is probably too low for most use cases.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done.

opal_asprintf(&description_str, "Maximum number of buffers that can be attached to a dynamic window. "
"Keep in mind that each attached buffer will use a potentially limited "
"resource (default: %d)", mca_osc_rdma_component.max_attach);
(void) mca_base_component_var_register (&mca_osc_rdma_component.super.osc_version, "max_attach", description_str,
(void) mca_base_component_var_register (&mca_osc_rdma_component.super.osc_version, "max_attach", description_str,
MCA_BASE_VAR_TYPE_UNSIGNED_INT, NULL, 0, 0, OPAL_INFO_LVL_3,
MCA_BASE_VAR_SCOPE_GROUP, &mca_osc_rdma_component.max_attach);
free(description_str);
Expand Down Expand Up @@ -1267,8 +1268,8 @@ static int ompi_osc_rdma_component_select (struct ompi_win_t *win, void **base,

if (MPI_WIN_FLAVOR_DYNAMIC == flavor) {
/* allocate space to store local btl handles for attached regions */
module->dynamic_handles = (ompi_osc_rdma_handle_t *) calloc (mca_osc_rdma_component.max_attach,
sizeof (module->dynamic_handles[0]));
module->dynamic_handles = (ompi_osc_rdma_handle_t **) calloc (mca_osc_rdma_component.max_attach,
sizeof (module->dynamic_handles[0]));
if (NULL == module->dynamic_handles) {
ompi_osc_rdma_free (win);
return OMPI_ERR_OUT_OF_RESOURCE;
Expand Down
213 changes: 153 additions & 60 deletions ompi/mca/osc/rdma/osc_rdma_dynamic.c
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
/*
* Copyright (c) 2014-2016 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2020 Google, LLC. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
Expand All @@ -16,6 +17,22 @@

#include "opal/util/sys_limits.h"

static void ompi_osc_rdma_handle_init (ompi_osc_rdma_handle_t *rdma_handle)
{
rdma_handle->btl_handle = NULL;
OBJ_CONSTRUCT(&rdma_handle->attachments, opal_list_t);
}

static void ompi_osc_rdma_handle_fini (ompi_osc_rdma_handle_t *rdma_handle)
{
OPAL_LIST_DESTRUCT(&rdma_handle->attachments);
}

OBJ_CLASS_INSTANCE(ompi_osc_rdma_handle_t, opal_object_t, ompi_osc_rdma_handle_init,
ompi_osc_rdma_handle_fini);

OBJ_CLASS_INSTANCE(ompi_osc_rdma_attachment_t, opal_list_item_t, NULL, NULL);

/**
* ompi_osc_rdma_find_region_containing:
*
Expand Down Expand Up @@ -48,13 +65,16 @@ static inline ompi_osc_rdma_region_t *ompi_osc_rdma_find_region_containing (ompi

region_bound = (intptr_t) (region->base + region->len);

OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "checking memory region %p-%p against %p-%p (index %d) (min_index = %d, max_index = %d)",
(void *) base, (void *) bound, (void *) region->base, (void *)(region->base + region->len), mid_index,
min_index, max_index);
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "checking memory region %p-%p against %p-%p (index %d) (min_index = %d, "
"max_index = %d)", (void *) base, (void *) bound, (void *) region->base,
(void *)(region->base + region->len), mid_index, min_index, max_index);

if (region->base > base) {
return ompi_osc_rdma_find_region_containing (regions, min_index, mid_index-1, base, bound, region_size, region_index);
} else if (bound <= region_bound) {
return ompi_osc_rdma_find_region_containing (regions, min_index, mid_index-1, base, bound, region_size,
region_index);
}

if (bound <= region_bound) {
if (region_index) {
*region_index = mid_index;
}
Expand All @@ -66,24 +86,76 @@ static inline ompi_osc_rdma_region_t *ompi_osc_rdma_find_region_containing (ompi
}

/* binary search for insertion point */
static ompi_osc_rdma_region_t *find_insertion_point (ompi_osc_rdma_region_t *regions, int min_index, int max_index, intptr_t base,
size_t region_size, int *region_index)
static ompi_osc_rdma_region_t *find_insertion_point (ompi_osc_rdma_region_t *regions, int min_index, int max_index,
intptr_t base, size_t region_size, int *region_index)
{
int mid_index = (max_index + min_index) >> 1;
ompi_osc_rdma_region_t *region = (ompi_osc_rdma_region_t *)((intptr_t) regions + mid_index * region_size);

OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "find_insertion_point (%d, %d, %lx, %lu)\n", min_index, max_index, base, region_size);
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "find_insertion_point (%d, %d, %lx, %lu)\n", min_index, max_index, base,
region_size);

if (max_index < min_index) {
*region_index = min_index;
return (ompi_osc_rdma_region_t *)((intptr_t) regions + min_index * region_size);
}

if (region->base > base) {
if (region->base > base || (region->base == base && region->len > region_size)) {
return find_insertion_point (regions, min_index, mid_index-1, base, region_size, region_index);
} else {
return find_insertion_point (regions, mid_index+1, max_index, base, region_size, region_index);
}

return find_insertion_point (regions, mid_index+1, max_index, base, region_size, region_index);
}

static bool ompi_osc_rdma_find_conflicting_attachment (ompi_osc_rdma_handle_t *handle, intptr_t base, intptr_t bound)
{
ompi_osc_rdma_attachment_t *attachment;

OPAL_LIST_FOREACH(attachment, &handle->attachments, ompi_osc_rdma_attachment_t) {
intptr_t region_bound = attachment->base + attachment->len;
if (base >= attachment->base && base < region_bound ||
bound > attachment->base && bound <= region_bound) {
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "existing region {%p, %p} overlaps region {%p, %p}",
(void *) attachment->base, (void *) region_bound, (void *) base, (void *) bound);
return true;
}
}

return false;
}

static int ompi_osc_rdma_add_attachment (ompi_osc_rdma_handle_t *handle, intptr_t base, size_t len)
{
ompi_osc_rdma_attachment_t *attachment = OBJ_NEW(ompi_osc_rdma_attachment_t);
assert (NULL != attachment);

if (ompi_osc_rdma_find_conflicting_attachment(handle, base, base + len)) {
return OMPI_ERR_RMA_ATTACH;
}

attachment->base = base;
attachment->len = len;

opal_list_append (&handle->attachments, &attachment->super);

return OMPI_SUCCESS;
}

static int ompi_osc_rdma_remove_attachment (ompi_osc_rdma_handle_t *handle, intptr_t base)
{
ompi_osc_rdma_attachment_t *attachment;

OPAL_LIST_FOREACH(attachment, &handle->attachments, ompi_osc_rdma_attachment_t) {
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "checking attachment %p against %p",
(void *) attachment->base, (void *) base);
if (attachment->base == (intptr_t) base) {
opal_list_remove_item (&handle->attachments, &attachment->super);
OBJ_RELEASE(attachment);
return OMPI_SUCCESS;
}
}

return OMPI_ERR_NOT_FOUND;
}

int ompi_osc_rdma_attach (struct ompi_win_t *win, void *base, size_t len)
Expand All @@ -92,12 +164,13 @@ int ompi_osc_rdma_attach (struct ompi_win_t *win, void *base, size_t len)
const int my_rank = ompi_comm_rank (module->comm);
ompi_osc_rdma_peer_t *my_peer = ompi_osc_rdma_module_peer (module, my_rank);
ompi_osc_rdma_region_t *region;
ompi_osc_rdma_handle_t *rdma_region_handle;
osc_rdma_counter_t region_count;
osc_rdma_counter_t region_id;
void *bound;
intptr_t bound, aligned_base, aligned_bound;
intptr_t page_size = opal_getpagesize ();
int region_index;
int ret;
int region_index, ret;
size_t aligned_len;

if (module->flavor != MPI_WIN_FLAVOR_DYNAMIC) {
return OMPI_ERR_RMA_FLAVOR;
Expand All @@ -117,23 +190,26 @@ int ompi_osc_rdma_attach (struct ompi_win_t *win, void *base, size_t len)

if (region_count == mca_osc_rdma_component.max_attach) {
OPAL_THREAD_UNLOCK(&module->lock);
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "attach: could not attach. max attachment count reached.");
return OMPI_ERR_RMA_ATTACH;
}

/* it is wasteful to register less than a page. this may allow the remote side to access more
* memory but the MPI standard covers this with calling the calling behavior erroneous */
bound = (void *)OPAL_ALIGN((intptr_t) base + len, page_size, intptr_t);
base = (void *)((intptr_t) base & ~(page_size - 1));
len = (size_t)((intptr_t) bound - (intptr_t) base);

/* see if a matching region already exists */
region = ompi_osc_rdma_find_region_containing ((ompi_osc_rdma_region_t *) module->state->regions, 0, region_count - 1, (intptr_t) base,
(intptr_t) bound, module->region_size, &region_index);
bound = (intptr_t) base + len;
aligned_bound = OPAL_ALIGN((intptr_t) base + len, page_size, intptr_t);
aligned_base = (intptr_t) base & ~(page_size - 1);
aligned_len = (size_t)(aligned_bound - aligned_base);

/* see if a registered region already exists */
region = ompi_osc_rdma_find_region_containing ((ompi_osc_rdma_region_t *) module->state->regions, 0, region_count - 1,
aligned_base, aligned_bound, module->region_size, &region_index);
if (NULL != region) {
++module->dynamic_handles[region_index].refcnt;
/* validates that the region does not overlap with an existing region even if they are on the same page */
ret = ompi_osc_rdma_add_attachment (module->dynamic_handles[region_index], (intptr_t) base, len);
OPAL_THREAD_UNLOCK(&module->lock);
/* no need to invalidate remote caches */
return OMPI_SUCCESS;
return ret;
}

/* region is in flux */
Expand All @@ -144,45 +220,50 @@ int ompi_osc_rdma_attach (struct ompi_win_t *win, void *base, size_t len)

/* do a binary seach for where the region should be inserted */
if (region_count) {
region = find_insertion_point ((ompi_osc_rdma_region_t *) module->state->regions, 0, region_count - 1, (intptr_t) base,
module->region_size, &region_index);
region = find_insertion_point ((ompi_osc_rdma_region_t *) module->state->regions, 0, region_count - 1,
(intptr_t) base, module->region_size, &region_index);

if (region_index < region_count) {
memmove ((void *) ((intptr_t) region + module->region_size), region, (region_count - region_index) * module->region_size);

if (module->selected_btl->btl_register_mem) {
memmove (module->dynamic_handles + region_index + 1, module->dynamic_handles + region_index,
(region_count - region_index) * sizeof (module->dynamic_handles[0]));
}
memmove ((void *) ((intptr_t) region + module->region_size), region,
(region_count - region_index) * module->region_size);
memmove (module->dynamic_handles + region_index + 1, module->dynamic_handles + region_index,
(region_count - region_index) * sizeof (module->dynamic_handles[0]));
}
} else {
region_index = 0;
region = (ompi_osc_rdma_region_t *) module->state->regions;
}

region->base = (intptr_t) base;
region->len = len;
region->base = aligned_base;
region->len = aligned_len;

OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "attaching dynamic memory region {%p, %p} aligned {%p, %p}, at index %d",
base, (void *) bound, (void *) aligned_base, (void *) aligned_bound, region_index);

OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "attaching dynamic memory region {%p, %p} at index %d",
base, (void *)((intptr_t) base + len), region_index);
/* add RDMA region handle to track this region */
rdma_region_handle = OBJ_NEW(ompi_osc_rdma_handle_t);
assert (NULL != rdma_region_handle);

if (module->selected_btl->btl_register_mem) {
mca_btl_base_registration_handle_t *handle;

ret = ompi_osc_rdma_register (module, MCA_BTL_ENDPOINT_ANY, (void *) region->base, region->len, MCA_BTL_REG_FLAG_ACCESS_ANY,
&handle);
ret = ompi_osc_rdma_register (module, MCA_BTL_ENDPOINT_ANY, (void *) region->base, region->len,
MCA_BTL_REG_FLAG_ACCESS_ANY, &handle);
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
OPAL_THREAD_UNLOCK(&module->lock);
OBJ_RELEASE(rdma_region_handle);
return OMPI_ERR_RMA_ATTACH;
}

memcpy (region->btl_handle_data, handle, module->selected_btl->btl_registration_handle_size);
module->dynamic_handles[region_index].btl_handle = handle;
rdma_region_handle->btl_handle = handle;
} else {
module->dynamic_handles[region_index].btl_handle = NULL;
rdma_region_handle->btl_handle = NULL;
}

module->dynamic_handles[region_index].refcnt = 1;
assert(OMPI_SUCCESS == ompi_osc_rdma_add_attachment (rdma_region_handle, (intptr_t) base, len));

module->dynamic_handles[region_index] = rdma_region_handle;

#if OPAL_ENABLE_DEBUG
for (int i = 0 ; i < region_count + 1 ; ++i) {
Expand Down Expand Up @@ -211,34 +292,46 @@ int ompi_osc_rdma_detach (struct ompi_win_t *win, const void *base)
ompi_osc_rdma_module_t *module = GET_MODULE(win);
const int my_rank = ompi_comm_rank (module->comm);
ompi_osc_rdma_peer_dynamic_t *my_peer = (ompi_osc_rdma_peer_dynamic_t *) ompi_osc_rdma_module_peer (module, my_rank);
ompi_osc_rdma_handle_t *rdma_region_handle;
osc_rdma_counter_t region_count, region_id;
ompi_osc_rdma_region_t *region;
int region_index;
void *bound;
int start_index = INT_MAX, region_index;

if (module->flavor != MPI_WIN_FLAVOR_DYNAMIC) {
return OMPI_ERR_WIN;
}

OPAL_THREAD_LOCK(&module->lock);

OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "detach: %s, %p", win->w_name, base);

/* the upper 4 bytes of the region count are an instance counter */
region_count = module->state->region_count & 0xffffffffL;
region_id = module->state->region_count >> 32;

region = ompi_osc_rdma_find_region_containing ((ompi_osc_rdma_region_t *) module->state->regions, 0,
region_count - 1, (intptr_t) base, (intptr_t) base + 1,
module->region_size, &region_index);
if (NULL == region) {
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "could not find dynamic memory region starting at %p", base);
OPAL_THREAD_UNLOCK(&module->lock);
return OMPI_ERROR;
/* look up the associated region */
for (region_index = 0 ; region_index < region_count ; ++region_index) {
rdma_region_handle = module->dynamic_handles[region_index];
region = (ompi_osc_rdma_region_t *) ((intptr_t) module->state->regions + region_index * module->region_size);
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "checking attachments at index %d {.base=%p, len=%lu} for attachment %p"
", region handle=%p", region_index, (void *) region->base, region->len, base, rdma_region_handle);

if (region->base > (uintptr_t) base || (region->base + region->len) < (uintptr_t) base) {
continue;
}

if (OPAL_SUCCESS == ompi_osc_rdma_remove_attachment (rdma_region_handle, (intptr_t) base)) {
break;
}
}

if (--module->dynamic_handles[region_index].refcnt > 0) {
if (region_index == region_count) {
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "could not find dynamic memory attachment for %p", base);
OPAL_THREAD_UNLOCK(&module->lock);
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "detach complete");
return OMPI_ERR_BASE;
}

if (!opal_list_is_empty (&rdma_region_handle->attachments)) {
/* another region is referencing this attachment */
return OMPI_SUCCESS;
}

Expand All @@ -249,21 +342,21 @@ int ompi_osc_rdma_detach (struct ompi_win_t *win, const void *base)
base, (void *)((intptr_t) base + region->len), region_index);

if (module->selected_btl->btl_register_mem) {
ompi_osc_rdma_deregister (module, module->dynamic_handles[region_index].btl_handle);
ompi_osc_rdma_deregister (module, rdma_region_handle->btl_handle);

if (region_index < region_count - 1) {
memmove (module->dynamic_handles + region_index, module->dynamic_handles + region_index + 1,
(region_count - region_index - 1) * sizeof (void *));
}

memset (module->dynamic_handles + region_count - 1, 0, sizeof (module->dynamic_handles[0]));
}

if (region_index < region_count - 1) {
size_t end_count = region_count - region_index - 1;
memmove (module->dynamic_handles + region_index, module->dynamic_handles + region_index + 1,
end_count * sizeof (module->dynamic_handles[0]));
memmove (region, (void *)((intptr_t) region + module->region_size),
(region_count - region_index - 1) * module->region_size);;
end_count * module->region_size);
}

OBJ_RELEASE(rdma_region_handle);
module->dynamic_handles[region_count - 1] = NULL;

module->state->region_count = ((region_id + 1) << 32) | (region_count - 1);

ompi_osc_rdma_lock_release_exclusive (module, &my_peer->super, offsetof (ompi_osc_rdma_state_t, regions_lock));
Expand Down
Loading