Skip to content

Commit 96ed630

Browse files
committed
osc/rdma: modify attach to check for region overlap
This commit addresses two issues in osc/rdma: 1) It is erroneous to attach regions that overlap. This was being allowed but the standard does not allow overlapping attachments. 2) Overlapping registration regions (4k alignment of attachments) appear to be allowed. Add attachment bases to the bookeeping structure so we can keep better track of what can be detached. It is possible that the standard did not intend to allow #2. If that is the case then #2 should fail in the same way as #1. There should be no technical reason to disallow #2 at this time. References #7384 Signed-off-by: Nathan Hjelm <hjelmn@google.com>
1 parent 5025628 commit 96ed630

File tree

5 files changed

+186
-67
lines changed

5 files changed

+186
-67
lines changed

ompi/mca/osc/rdma/osc_rdma.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
1414
* Copyright (c) 2012-2013 Sandia National Laboratories. All rights reserved.
1515
* Copyright (c) 2016-2018 Intel, Inc. All rights reserved.
16+
* Copyright (c) 2020 Google, LLC. All rights reserved.
1617
* $COPYRIGHT$
1718
*
1819
* Additional copyrights may follow
@@ -250,7 +251,7 @@ struct ompi_osc_rdma_module_t {
250251

251252
/** registration handles for dynamically attached regions. These are not stored
252253
* in the state structure as it is entirely local. */
253-
ompi_osc_rdma_handle_t *dynamic_handles;
254+
ompi_osc_rdma_handle_t **dynamic_handles;
254255

255256
/** shared memory segment. this segment holds this node's portion of the rank -> node
256257
* mapping array, node communication data (node_comm_info), state for all local ranks,

ompi/mca/osc/rdma/osc_rdma_component.c

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
* Copyright (c) 2018 Amazon.com, Inc. or its affiliates. All Rights reserved.
2222
* Copyright (c) 2019 Research Organization for Information Science
2323
* and Technology (RIST). All rights reserved.
24+
* Copyright (c) 2020 Google, LLC. All rights reserved.
2425
* $COPYRIGHT$
2526
*
2627
* Additional copyrights may follow
@@ -218,7 +219,7 @@ static int ompi_osc_rdma_component_register (void)
218219
MCA_BASE_VAR_SCOPE_LOCAL, &mca_osc_rdma_component.buffer_size);
219220
free(description_str);
220221

221-
mca_osc_rdma_component.max_attach = 32;
222+
mca_osc_rdma_component.max_attach = 64;
222223
opal_asprintf(&description_str, "Maximum number of buffers that can be attached to a dynamic window. "
223224
"Keep in mind that each attached buffer will use a potentially limited "
224225
"resource (default: %d)", mca_osc_rdma_component.max_attach);
@@ -1267,8 +1268,8 @@ static int ompi_osc_rdma_component_select (struct ompi_win_t *win, void **base,
12671268

12681269
if (MPI_WIN_FLAVOR_DYNAMIC == flavor) {
12691270
/* allocate space to store local btl handles for attached regions */
1270-
module->dynamic_handles = (ompi_osc_rdma_handle_t *) calloc (mca_osc_rdma_component.max_attach,
1271-
sizeof (module->dynamic_handles[0]));
1271+
module->dynamic_handles = (ompi_osc_rdma_handle_t **) calloc (mca_osc_rdma_component.max_attach,
1272+
sizeof (module->dynamic_handles[0]));
12721273
if (NULL == module->dynamic_handles) {
12731274
ompi_osc_rdma_free (win);
12741275
return OMPI_ERR_OUT_OF_RESOURCE;

ompi/mca/osc/rdma/osc_rdma_dynamic.c

Lines changed: 156 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
/*
33
* Copyright (c) 2014-2016 Los Alamos National Security, LLC. All rights
44
* reserved.
5+
* Copyright (c) 2020 Google, LLC. All rights reserved.
56
* $COPYRIGHT$
67
*
78
* Additional copyrights may follow
@@ -16,6 +17,22 @@
1617

1718
#include "opal/util/sys_limits.h"
1819

20+
static void ompi_osc_rdma_handle_init (ompi_osc_rdma_handle_t *rdma_handle)
21+
{
22+
rdma_handle->btl_handle = NULL;
23+
OBJ_CONSTRUCT(&rdma_handle->attachments, opal_list_t);
24+
}
25+
26+
static void ompi_osc_rdma_handle_fini (ompi_osc_rdma_handle_t *rdma_handle)
27+
{
28+
OPAL_LIST_DESTRUCT(&rdma_handle->attachments);
29+
}
30+
31+
OBJ_CLASS_INSTANCE(ompi_osc_rdma_handle_t, opal_object_t, ompi_osc_rdma_handle_init,
32+
ompi_osc_rdma_handle_fini);
33+
34+
OBJ_CLASS_INSTANCE(ompi_osc_rdma_attachment_t, opal_list_item_t, NULL, NULL);
35+
1936
/**
2037
* ompi_osc_rdma_find_region_containing:
2138
*
@@ -48,13 +65,16 @@ static inline ompi_osc_rdma_region_t *ompi_osc_rdma_find_region_containing (ompi
4865

4966
region_bound = (intptr_t) (region->base + region->len);
5067

51-
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "checking memory region %p-%p against %p-%p (index %d) (min_index = %d, max_index = %d)",
52-
(void *) base, (void *) bound, (void *) region->base, (void *)(region->base + region->len), mid_index,
53-
min_index, max_index);
68+
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "checking memory region %p-%p against %p-%p (index %d) (min_index = %d, "
69+
"max_index = %d)", (void *) base, (void *) bound, (void *) region->base,
70+
(void *)(region->base + region->len), mid_index, min_index, max_index);
5471

5572
if (region->base > base) {
56-
return ompi_osc_rdma_find_region_containing (regions, min_index, mid_index-1, base, bound, region_size, region_index);
57-
} else if (bound <= region_bound) {
73+
return ompi_osc_rdma_find_region_containing (regions, min_index, mid_index-1, base, bound, region_size,
74+
region_index);
75+
}
76+
77+
if (bound <= region_bound) {
5878
if (region_index) {
5979
*region_index = mid_index;
6080
}
@@ -66,24 +86,76 @@ static inline ompi_osc_rdma_region_t *ompi_osc_rdma_find_region_containing (ompi
6686
}
6787

6888
/* binary search for insertion point */
69-
static ompi_osc_rdma_region_t *find_insertion_point (ompi_osc_rdma_region_t *regions, int min_index, int max_index, intptr_t base,
70-
size_t region_size, int *region_index)
89+
static ompi_osc_rdma_region_t *find_insertion_point (ompi_osc_rdma_region_t *regions, int min_index, int max_index,
90+
intptr_t base, size_t region_size, int *region_index)
7191
{
7292
int mid_index = (max_index + min_index) >> 1;
7393
ompi_osc_rdma_region_t *region = (ompi_osc_rdma_region_t *)((intptr_t) regions + mid_index * region_size);
7494

75-
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "find_insertion_point (%d, %d, %lx, %lu)\n", min_index, max_index, base, region_size);
95+
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "find_insertion_point (%d, %d, %lx, %lu)\n", min_index, max_index, base,
96+
region_size);
7697

7798
if (max_index < min_index) {
7899
*region_index = min_index;
79100
return (ompi_osc_rdma_region_t *)((intptr_t) regions + min_index * region_size);
80101
}
81102

82-
if (region->base > base) {
103+
if (region->base > base || (region->base == base && region->len > region_size)) {
83104
return find_insertion_point (regions, min_index, mid_index-1, base, region_size, region_index);
84-
} else {
85-
return find_insertion_point (regions, mid_index+1, max_index, base, region_size, region_index);
86105
}
106+
107+
return find_insertion_point (regions, mid_index+1, max_index, base, region_size, region_index);
108+
}
109+
110+
static bool ompi_osc_rdma_find_conflicting_attachment (ompi_osc_rdma_handle_t *handle, intptr_t base, intptr_t bound)
111+
{
112+
ompi_osc_rdma_attachment_t *attachment;
113+
114+
OPAL_LIST_FOREACH(attachment, &handle->attachments, ompi_osc_rdma_attachment_t) {
115+
intptr_t region_bound = attachment->base + attachment->len;
116+
if (base >= attachment->base && base < region_bound ||
117+
bound > attachment->base && bound <= region_bound) {
118+
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "existing region {%p, %p} overlaps region {%p, %p}",
119+
(void *) attachment->base, (void *) region_bound, (void *) base, (void *) bound);
120+
return true;
121+
}
122+
}
123+
124+
return false;
125+
}
126+
127+
static int ompi_osc_rdma_add_attachment (ompi_osc_rdma_handle_t *handle, intptr_t base, size_t len)
128+
{
129+
ompi_osc_rdma_attachment_t *attachment = OBJ_NEW(ompi_osc_rdma_attachment_t);
130+
assert (NULL != attachment);
131+
132+
if (ompi_osc_rdma_find_conflicting_attachment(handle, base, base + len)) {
133+
return OMPI_ERR_RMA_ATTACH;
134+
}
135+
136+
attachment->base = base;
137+
attachment->len = len;
138+
139+
opal_list_append (&handle->attachments, &attachment->super);
140+
141+
return OMPI_SUCCESS;
142+
}
143+
144+
static int ompi_osc_rdma_remove_attachment (ompi_osc_rdma_handle_t *handle, intptr_t base)
145+
{
146+
ompi_osc_rdma_attachment_t *attachment;
147+
148+
OPAL_LIST_FOREACH(attachment, &handle->attachments, ompi_osc_rdma_attachment_t) {
149+
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "checking attachment %p against %p",
150+
(void *) attachment->base, (void *) base);
151+
if (attachment->base == (intptr_t) base) {
152+
opal_list_remove_item (&handle->attachments, &attachment->super);
153+
OBJ_RELEASE(attachment);
154+
return OMPI_SUCCESS;
155+
}
156+
}
157+
158+
return OMPI_ERR_NOT_FOUND;
87159
}
88160

89161
int ompi_osc_rdma_attach (struct ompi_win_t *win, void *base, size_t len)
@@ -92,12 +164,13 @@ int ompi_osc_rdma_attach (struct ompi_win_t *win, void *base, size_t len)
92164
const int my_rank = ompi_comm_rank (module->comm);
93165
ompi_osc_rdma_peer_t *my_peer = ompi_osc_rdma_module_peer (module, my_rank);
94166
ompi_osc_rdma_region_t *region;
167+
ompi_osc_rdma_handle_t *rdma_region_handle;
95168
osc_rdma_counter_t region_count;
96169
osc_rdma_counter_t region_id;
97-
void *bound;
170+
intptr_t bound, aligned_base, aligned_bound;
98171
intptr_t page_size = opal_getpagesize ();
99-
int region_index;
100-
int ret;
172+
int region_index, ret;
173+
size_t aligned_len;
101174

102175
if (module->flavor != MPI_WIN_FLAVOR_DYNAMIC) {
103176
return OMPI_ERR_RMA_FLAVOR;
@@ -117,23 +190,29 @@ int ompi_osc_rdma_attach (struct ompi_win_t *win, void *base, size_t len)
117190

118191
if (region_count == mca_osc_rdma_component.max_attach) {
119192
OPAL_THREAD_UNLOCK(&module->lock);
193+
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "attach: could not attach. max attachment count reached.");
120194
return OMPI_ERR_RMA_ATTACH;
121195
}
122196

123197
/* it is wasteful to register less than a page. this may allow the remote side to access more
124198
* memory but the MPI standard covers this with calling the calling behavior erroneous */
125-
bound = (void *)OPAL_ALIGN((intptr_t) base + len, page_size, intptr_t);
126-
base = (void *)((intptr_t) base & ~(page_size - 1));
127-
len = (size_t)((intptr_t) bound - (intptr_t) base);
128-
129-
/* see if a matching region already exists */
130-
region = ompi_osc_rdma_find_region_containing ((ompi_osc_rdma_region_t *) module->state->regions, 0, region_count - 1, (intptr_t) base,
131-
(intptr_t) bound, module->region_size, &region_index);
199+
bound = (intptr_t) base + len;
200+
aligned_bound = OPAL_ALIGN((intptr_t) base + len, page_size, intptr_t);
201+
aligned_base = (intptr_t) base & ~(page_size - 1);
202+
aligned_len = (size_t)(aligned_bound - aligned_base);
203+
204+
/* see if a registered region already exists */
205+
region = ompi_osc_rdma_find_region_containing ((ompi_osc_rdma_region_t *) module->state->regions, 0, region_count - 1,
206+
aligned_base, aligned_bound, module->region_size, &region_index);
132207
if (NULL != region) {
133-
++module->dynamic_handles[region_index].refcnt;
208+
/* validates that the region does not overlap with an existing region even if they are on the same page */
209+
ret = ompi_osc_rdma_add_attachment (module->dynamic_handles[region_index], (intptr_t) base, len);
134210
OPAL_THREAD_UNLOCK(&module->lock);
211+
if (OMPI_SUCCESS != ret) {
212+
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "Add attachment failed with code %d", ret);
213+
}
135214
/* no need to invalidate remote caches */
136-
return OMPI_SUCCESS;
215+
return ret;
137216
}
138217

139218
/* region is in flux */
@@ -144,45 +223,50 @@ int ompi_osc_rdma_attach (struct ompi_win_t *win, void *base, size_t len)
144223

145224
/* do a binary seach for where the region should be inserted */
146225
if (region_count) {
147-
region = find_insertion_point ((ompi_osc_rdma_region_t *) module->state->regions, 0, region_count - 1, (intptr_t) base,
148-
module->region_size, &region_index);
226+
region = find_insertion_point ((ompi_osc_rdma_region_t *) module->state->regions, 0, region_count - 1,
227+
(intptr_t) base, module->region_size, &region_index);
149228

150229
if (region_index < region_count) {
151-
memmove ((void *) ((intptr_t) region + module->region_size), region, (region_count - region_index) * module->region_size);
152-
153-
if (module->selected_btl->btl_register_mem) {
154-
memmove (module->dynamic_handles + region_index + 1, module->dynamic_handles + region_index,
155-
(region_count - region_index) * sizeof (module->dynamic_handles[0]));
156-
}
230+
memmove ((void *) ((intptr_t) region + module->region_size), region,
231+
(region_count - region_index) * module->region_size);
232+
memmove (module->dynamic_handles + region_index + 1, module->dynamic_handles + region_index,
233+
(region_count - region_index) * sizeof (module->dynamic_handles[0]));
157234
}
158235
} else {
159236
region_index = 0;
160237
region = (ompi_osc_rdma_region_t *) module->state->regions;
161238
}
162239

163-
region->base = (intptr_t) base;
164-
region->len = len;
240+
region->base = aligned_base;
241+
region->len = aligned_len;
165242

166-
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "attaching dynamic memory region {%p, %p} at index %d",
167-
base, (void *)((intptr_t) base + len), region_index);
243+
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "attaching dynamic memory region {%p, %p} aligned {%p, %p}, at index %d",
244+
base, (void *) bound, (void *) aligned_base, (void *) aligned_bound, region_index);
245+
246+
/* add RDMA region handle to track this region */
247+
rdma_region_handle = OBJ_NEW(ompi_osc_rdma_handle_t);
248+
assert (NULL != rdma_region_handle);
168249

169250
if (module->selected_btl->btl_register_mem) {
170251
mca_btl_base_registration_handle_t *handle;
171252

172-
ret = ompi_osc_rdma_register (module, MCA_BTL_ENDPOINT_ANY, (void *) region->base, region->len, MCA_BTL_REG_FLAG_ACCESS_ANY,
173-
&handle);
253+
ret = ompi_osc_rdma_register (module, MCA_BTL_ENDPOINT_ANY, (void *) region->base, region->len,
254+
MCA_BTL_REG_FLAG_ACCESS_ANY, &handle);
174255
if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
175256
OPAL_THREAD_UNLOCK(&module->lock);
257+
OBJ_RELEASE(rdma_region_handle);
176258
return OMPI_ERR_RMA_ATTACH;
177259
}
178260

179261
memcpy (region->btl_handle_data, handle, module->selected_btl->btl_registration_handle_size);
180-
module->dynamic_handles[region_index].btl_handle = handle;
262+
rdma_region_handle->btl_handle = handle;
181263
} else {
182-
module->dynamic_handles[region_index].btl_handle = NULL;
264+
rdma_region_handle->btl_handle = NULL;
183265
}
184266

185-
module->dynamic_handles[region_index].refcnt = 1;
267+
assert(OMPI_SUCCESS == ompi_osc_rdma_add_attachment (rdma_region_handle, (intptr_t) base, len));
268+
269+
module->dynamic_handles[region_index] = rdma_region_handle;
186270

187271
#if OPAL_ENABLE_DEBUG
188272
for (int i = 0 ; i < region_count + 1 ; ++i) {
@@ -211,34 +295,46 @@ int ompi_osc_rdma_detach (struct ompi_win_t *win, const void *base)
211295
ompi_osc_rdma_module_t *module = GET_MODULE(win);
212296
const int my_rank = ompi_comm_rank (module->comm);
213297
ompi_osc_rdma_peer_dynamic_t *my_peer = (ompi_osc_rdma_peer_dynamic_t *) ompi_osc_rdma_module_peer (module, my_rank);
298+
ompi_osc_rdma_handle_t *rdma_region_handle;
214299
osc_rdma_counter_t region_count, region_id;
215300
ompi_osc_rdma_region_t *region;
216-
int region_index;
301+
void *bound;
302+
int start_index = INT_MAX, region_index;
217303

218304
if (module->flavor != MPI_WIN_FLAVOR_DYNAMIC) {
219305
return OMPI_ERR_WIN;
220306
}
221307

222308
OPAL_THREAD_LOCK(&module->lock);
223309

224-
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "detach: %s, %p", win->w_name, base);
225-
226310
/* the upper 4 bytes of the region count are an instance counter */
227311
region_count = module->state->region_count & 0xffffffffL;
228312
region_id = module->state->region_count >> 32;
229313

230-
region = ompi_osc_rdma_find_region_containing ((ompi_osc_rdma_region_t *) module->state->regions, 0,
231-
region_count - 1, (intptr_t) base, (intptr_t) base + 1,
232-
module->region_size, &region_index);
233-
if (NULL == region) {
234-
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "could not find dynamic memory region starting at %p", base);
235-
OPAL_THREAD_UNLOCK(&module->lock);
236-
return OMPI_ERROR;
314+
/* look up the associated region */
315+
for (region_index = 0 ; region_index < region_count ; ++region_index) {
316+
rdma_region_handle = module->dynamic_handles[region_index];
317+
region = (ompi_osc_rdma_region_t *) ((intptr_t) module->state->regions + region_index * module->region_size);
318+
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "checking attachments at index %d {.base=%p, len=%lu} for attachment %p"
319+
", region handle=%p", region_index, (void *) region->base, region->len, base, rdma_region_handle);
320+
321+
if (region->base > (uintptr_t) base || (region->base + region->len) < (uintptr_t) base) {
322+
continue;
323+
}
324+
325+
if (OPAL_SUCCESS == ompi_osc_rdma_remove_attachment (rdma_region_handle, (intptr_t) base)) {
326+
break;
327+
}
237328
}
238329

239-
if (--module->dynamic_handles[region_index].refcnt > 0) {
330+
if (region_index == region_count) {
331+
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "could not find dynamic memory attachment for %p", base);
240332
OPAL_THREAD_UNLOCK(&module->lock);
241-
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "detach complete");
333+
return OMPI_ERR_BASE;
334+
}
335+
336+
if (!opal_list_is_empty (&rdma_region_handle->attachments)) {
337+
/* another region is referencing this attachment */
242338
return OMPI_SUCCESS;
243339
}
244340

@@ -249,21 +345,21 @@ int ompi_osc_rdma_detach (struct ompi_win_t *win, const void *base)
249345
base, (void *)((intptr_t) base + region->len), region_index);
250346

251347
if (module->selected_btl->btl_register_mem) {
252-
ompi_osc_rdma_deregister (module, module->dynamic_handles[region_index].btl_handle);
348+
ompi_osc_rdma_deregister (module, rdma_region_handle->btl_handle);
253349

254-
if (region_index < region_count - 1) {
255-
memmove (module->dynamic_handles + region_index, module->dynamic_handles + region_index + 1,
256-
(region_count - region_index - 1) * sizeof (void *));
257-
}
258-
259-
memset (module->dynamic_handles + region_count - 1, 0, sizeof (module->dynamic_handles[0]));
260350
}
261351

262352
if (region_index < region_count - 1) {
353+
size_t end_count = region_count - region_index - 1;
354+
memmove (module->dynamic_handles + region_index, module->dynamic_handles + region_index + 1,
355+
end_count * sizeof (module->dynamic_handles[0]));
263356
memmove (region, (void *)((intptr_t) region + module->region_size),
264-
(region_count - region_index - 1) * module->region_size);;
357+
end_count * module->region_size);
265358
}
266359

360+
OBJ_RELEASE(rdma_region_handle);
361+
module->dynamic_handles[region_count - 1] = NULL;
362+
267363
module->state->region_count = ((region_id + 1) << 32) | (region_count - 1);
268364

269365
ompi_osc_rdma_lock_release_exclusive (module, &my_peer->super, offsetof (ompi_osc_rdma_state_t, regions_lock));

0 commit comments

Comments
 (0)