22/*
33 * Copyright (c) 2014-2016 Los Alamos National Security, LLC. All rights
44 * reserved.
5+ * Copyright (c) 2020 Google, LLC. All rights reserved.
56 * $COPYRIGHT$
67 *
78 * Additional copyrights may follow
1617
1718#include "opal/util/sys_limits.h"
1819
20+ static void ompi_osc_rdma_handle_init (ompi_osc_rdma_handle_t * rdma_handle )
21+ {
22+ rdma_handle -> btl_handle = NULL ;
23+ OBJ_CONSTRUCT (& rdma_handle -> attachments , opal_list_t );
24+ }
25+
26+ static void ompi_osc_rdma_handle_fini (ompi_osc_rdma_handle_t * rdma_handle )
27+ {
28+ OPAL_LIST_DESTRUCT (& rdma_handle -> attachments );
29+ }
30+
31+ OBJ_CLASS_INSTANCE (ompi_osc_rdma_handle_t , opal_object_t , ompi_osc_rdma_handle_init ,
32+ ompi_osc_rdma_handle_fini );
33+
34+ OBJ_CLASS_INSTANCE (ompi_osc_rdma_attachment_t , opal_list_item_t , NULL , NULL );
35+
1936/**
2037 * ompi_osc_rdma_find_region_containing:
2138 *
@@ -48,13 +65,16 @@ static inline ompi_osc_rdma_region_t *ompi_osc_rdma_find_region_containing (ompi
4865
4966 region_bound = (intptr_t ) (region -> base + region -> len );
5067
51- OSC_RDMA_VERBOSE (MCA_BASE_VERBOSE_DEBUG , "checking memory region %p-%p against %p-%p (index %d) (min_index = %d, max_index = %d)" ,
52- (void * ) base , (void * ) bound , (void * ) region -> base , ( void * )( region -> base + region -> len ), mid_index ,
53- min_index , max_index );
68+ OSC_RDMA_VERBOSE (MCA_BASE_VERBOSE_DEBUG , "checking memory region %p-%p against %p-%p (index %d) (min_index = %d, "
69+ "max_index = %d)" , (void * ) base , (void * ) bound , (void * ) region -> base ,
70+ ( void * )( region -> base + region -> len ), mid_index , min_index , max_index );
5471
5572 if (region -> base > base ) {
56- return ompi_osc_rdma_find_region_containing (regions , min_index , mid_index - 1 , base , bound , region_size , region_index );
57- } else if (bound <= region_bound ) {
73+ return ompi_osc_rdma_find_region_containing (regions , min_index , mid_index - 1 , base , bound , region_size ,
74+ region_index );
75+ }
76+
77+ if (bound <= region_bound ) {
5878 if (region_index ) {
5979 * region_index = mid_index ;
6080 }
@@ -66,24 +86,76 @@ static inline ompi_osc_rdma_region_t *ompi_osc_rdma_find_region_containing (ompi
6686}
6787
6888/* binary search for insertion point */
69- static ompi_osc_rdma_region_t * find_insertion_point (ompi_osc_rdma_region_t * regions , int min_index , int max_index , intptr_t base ,
70- size_t region_size , int * region_index )
89+ static ompi_osc_rdma_region_t * find_insertion_point (ompi_osc_rdma_region_t * regions , int min_index , int max_index ,
90+ intptr_t base , size_t region_size , int * region_index )
7191{
7292 int mid_index = (max_index + min_index ) >> 1 ;
7393 ompi_osc_rdma_region_t * region = (ompi_osc_rdma_region_t * )((intptr_t ) regions + mid_index * region_size );
7494
75- OSC_RDMA_VERBOSE (MCA_BASE_VERBOSE_TRACE , "find_insertion_point (%d, %d, %lx, %lu)\n" , min_index , max_index , base , region_size );
95+ OSC_RDMA_VERBOSE (MCA_BASE_VERBOSE_TRACE , "find_insertion_point (%d, %d, %lx, %lu)\n" , min_index , max_index , base ,
96+ region_size );
7697
7798 if (max_index < min_index ) {
7899 * region_index = min_index ;
79100 return (ompi_osc_rdma_region_t * )((intptr_t ) regions + min_index * region_size );
80101 }
81102
82- if (region -> base > base ) {
103+ if (region -> base > base || ( region -> base == base && region -> len > region_size ) ) {
83104 return find_insertion_point (regions , min_index , mid_index - 1 , base , region_size , region_index );
84- } else {
85- return find_insertion_point (regions , mid_index + 1 , max_index , base , region_size , region_index );
86105 }
106+
107+ return find_insertion_point (regions , mid_index + 1 , max_index , base , region_size , region_index );
108+ }
109+
110+ static bool ompi_osc_rdma_find_conflicting_attachment (ompi_osc_rdma_handle_t * handle , intptr_t base , intptr_t bound )
111+ {
112+ ompi_osc_rdma_attachment_t * attachment ;
113+
114+ OPAL_LIST_FOREACH (attachment , & handle -> attachments , ompi_osc_rdma_attachment_t ) {
115+ intptr_t region_bound = attachment -> base + attachment -> len ;
116+ if (base >= attachment -> base && base < region_bound ||
117+ bound > attachment -> base && bound <= region_bound ) {
118+ OSC_RDMA_VERBOSE (MCA_BASE_VERBOSE_TRACE , "existing region {%p, %p} overlaps region {%p, %p}" ,
119+ (void * ) attachment -> base , (void * ) region_bound , (void * ) base , (void * ) bound );
120+ return true;
121+ }
122+ }
123+
124+ return false;
125+ }
126+
127+ static int ompi_osc_rdma_add_attachment (ompi_osc_rdma_handle_t * handle , intptr_t base , size_t len )
128+ {
129+ ompi_osc_rdma_attachment_t * attachment = OBJ_NEW (ompi_osc_rdma_attachment_t );
130+ assert (NULL != attachment );
131+
132+ if (ompi_osc_rdma_find_conflicting_attachment (handle , base , base + len )) {
133+ return OMPI_ERR_RMA_ATTACH ;
134+ }
135+
136+ attachment -> base = base ;
137+ attachment -> len = len ;
138+
139+ opal_list_append (& handle -> attachments , & attachment -> super );
140+
141+ return OMPI_SUCCESS ;
142+ }
143+
144+ static int ompi_osc_rdma_remove_attachment (ompi_osc_rdma_handle_t * handle , intptr_t base )
145+ {
146+ ompi_osc_rdma_attachment_t * attachment ;
147+
148+ OPAL_LIST_FOREACH (attachment , & handle -> attachments , ompi_osc_rdma_attachment_t ) {
149+ OSC_RDMA_VERBOSE (MCA_BASE_VERBOSE_TRACE , "checking attachment %p against %p" ,
150+ (void * ) attachment -> base , (void * ) base );
151+ if (attachment -> base == (intptr_t ) base ) {
152+ opal_list_remove_item (& handle -> attachments , & attachment -> super );
153+ OBJ_RELEASE (attachment );
154+ return OMPI_SUCCESS ;
155+ }
156+ }
157+
158+ return OMPI_ERR_NOT_FOUND ;
87159}
88160
89161int ompi_osc_rdma_attach (struct ompi_win_t * win , void * base , size_t len )
@@ -92,12 +164,13 @@ int ompi_osc_rdma_attach (struct ompi_win_t *win, void *base, size_t len)
92164 const int my_rank = ompi_comm_rank (module -> comm );
93165 ompi_osc_rdma_peer_t * my_peer = ompi_osc_rdma_module_peer (module , my_rank );
94166 ompi_osc_rdma_region_t * region ;
167+ ompi_osc_rdma_handle_t * rdma_region_handle ;
95168 osc_rdma_counter_t region_count ;
96169 osc_rdma_counter_t region_id ;
97- void * bound ;
170+ intptr_t bound , aligned_base , aligned_bound ;
98171 intptr_t page_size = opal_getpagesize ();
99- int region_index ;
100- int ret ;
172+ int region_index , ret ;
173+ size_t aligned_len ;
101174
102175 if (module -> flavor != MPI_WIN_FLAVOR_DYNAMIC ) {
103176 return OMPI_ERR_RMA_FLAVOR ;
@@ -117,23 +190,29 @@ int ompi_osc_rdma_attach (struct ompi_win_t *win, void *base, size_t len)
117190
118191 if (region_count == mca_osc_rdma_component .max_attach ) {
119192 OPAL_THREAD_UNLOCK (& module -> lock );
193+ OSC_RDMA_VERBOSE (MCA_BASE_VERBOSE_TRACE , "attach: could not attach. max attachment count reached." );
120194 return OMPI_ERR_RMA_ATTACH ;
121195 }
122196
123197 /* it is wasteful to register less than a page. this may allow the remote side to access more
124198 * memory but the MPI standard covers this with calling the calling behavior erroneous */
125- bound = (void * )OPAL_ALIGN ((intptr_t ) base + len , page_size , intptr_t );
126- base = (void * )((intptr_t ) base & ~(page_size - 1 ));
127- len = (size_t )((intptr_t ) bound - (intptr_t ) base );
128-
129- /* see if a matching region already exists */
130- region = ompi_osc_rdma_find_region_containing ((ompi_osc_rdma_region_t * ) module -> state -> regions , 0 , region_count - 1 , (intptr_t ) base ,
131- (intptr_t ) bound , module -> region_size , & region_index );
199+ bound = (intptr_t ) base + len ;
200+ aligned_bound = OPAL_ALIGN ((intptr_t ) base + len , page_size , intptr_t );
201+ aligned_base = (intptr_t ) base & ~(page_size - 1 );
202+ aligned_len = (size_t )(aligned_bound - aligned_base );
203+
204+ /* see if a registered region already exists */
205+ region = ompi_osc_rdma_find_region_containing ((ompi_osc_rdma_region_t * ) module -> state -> regions , 0 , region_count - 1 ,
206+ aligned_base , aligned_bound , module -> region_size , & region_index );
132207 if (NULL != region ) {
133- ++ module -> dynamic_handles [region_index ].refcnt ;
208+ /* validates that the region does not overlap with an existing region even if they are on the same page */
209+ ret = ompi_osc_rdma_add_attachment (module -> dynamic_handles [region_index ], (intptr_t ) base , len );
134210 OPAL_THREAD_UNLOCK (& module -> lock );
211+ if (OMPI_SUCCESS != ret ) {
212+ OSC_RDMA_VERBOSE (MCA_BASE_VERBOSE_TRACE , "Add attachment failed with code %d" , ret );
213+ }
135214 /* no need to invalidate remote caches */
136- return OMPI_SUCCESS ;
215+ return ret ;
137216 }
138217
139218 /* region is in flux */
@@ -144,45 +223,50 @@ int ompi_osc_rdma_attach (struct ompi_win_t *win, void *base, size_t len)
144223
145224 /* do a binary seach for where the region should be inserted */
146225 if (region_count ) {
147- region = find_insertion_point ((ompi_osc_rdma_region_t * ) module -> state -> regions , 0 , region_count - 1 , ( intptr_t ) base ,
148- module -> region_size , & region_index );
226+ region = find_insertion_point ((ompi_osc_rdma_region_t * ) module -> state -> regions , 0 , region_count - 1 ,
227+ ( intptr_t ) base , module -> region_size , & region_index );
149228
150229 if (region_index < region_count ) {
151- memmove ((void * ) ((intptr_t ) region + module -> region_size ), region , (region_count - region_index ) * module -> region_size );
152-
153- if (module -> selected_btl -> btl_register_mem ) {
154- memmove (module -> dynamic_handles + region_index + 1 , module -> dynamic_handles + region_index ,
155- (region_count - region_index ) * sizeof (module -> dynamic_handles [0 ]));
156- }
230+ memmove ((void * ) ((intptr_t ) region + module -> region_size ), region ,
231+ (region_count - region_index ) * module -> region_size );
232+ memmove (module -> dynamic_handles + region_index + 1 , module -> dynamic_handles + region_index ,
233+ (region_count - region_index ) * sizeof (module -> dynamic_handles [0 ]));
157234 }
158235 } else {
159236 region_index = 0 ;
160237 region = (ompi_osc_rdma_region_t * ) module -> state -> regions ;
161238 }
162239
163- region -> base = ( intptr_t ) base ;
164- region -> len = len ;
240+ region -> base = aligned_base ;
241+ region -> len = aligned_len ;
165242
166- OSC_RDMA_VERBOSE (MCA_BASE_VERBOSE_DEBUG , "attaching dynamic memory region {%p, %p} at index %d" ,
167- base , (void * )((intptr_t ) base + len ), region_index );
243+ OSC_RDMA_VERBOSE (MCA_BASE_VERBOSE_DEBUG , "attaching dynamic memory region {%p, %p} aligned {%p, %p}, at index %d" ,
244+ base , (void * ) bound , (void * ) aligned_base , (void * ) aligned_bound , region_index );
245+
246+ /* add RDMA region handle to track this region */
247+ rdma_region_handle = OBJ_NEW (ompi_osc_rdma_handle_t );
248+ assert (NULL != rdma_region_handle );
168249
169250 if (module -> selected_btl -> btl_register_mem ) {
170251 mca_btl_base_registration_handle_t * handle ;
171252
172- ret = ompi_osc_rdma_register (module , MCA_BTL_ENDPOINT_ANY , (void * ) region -> base , region -> len , MCA_BTL_REG_FLAG_ACCESS_ANY ,
173- & handle );
253+ ret = ompi_osc_rdma_register (module , MCA_BTL_ENDPOINT_ANY , (void * ) region -> base , region -> len ,
254+ MCA_BTL_REG_FLAG_ACCESS_ANY , & handle );
174255 if (OPAL_UNLIKELY (OMPI_SUCCESS != ret )) {
175256 OPAL_THREAD_UNLOCK (& module -> lock );
257+ OBJ_RELEASE (rdma_region_handle );
176258 return OMPI_ERR_RMA_ATTACH ;
177259 }
178260
179261 memcpy (region -> btl_handle_data , handle , module -> selected_btl -> btl_registration_handle_size );
180- module -> dynamic_handles [ region_index ]. btl_handle = handle ;
262+ rdma_region_handle -> btl_handle = handle ;
181263 } else {
182- module -> dynamic_handles [ region_index ]. btl_handle = NULL ;
264+ rdma_region_handle -> btl_handle = NULL ;
183265 }
184266
185- module -> dynamic_handles [region_index ].refcnt = 1 ;
267+ assert (OMPI_SUCCESS == ompi_osc_rdma_add_attachment (rdma_region_handle , (intptr_t ) base , len ));
268+
269+ module -> dynamic_handles [region_index ] = rdma_region_handle ;
186270
187271#if OPAL_ENABLE_DEBUG
188272 for (int i = 0 ; i < region_count + 1 ; ++ i ) {
@@ -211,34 +295,46 @@ int ompi_osc_rdma_detach (struct ompi_win_t *win, const void *base)
211295 ompi_osc_rdma_module_t * module = GET_MODULE (win );
212296 const int my_rank = ompi_comm_rank (module -> comm );
213297 ompi_osc_rdma_peer_dynamic_t * my_peer = (ompi_osc_rdma_peer_dynamic_t * ) ompi_osc_rdma_module_peer (module , my_rank );
298+ ompi_osc_rdma_handle_t * rdma_region_handle ;
214299 osc_rdma_counter_t region_count , region_id ;
215300 ompi_osc_rdma_region_t * region ;
216- int region_index ;
301+ void * bound ;
302+ int start_index = INT_MAX , region_index ;
217303
218304 if (module -> flavor != MPI_WIN_FLAVOR_DYNAMIC ) {
219305 return OMPI_ERR_WIN ;
220306 }
221307
222308 OPAL_THREAD_LOCK (& module -> lock );
223309
224- OSC_RDMA_VERBOSE (MCA_BASE_VERBOSE_TRACE , "detach: %s, %p" , win -> w_name , base );
225-
226310 /* the upper 4 bytes of the region count are an instance counter */
227311 region_count = module -> state -> region_count & 0xffffffffL ;
228312 region_id = module -> state -> region_count >> 32 ;
229313
230- region = ompi_osc_rdma_find_region_containing ((ompi_osc_rdma_region_t * ) module -> state -> regions , 0 ,
231- region_count - 1 , (intptr_t ) base , (intptr_t ) base + 1 ,
232- module -> region_size , & region_index );
233- if (NULL == region ) {
234- OSC_RDMA_VERBOSE (MCA_BASE_VERBOSE_INFO , "could not find dynamic memory region starting at %p" , base );
235- OPAL_THREAD_UNLOCK (& module -> lock );
236- return OMPI_ERROR ;
314+ /* look up the associated region */
315+ for (region_index = 0 ; region_index < region_count ; ++ region_index ) {
316+ rdma_region_handle = module -> dynamic_handles [region_index ];
317+ region = (ompi_osc_rdma_region_t * ) ((intptr_t ) module -> state -> regions + region_index * module -> region_size );
318+ OSC_RDMA_VERBOSE (MCA_BASE_VERBOSE_INFO , "checking attachments at index %d {.base=%p, len=%lu} for attachment %p"
319+ ", region handle=%p" , region_index , (void * ) region -> base , region -> len , base , rdma_region_handle );
320+
321+ if (region -> base > (uintptr_t ) base || (region -> base + region -> len ) < (uintptr_t ) base ) {
322+ continue ;
323+ }
324+
325+ if (OPAL_SUCCESS == ompi_osc_rdma_remove_attachment (rdma_region_handle , (intptr_t ) base )) {
326+ break ;
327+ }
237328 }
238329
239- if (-- module -> dynamic_handles [region_index ].refcnt > 0 ) {
330+ if (region_index == region_count ) {
331+ OSC_RDMA_VERBOSE (MCA_BASE_VERBOSE_INFO , "could not find dynamic memory attachment for %p" , base );
240332 OPAL_THREAD_UNLOCK (& module -> lock );
241- OSC_RDMA_VERBOSE (MCA_BASE_VERBOSE_TRACE , "detach complete" );
333+ return OMPI_ERR_BASE ;
334+ }
335+
336+ if (!opal_list_is_empty (& rdma_region_handle -> attachments )) {
337+ /* another region is referencing this attachment */
242338 return OMPI_SUCCESS ;
243339 }
244340
@@ -249,21 +345,21 @@ int ompi_osc_rdma_detach (struct ompi_win_t *win, const void *base)
249345 base , (void * )((intptr_t ) base + region -> len ), region_index );
250346
251347 if (module -> selected_btl -> btl_register_mem ) {
252- ompi_osc_rdma_deregister (module , module -> dynamic_handles [ region_index ]. btl_handle );
348+ ompi_osc_rdma_deregister (module , rdma_region_handle -> btl_handle );
253349
254- if (region_index < region_count - 1 ) {
255- memmove (module -> dynamic_handles + region_index , module -> dynamic_handles + region_index + 1 ,
256- (region_count - region_index - 1 ) * sizeof (void * ));
257- }
258-
259- memset (module -> dynamic_handles + region_count - 1 , 0 , sizeof (module -> dynamic_handles [0 ]));
260350 }
261351
262352 if (region_index < region_count - 1 ) {
353+ size_t end_count = region_count - region_index - 1 ;
354+ memmove (module -> dynamic_handles + region_index , module -> dynamic_handles + region_index + 1 ,
355+ end_count * sizeof (module -> dynamic_handles [0 ]));
263356 memmove (region , (void * )((intptr_t ) region + module -> region_size ),
264- ( region_count - region_index - 1 ) * module -> region_size ); ;
357+ end_count * module -> region_size );
265358 }
266359
360+ OBJ_RELEASE (rdma_region_handle );
361+ module -> dynamic_handles [region_count - 1 ] = NULL ;
362+
267363 module -> state -> region_count = ((region_id + 1 ) << 32 ) | (region_count - 1 );
268364
269365 ompi_osc_rdma_lock_release_exclusive (module , & my_peer -> super , offsetof (ompi_osc_rdma_state_t , regions_lock ));
0 commit comments