Skip to content

Commit d3fa1bb

Browse files
committed
rcache/grdma: try to prevent erroneous free error messages
It is possible to have parts of an in-use registered region be passed to munmap or madvise. This does not necessarily mean the user has made an error but does mean the entire region should be invalidated. This commit checks that the munmap or madvise base matches the beginning of the cached region. If it does and the region is in-use then we print an error. There will certainly be false-negatives where a user unmaps something that really is in-use but that is preferrable to a false-positive. References #4509 Signed-off-by: Nathan Hjelm <hjelmn@lanl.gov>
1 parent 1e630f4 commit d3fa1bb

File tree

1 file changed

+17
-7
lines changed

1 file changed

+17
-7
lines changed

opal/mca/rcache/grdma/rcache_grdma_module.c

Lines changed: 17 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
* Copyright (c) 2006 Voltaire. All rights reserved.
1515
* Copyright (c) 2007 Mellanox Technologies. All rights reserved.
1616
* Copyright (c) 2010 IBM Corporation. All rights reserved.
17-
* Copyright (c) 2011-2016 Los Alamos National Security, LLC. All rights
17+
* Copyright (c) 2011-2017 Los Alamos National Security, LLC. All rights
1818
* reserved.
1919
* Copyright (c) 2013 NVIDIA Corporation. All rights reserved.
2020
* Copyright (c) 2016 Research Organization for Information Science
@@ -422,20 +422,29 @@ static int mca_rcache_grdma_deregister (mca_rcache_base_module_t *rcache,
422422
return rc;
423423
}
424424

425+
struct gc_add_args_t {
426+
void *base;
427+
size_t size;
428+
};
429+
typedef struct gc_add_args_t gc_add_args_t;
430+
425431
static int gc_add (mca_rcache_base_registration_t *grdma_reg, void *ctx)
426432
{
427433
mca_rcache_grdma_module_t *rcache_grdma = (mca_rcache_grdma_module_t *) grdma_reg->rcache;
428-
429-
/* unused */
430-
(void) ctx;
434+
gc_add_args_t *args = (gc_add_args_t *) ctx;
431435

432436
if (grdma_reg->flags & MCA_RCACHE_FLAGS_INVALID) {
433437
/* nothing more to do */
434438
return OPAL_SUCCESS;
435439
}
436440

437-
if (grdma_reg->ref_count) {
438-
/* attempted to remove an active registration */
441+
if (grdma_reg->ref_count && grdma_reg->base == args->base) {
442+
/* attempted to remove an active registration. to handle cases where part of
443+
* an active registration has been unmapped we check if the bases match. this
444+
* *hopefully* will suppress erroneously emitted errors. if we can't suppress
445+
* the erroneous error in all cases then this check and return should be removed
446+
* entirely. we are not required to give an error for a user freeing a buffer
447+
* that is in-use by MPI. Its just a nice to have. */
439448
return OPAL_ERROR;
440449
}
441450

@@ -457,7 +466,8 @@ static int mca_rcache_grdma_invalidate_range (mca_rcache_base_module_t *rcache,
457466
void *base, size_t size)
458467
{
459468
mca_rcache_grdma_module_t *rcache_grdma = (mca_rcache_grdma_module_t *) rcache;
460-
return mca_rcache_base_vma_iterate (rcache_grdma->cache->vma_module, base, size, gc_add, NULL);
469+
gc_add_args_t args = {.base = base, .size = size};
470+
return mca_rcache_base_vma_iterate (rcache_grdma->cache->vma_module, base, size, gc_add, &args);
461471
}
462472

463473
/* Make sure this registration request is not stale. In other words, ensure

0 commit comments

Comments
 (0)