Skip to content

Commit

Permalink
rcache: fix deadlock in multi-threaded environments
Browse files Browse the repository at this point in the history
This commit fixes several bugs in the registration cache code:

 - Fix a programming error in the grdma invalidation function that can
   cause an infinite loop if more than 100 registrations are
   associated with a munmapped region. This happens because the
   mca_rcache_base_vma_find_all function returns the same 100
   registrations on each call. This has been fixed by adding an
   iterate function to the vma tree interface.

 - Always obtain the vma lock when needed. This is required because
   there may be other threads in the system even if
   opal_using_threads() is false. Additionally, since it is safe to do
   so (the vma lock is recursive) the vma interface has been made
   thread safe.

 - Avoid calling free() while holding a lock. This avoids race
   conditions with locks held outside the Open MPI code.

Fixes #1654.

Signed-off-by: Nathan Hjelm <hjelmn@lanl.gov>
  • Loading branch information
hjelmn committed May 17, 2016
1 parent 4e21933 commit ab8ed17
Show file tree
Hide file tree
Showing 7 changed files with 345 additions and 202 deletions.
31 changes: 16 additions & 15 deletions opal/mca/btl/vader/btl_vader_module.c
Original file line number Diff line number Diff line change
Expand Up @@ -539,6 +539,17 @@ static void mca_btl_vader_endpoint_constructor (mca_btl_vader_endpoint_t *ep)
ep->fifo = NULL;
}

#if OPAL_BTL_VADER_HAVE_XPMEM
static int mca_btl_vader_endpoint_rcache_cleanup (mca_rcache_base_registration_t *reg, void *ctx)
{
mca_rcache_base_vma_module_t *vma_module = (mca_rcache_base_vma_module_t *) ctx;
/* otherwise dereg will fail on assert */
reg->ref_count = 0;
(void) mca_rcache_base_vma_delete (vma_module, reg);
return OPAL_SUCCESS;
}
#endif

static void mca_btl_vader_endpoint_destructor (mca_btl_vader_endpoint_t *ep)
{
OBJ_DESTRUCT(&ep->pending_frags);
Expand All @@ -548,21 +559,11 @@ static void mca_btl_vader_endpoint_destructor (mca_btl_vader_endpoint_t *ep)
if (MCA_BTL_VADER_XPMEM == mca_btl_vader_component.single_copy_mechanism) {
if (ep->segment_data.xpmem.vma_module) {
/* clean out the registration cache */
const int nregs = 100;
mca_rcache_base_registration_t *regs[nregs];
int reg_cnt;

do {
reg_cnt = mca_rcache_base_vma_find_all (ep->segment_data.xpmem.vma_module,
0, (size_t) -1, regs, nregs);
for (int i = 0 ; i < reg_cnt ; ++i) {
/* otherwise dereg will fail on assert */
regs[i]->ref_count = 0;
OBJ_RELEASE(regs[i]);
}
} while (reg_cnt == nregs);

ep->segment_data.xpmem.vma_module = NULL;
(void) mca_rcache_base_vma_iterate (ep->segment_data.xpmem.vma_module,
NULL, (size_t) -1,
mca_btl_vader_endpoint_rcache_cleanup,
(void *) ep->segment_data.xpmem.vma_module);
OBJ_RELEASE(ep->segment_data.xpmem.vma_module);
}

if (ep->segment_base) {
Expand Down
10 changes: 9 additions & 1 deletion opal/mca/rcache/base/rcache_base_vma.c
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
* Copyright (c) 2009-2013 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2009 IBM Corporation. All rights reserved.
* Copyright (c) 2013 NVIDIA Corporation. All rights reserved.
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
* Copyright (c) 2015-2016 Los Alamos National Security, LLC. All rights
* reserved.
*
* $COPYRIGHT$
Expand Down Expand Up @@ -144,6 +144,14 @@ int mca_rcache_base_vma_delete (mca_rcache_base_vma_module_t *vma_module,
return mca_rcache_base_vma_tree_delete (vma_module, reg);
}

int mca_rcache_base_vma_iterate (mca_rcache_base_vma_module_t *vma_module,
unsigned char *base, size_t size,
int (*callback_fn) (struct mca_rcache_base_registration_t *, void *),
void *ctx)
{
return mca_rcache_base_vma_tree_iterate (vma_module, base, size, callback_fn, ctx);
}

void mca_rcache_base_vma_dump_range (mca_rcache_base_vma_module_t *vma_module,
unsigned char *base, size_t size, char *msg)
{
Expand Down
23 changes: 22 additions & 1 deletion opal/mca/rcache/base/rcache_base_vma.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
*
* Copyright (c) 2006 Voltaire. All rights reserved.
* Copyright (c) 2009 IBM Corporation. All rights reserved.
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
* Copyright (c) 2015-2016 Los Alamos National Security, LLC. All rights
* reserved.
*
* $COPYRIGHT$
Expand All @@ -34,6 +34,7 @@
#include "opal_config.h"
#include "opal/class/opal_list.h"
#include "opal/class/opal_rb_tree.h"
#include "opal/class/opal_lifo.h"

BEGIN_C_DECLS

Expand Down Expand Up @@ -69,6 +70,26 @@ int mca_rcache_base_vma_delete (mca_rcache_base_vma_module_t *vma_module,
void mca_rcache_base_vma_dump_range (mca_rcache_base_vma_module_t *vma_module,
unsigned char *base, size_t size, char *msg);

/**
* Iterate over registrations in the specified range.
*
* @param[in] vma_module vma tree
* @param[in] base base address of region
* @param[in] size size of region
* @param[in] callback_fn function to call for each matching registration handle
* @param[in] ctx callback context
*
* The callback will be made with the vma lock held. This is a recursive lock so
* it is still safe to call any vma functions on this vma_module. Keep in mind it
* is only safe to call mca_rcache_base_vma_delete() on the supplied registration
* from the callback. The iteration will terminate if the callback returns anything
* other than OPAL_SUCCESS.
*/
int mca_rcache_base_vma_iterate (mca_rcache_base_vma_module_t *vma_module,
unsigned char *base, size_t size,
int (*callback_fn) (struct mca_rcache_base_registration_t *, void *),
void *ctx);

END_C_DECLS

#endif /* MCA_RCACHE_BASE_VMA_H */
115 changes: 101 additions & 14 deletions opal/mca/rcache/base/rcache_base_vma_tree.c
Original file line number Diff line number Diff line change
Expand Up @@ -258,9 +258,12 @@ mca_rcache_base_registration_t *mca_rcache_base_vma_tree_find (mca_rcache_base_v
mca_rcache_base_vma_item_t *vma;
mca_rcache_base_vma_reg_list_item_t *item;

opal_mutex_lock (&vma_module->vma_lock);

vma = (mca_rcache_base_vma_item_t *) opal_rb_tree_find_with (&vma_module->rb_tree, base,
mca_rcache_base_vma_tree_node_compare_search);
if (!vma) {
opal_mutex_unlock (&vma_module->vma_lock);
return NULL;
}

Expand All @@ -269,12 +272,18 @@ mca_rcache_base_registration_t *mca_rcache_base_vma_tree_find (mca_rcache_base_v
continue;
}

if(item->reg->bound >= bound)
if(item->reg->bound >= bound) {
opal_mutex_unlock (&vma_module->vma_lock);
return item->reg;
if(!(item->reg->flags & MCA_RCACHE_FLAGS_PERSIST))
}

if(!(item->reg->flags & MCA_RCACHE_FLAGS_PERSIST)) {
break;
}
}

opal_mutex_unlock (&vma_module->vma_lock);

return NULL;
}

Expand All @@ -299,6 +308,8 @@ int mca_rcache_base_vma_tree_find_all (mca_rcache_base_vma_module_t *vma_module,
if(opal_list_get_size(&vma_module->vma_list) == 0)
return cnt;

opal_mutex_lock (&vma_module->vma_lock);

do {
mca_rcache_base_vma_item_t *vma;
mca_rcache_base_vma_reg_list_item_t *vma_item;
Expand All @@ -316,25 +327,23 @@ int mca_rcache_base_vma_tree_find_all (mca_rcache_base_vma_module_t *vma_module,
}

OPAL_LIST_FOREACH(vma_item, &vma->reg_list, mca_rcache_base_vma_reg_list_item_t) {
if ((vma_item->reg->flags & MCA_RCACHE_FLAGS_INVALID) ||
if (vma_item->reg->flags & MCA_RCACHE_FLAGS_INVALID ||
is_reg_in_array (regs, cnt, vma_item->reg)) {
continue;
}
regs[cnt++] = vma_item->reg;
if (cnt == reg_cnt) {
opal_mutex_unlock (&vma_module->vma_lock);
return cnt; /* no space left in the provided array */
}
}

base = (unsigned char *)vma->end + 1;
} while(bound >= base);
} while (bound >= base);

return cnt;
}
opal_mutex_unlock (&vma_module->vma_lock);

static inline int mca_rcache_base_vma_can_insert (mca_rcache_base_vma_module_t *vma_module, size_t nbytes, size_t limit)
{
return (0 == limit || vma_module->reg_cur_cache_size + nbytes <= limit);
return cnt;
}

static inline void mca_rcache_base_vma_update_byte_count (mca_rcache_base_vma_module_t *vma_module,
Expand All @@ -343,12 +352,74 @@ static inline void mca_rcache_base_vma_update_byte_count (mca_rcache_base_vma_mo
vma_module->reg_cur_cache_size += nbytes;
}

int mca_rcache_base_vma_tree_iterate (mca_rcache_base_vma_module_t *vma_module, unsigned char *base,
size_t size, int (*callback_fn) (struct mca_rcache_base_registration_t *, void *),
void *ctx)
{
unsigned char *bound = base + size - 1;
mca_rcache_base_vma_item_t *vma;
int rc = OPAL_SUCCESS;

if (opal_list_get_size(&vma_module->vma_list) == 0) {
/* nothin to do */
return OPAL_SUCCESS;
}

opal_mutex_lock (&vma_module->vma_lock);

do {
mca_rcache_base_vma_reg_list_item_t *vma_item, *next;
vma = (mca_rcache_base_vma_item_t *) opal_rb_tree_find_with (&vma_module->rb_tree, base,
mca_rcache_base_vma_tree_node_compare_closest);

if (NULL == vma) {
/* base is bigger than any registered memory */
break;
}

if (base < (unsigned char *) vma->start) {
base = (unsigned char *) vma->start;
continue;
}

base = (unsigned char *)vma->end + 1;

/* all the registrations in the vma may be deleted by the callback so keep a
* reference until we are done with it. */
OBJ_RETAIN(vma);

OPAL_LIST_FOREACH_SAFE(vma_item, next, &vma->reg_list, mca_rcache_base_vma_reg_list_item_t) {
rc = callback_fn (vma_item->reg, ctx);
if (OPAL_SUCCESS != rc) {
break;
}
}

OBJ_RELEASE(vma);

if (OPAL_SUCCESS != rc) {
break;
}
} while (bound >= base);

opal_mutex_unlock (&vma_module->vma_lock);

return rc;
}

static inline int mca_rcache_base_vma_can_insert (mca_rcache_base_vma_module_t *vma_module, size_t nbytes, size_t limit)
{
return (0 == limit || vma_module->reg_cur_cache_size + nbytes <= limit);
}

int mca_rcache_base_vma_tree_insert (mca_rcache_base_vma_module_t *vma_module,
mca_rcache_base_registration_t *reg, size_t limit)
{
mca_rcache_base_vma_item_t *i;
uintptr_t begin = (uintptr_t)reg->base, end = (uintptr_t)reg->bound;

opal_mutex_lock (&vma_module->vma_lock);

i = (mca_rcache_base_vma_item_t *) opal_rb_tree_find_with (&vma_module->rb_tree,
(void *) begin, mca_rcache_base_vma_tree_node_compare_closest);

Expand All @@ -373,6 +444,7 @@ int mca_rcache_base_vma_tree_insert (mca_rcache_base_vma_module_t *vma_module,
opal_list_append(&vma_module->vma_list, &vma->super);
begin = vma->end + 1;
mca_rcache_base_vma_add_reg (vma, reg);
opal_mutex_unlock (&vma_module->vma_lock);
return OPAL_SUCCESS;
}

Expand Down Expand Up @@ -434,10 +506,14 @@ int mca_rcache_base_vma_tree_insert (mca_rcache_base_vma_module_t *vma_module,
i = (mca_rcache_base_vma_item_t *) opal_list_get_next (&i->super);
}

opal_mutex_unlock (&vma_module->vma_lock);

return OPAL_SUCCESS;

remove:
mca_rcache_base_vma_tree_delete (vma_module, reg);
opal_mutex_unlock (&vma_module->vma_lock);

return OPAL_ERR_TEMP_OUT_OF_RESOURCE;
}

Expand All @@ -453,17 +529,23 @@ int mca_rcache_base_vma_tree_delete (mca_rcache_base_vma_module_t *vma_module,
mca_rcache_base_registration_t *reg)
{
mca_rcache_base_vma_item_t *vma;
opal_list_t deleted_vmas;

opal_mutex_lock (&vma_module->vma_lock);

vma = (mca_rcache_base_vma_item_t *)
opal_rb_tree_find_with (&vma_module->rb_tree, reg->base,
mca_rcache_base_vma_tree_node_compare_search);

if (!vma) {
opal_mutex_unlock (&vma_module->vma_lock);
return OPAL_ERROR;
}

OBJ_CONSTRUCT(&deleted_vmas, opal_list_t);

while (vma != (mca_rcache_base_vma_item_t *) opal_list_get_end (&vma_module->vma_list)
&& vma->start <= (uintptr_t) reg->bound) {
&& vma->start <= (uintptr_t) reg->bound) {
mca_rcache_base_vma_remove_reg(vma, reg);

if(opal_list_is_empty(&vma->reg_list)) {
Expand All @@ -473,7 +555,7 @@ int mca_rcache_base_vma_tree_delete (mca_rcache_base_vma_module_t *vma_module,
mca_rcache_base_vma_update_byte_count (vma_module,
vma->start - vma->end - 1);
opal_list_remove_item (&vma_module->vma_list, &vma->super);
OBJ_RELEASE(vma);
opal_list_append (&deleted_vmas, &vma->super);
vma = next;
} else {
int merged;
Expand All @@ -491,7 +573,7 @@ int mca_rcache_base_vma_tree_delete (mca_rcache_base_vma_module_t *vma_module,
prev->end = vma->end;
opal_list_remove_item(&vma_module->vma_list, &vma->super);
opal_rb_tree_delete(&vma_module->rb_tree, vma);
OBJ_RELEASE(vma);
opal_list_append (&deleted_vmas, &vma->super);
vma = prev;
merged = 1;
}
Expand All @@ -505,7 +587,7 @@ int mca_rcache_base_vma_tree_delete (mca_rcache_base_vma_module_t *vma_module,
vma->end = next->end;
opal_list_remove_item(&vma_module->vma_list, &next->super);
opal_rb_tree_delete(&vma_module->rb_tree, next);
OBJ_RELEASE(next);
opal_list_append (&deleted_vmas, &next->super);
merged = 1;
}
} while (merged);
Expand All @@ -514,6 +596,11 @@ int mca_rcache_base_vma_tree_delete (mca_rcache_base_vma_module_t *vma_module,
}
}

opal_mutex_unlock (&vma_module->vma_lock);

/* actually free vmas now that the lock has been dropped */
OPAL_LIST_DESTRUCT(&deleted_vmas);

return 0;
}

Expand Down Expand Up @@ -558,7 +645,7 @@ void mca_rcache_base_vma_tree_dump_range (mca_rcache_base_vma_module_t *vma_modu
OPAL_LIST_FOREACH(vma_item, &vma->reg_list, mca_rcache_base_vma_reg_list_item_t) {
reg = vma_item->reg;
opal_output(0, " reg: base=%p, bound=%p, ref_count=%d, flags=0x%x",
reg->base, reg->bound, reg->ref_count, reg->flags);
(void *) reg->base, (void *) reg->bound, reg->ref_count, reg->flags);
}
base = (unsigned char *)vma->end + 1;
} while (bound >= base);
Expand Down
10 changes: 9 additions & 1 deletion opal/mca/rcache/base/rcache_base_vma_tree.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
* Copyright (c) 2009 IBM Corporation. All rights reserved.
*
* Copyright (c) 2013 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
* Copyright (c) 2015-2016 Los Alamos National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
*
Expand Down Expand Up @@ -106,4 +106,12 @@ void mca_rcache_base_vma_tree_dump_range (mca_rcache_base_vma_module_t *vma_modu
unsigned char *base, size_t size, char *msg);


/*
* Iterate over matching registration handles in the tree.
*/
int mca_rcache_base_vma_tree_iterate (mca_rcache_base_vma_module_t *vma_module,
unsigned char *base, size_t size,
int (*callback_fn) (struct mca_rcache_base_registration_t *, void *),
void *ctx);

#endif /* MCA_RCACHE_BASE_VMA_TREE_H */
2 changes: 1 addition & 1 deletion opal/mca/rcache/grdma/rcache_grdma.h
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ struct mca_rcache_grdma_cache_t {
opal_list_item_t super;
char *cache_name;
opal_list_t lru_list;
opal_list_t gc_list;
opal_lifo_t gc_lifo;
mca_rcache_base_vma_module_t *vma_module;
};
typedef struct mca_rcache_grdma_cache_t mca_rcache_grdma_cache_t;
Expand Down
Loading

0 comments on commit ab8ed17

Please sign in to comment.