From 44133593ca847690e9953a56ecb15324c3f46e91 Mon Sep 17 00:00:00 2001 From: Jianxin Xiong Date: Mon, 11 Dec 2017 18:39:39 -0800 Subject: [PATCH] prov/psm2: Fix a deadlock in connection cleanup handler Here is the deadlock scenario: #0 0x00007fed3a439495 in pthread_spin_lock () #1 0x00007fed37ad7cfd in fastlock_acquire () #2 0x00007fed37ad80a4 in psmx2_lock () #3 0x00007fed37ad8361 in psmx2_am_trx_ctxt_handler_ext () #4 0x00007fed37b084e7 in psmx2_am_trx_ctxt_handler_0 () #5 0x00007fed373c08c5 in self_am_short_request () #6 0x00007fed3739bf83 in __psm2_am_request_short () #7 0x00007fed37ad84ee in psmx2_trx_ctxt_disconnect_peers () A lock is already held in psmx2_trx_ctxt_disconnect_peers when psm2_am_request_short is called. While making progress inside psm2_am_request_short, execution is redirected to the AM handler due to the arrival of an incoming disconnection request. The AM handler tries to acquire the same lock, which is already held, and deadlocks. Fix this by not calling psm2_am_request_short while holding the lock. 
Signed-off-by: Jianxin Xiong --- prov/psm2/src/psmx2_domain.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/prov/psm2/src/psmx2_domain.c b/prov/psm2/src/psmx2_domain.c index d179577596f..c31f723f296 100644 --- a/prov/psm2/src/psmx2_domain.c +++ b/prov/psm2/src/psmx2_domain.c @@ -128,13 +128,21 @@ void psmx2_trx_ctxt_disconnect_peers(struct psmx2_trx_ctxt *trx_ctxt) { struct dlist_entry *item, *tmp; struct psmx2_epaddr_context *peer; + struct dlist_entry peer_list; psm2_amarg_t arg; arg.u32w0 = PSMX2_AM_REQ_TRX_CTXT_DISCONNECT; + /* use local peer_list to avoid blocking while holding the lock */ + dlist_init(&peer_list); psmx2_lock(&trx_ctxt->peer_lock, 2); dlist_foreach_safe(&trx_ctxt->peer_list, item, tmp) { dlist_remove(item); + dlist_insert_before(item, &peer_list); + } + psmx2_unlock(&trx_ctxt->peer_lock, 2); + + dlist_foreach_safe(&peer_list, item, tmp) { peer = container_of(item, struct psmx2_epaddr_context, entry); FI_INFO(&psmx2_prov, FI_LOG_CORE, "epaddr: %p\n", peer->epaddr); psm2_am_request_short(peer->epaddr, PSMX2_AM_TRX_CTXT_HANDLER, @@ -142,7 +150,6 @@ void psmx2_trx_ctxt_disconnect_peers(struct psmx2_trx_ctxt *trx_ctxt) psm2_epaddr_setctxt(peer->epaddr, NULL); free(peer); } - psmx2_unlock(&trx_ctxt->peer_lock, 2); } void psmx2_trx_ctxt_free(struct psmx2_trx_ctxt *trx_ctxt)