From fc8256543677652962a22d466c2b80144a672158 Mon Sep 17 00:00:00 2001 From: Yatin Mahajan Date: Thu, 7 Jul 2022 10:02:56 -0600 Subject: [PATCH 1/2] CORTX-33564: Fixed panic in rpc_session_cancelled() due to session state Problem: While session was getting canceled some other thread is trying to terminate that session and moves session state to M0_RPC_SESSION_TERMINATING, which lead to hit assert M0_POST(session->s_sm.sm_state == M0_RPC_SESSION_IDLE) in rpc_session_cancel(). Solution: It is possible that some other thread is trying to terminate the same session while session is getting cancelled, only the IDLE/BUSY sessions are allowed to cancel. Updated pre check to return from m0_rpc_cancel instead of panic/assert. Also replaced M0_POST()/assert with proper debug log. Signed-off-by: Yatin Mahajan --- rpc/session.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/rpc/session.c b/rpc/session.c index 5ba2ef23bec..ddba90cb21d 100644 --- a/rpc/session.c +++ b/rpc/session.c @@ -851,11 +851,14 @@ M0_INTERNAL void m0_rpc_session_cancel(struct m0_rpc_session *session) { struct m0_rpc_item *item; - M0_PRE(session->s_session_id != SESSION_ID_0); - M0_PRE(M0_IN(session_state(session), - (M0_RPC_SESSION_BUSY, M0_RPC_SESSION_IDLE))); - M0_ENTRY("session %p", session); + + M0_PRE(session->s_session_id != SESSION_ID_0); + if (!M0_IN(session_state(session), + (M0_RPC_SESSION_BUSY, M0_RPC_SESSION_IDLE))) { + M0_LEAVE("session %p", session); + return; + } m0_rpc_machine_lock(session->s_conn->c_rpc_machine); if (session->s_cancelled) goto leave_unlock; @@ -868,7 +871,11 @@ M0_INTERNAL void m0_rpc_session_cancel(struct m0_rpc_session *session) leave_unlock: m0_rpc_machine_unlock(session->s_conn->c_rpc_machine); M0_POST(pending_item_tlist_is_empty(&session->s_pending_cache)); - M0_POST(session->s_sm.sm_state == M0_RPC_SESSION_IDLE); + if (!M0_IN(session_state(session), + (M0_RPC_SESSION_BUSY, M0_RPC_SESSION_IDLE))) { + M0_LEAVE("session is already finalised %p", session); + return; + } M0_LEAVE("session %p", session); } From 2b01852da5b3ef8824422b1fa6bec497710cc89d Mon Sep 17 00:00:00 2001 From: Yatin Mahajan Date: Mon, 11 Jul 2022 01:35:05 -0600 Subject: [PATCH 2/2] CORTX-33564: Fixed panic in rpc_session_cancelled() due to session state Problem: While session was getting canceled some other thread is trying to terminate that session and moves session state to M0_RPC_SESSION_TERMINATING, which lead to hit assert M0_POST(session->s_sm.sm_state == M0_RPC_SESSION_IDLE) in rpc_session_cancel(). #5 in m0_arch_panic (c=c@entry=0x7f7276a91b40 <__pctx.14289>, ap=ap@entry=0x7f7268c05ce0) at lib/user_space/uassert.c:131 #6 in m0_panic (ctx=ctx@entry=0x7f7276a91b40 <__pctx.14289>) at lib/assert.c:52 #7 in m0_rpc_session_cancel (session=session@entry=0x56283c7c13d8) at rpc/session.c:863 #8 in m0_rpc_conn_sessions_cancel (conn=conn@entry=0x56283c7c1030) at rpc/conn.c:1333 #9 in rpc_conn__on_service_event_cb (clink=) at rpc/conn.c:1364 #10 in clink_signal (clink=clink@entry=0x56283c7c12c0) at lib/chan.c:135 #11 in chan_signal_nr (chan=chan@entry=0x56283c6a8768, nr=1) at lib/chan.c:154 #12 in m0_chan_broadcast (chan=chan@entry=0x56283c6a8768) at lib/chan.c:174 #13 in ha_state_accept (ignore_same_state=1, note=0x7f7268c06060, confc=0x56283816b028) at ha/note.c:18 Solution: It is possible that some other thread is trying to terminate the same session while session is getting cancelled, only the IDLE/BUSY sessions are allowed to cancel. Updated pre check to return from m0_rpc_cancel instead of panic/assert. Also replaced M0_POST()/assert with proper debug log. Signed-off-by: Yatin Mahajan --- rpc/session.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/rpc/session.c b/rpc/session.c index ddba90cb21d..fc60f8f4019 100644 --- a/rpc/session.c +++ b/rpc/session.c @@ -854,12 +854,13 @@ M0_INTERNAL void m0_rpc_session_cancel(struct m0_rpc_session *session) M0_ENTRY("session %p", session); M0_PRE(session->s_session_id != SESSION_ID_0); + m0_rpc_machine_lock(session->s_conn->c_rpc_machine); if (!M0_IN(session_state(session), (M0_RPC_SESSION_BUSY, M0_RPC_SESSION_IDLE))) { - M0_LEAVE("session %p", session); + M0_LEAVE("session %p state=%d", session, session_state(session)); + m0_rpc_machine_unlock(session->s_conn->c_rpc_machine); return; } - m0_rpc_machine_lock(session->s_conn->c_rpc_machine); if (session->s_cancelled) goto leave_unlock; session->s_cancelled = true; @@ -871,11 +872,6 @@ M0_INTERNAL void m0_rpc_session_cancel(struct m0_rpc_session *session) leave_unlock: m0_rpc_machine_unlock(session->s_conn->c_rpc_machine); M0_POST(pending_item_tlist_is_empty(&session->s_pending_cache)); - if (!M0_IN(session_state(session), - (M0_RPC_SESSION_BUSY, M0_RPC_SESSION_IDLE))) { - M0_LEAVE("session is already finalised %p", session); - return; - } M0_LEAVE("session %p", session); }