From d264188c9cfa5fecd0852a7aaa7b822853ec61c5 Mon Sep 17 00:00:00 2001 From: teor Date: Fri, 15 Jan 2021 16:23:19 +1000 Subject: [PATCH 1/4] Stop processing the request after failing an overloaded connection zebra-network's Connection expects that `fail_with` is only called once per connection, but the overload handling code continues to process the current request after an overload error, potentially leading to further failures. Closes #1599 --- zebra-network/src/peer/connection.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/zebra-network/src/peer/connection.rs b/zebra-network/src/peer/connection.rs index 36cdd06173c..d834ce877a6 100644 --- a/zebra-network/src/peer/connection.rs +++ b/zebra-network/src/peer/connection.rs @@ -840,6 +840,7 @@ where if self.svc.ready_and().await.is_err() { // Treat all service readiness errors as Overloaded self.fail_with(PeerError::Overloaded); + return; } let rsp = match self.svc.call(req).await { From a16c5613d76f97978f586c3377d051c08a64a9c0 Mon Sep 17 00:00:00 2001 From: teor Date: Fri, 15 Jan 2021 16:26:34 +1000 Subject: [PATCH 2/4] Stop panicking when fail_with is called twice on a connection We can't rule out the connection state changing between the state checks and any eventual failures, particularly in the presence of async code. So we turn this panic into a warning. --- zebra-network/src/peer/connection.rs | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/zebra-network/src/peer/connection.rs b/zebra-network/src/peer/connection.rs index d834ce877a6..f94090d2998 100644 --- a/zebra-network/src/peer/connection.rs +++ b/zebra-network/src/peer/connection.rs @@ -511,8 +511,32 @@ where .0 .lock() .expect("mutex should be unpoisoned"); - if guard.is_some() { - panic!("called fail_with on already-failed connection state"); + if let Some(original_error) = guard.clone() { + // A failed connection might experience further errors if we: + // 1. concurrently process two different messages + // 2. check for a failed state for the second message + // 3. fail the connection due to the first message + // 4. fail the connection due to the second message + // + // It's not clear: + // * if this is actually a bug, + // * how we can modify Zebra to avoid it. + // + // This warning can also happen due to these bugs: + // * we mark a connection as failed without using fail_with + // * we call fail_with without checking for a failed connection + // state + // + // See the original bug #1510, the initial fix #1531, and the later + // bug #1599. + warn!(?original_error, + new_error = ?e, + connection_state = ?self.state, + client_receiver = ?self.client_rx, + "calling fail_with on already-failed connection state: ignoring new error"); + // we don't need to clean up the connection, the original call to + // fail_with does that + return; } else { *guard = Some(e); } From 73f40caf4130a25d5e20749a0e28cb7875592ef5 Mon Sep 17 00:00:00 2001 From: teor Date: Fri, 15 Jan 2021 16:28:30 +1000 Subject: [PATCH 3/4] Improve logging for connection state errors --- zebra-network/src/peer/connection.rs | 42 ++++++++++++++++++++++------ 1 file changed, 33 insertions(+), 9 deletions(-) diff --git a/zebra-network/src/peer/connection.rs b/zebra-network/src/peer/connection.rs index f94090d2998..838f25d1d39 100644 --- a/zebra-network/src/peer/connection.rs +++ b/zebra-network/src/peer/connection.rs @@ -423,7 +423,11 @@ where State::AwaitingResponse { ref mut handler, .. } => span.in_scope(|| handler.process_message(peer_msg)), - _ => unreachable!(), + _ => unreachable!("unexpected state after AwaitingResponse: {:?}, peer_msg: {:?}, client_receiver: {:?}", + self.state, + peer_msg, + self.client_rx, + ), }; // If the message was not consumed, check whether it // should be handled as a request. @@ -444,7 +448,10 @@ where State::AwaitingRequest } pending @ State::AwaitingResponse { .. } => pending, - _ => unreachable!(), + _ => unreachable!( + "unexpected failed connection state while AwaitingResponse: client_receiver: {:?}", + self.client_rx + ), }; } } @@ -465,7 +472,10 @@ where let _ = tx.send(Err(e.into())); State::AwaitingRequest } - _ => unreachable!(), + _ => unreachable!( + "unexpected failed connection state while AwaitingResponse: client_receiver: {:?}", + self.client_rx + ), }; } Either::Right((Either::Right(_), _peer_fut)) => { @@ -481,7 +491,7 @@ where Some(InProgressClientRequest { tx, span, .. }) => { trace!( parent: &span, - "erroring pending request to failed connection" + "sending an error response to a pending request on a failed connection" ); let e = self .error_slot @@ -504,7 +514,10 @@ where E: Into, { let e = e.into(); - debug!(%e, "failing peer service with error"); + debug!(%e, + connection_state = ?self.state, + client_receiver = ?self.client_rx, + "failing peer service with error"); // Update the shared error slot let mut guard = self .error_slot @@ -575,9 +588,17 @@ where // These matches return a Result with (new_state, Option) or an (error, Sender) let new_state_result = match (&self.state, request) { - (Failed, _) => panic!("failed connection cannot handle requests"), - (AwaitingResponse { .. }, _) => panic!("tried to update pending request"), - + (Failed, request) => panic!( + "failed connection cannot handle new request: {:?}, client_receiver: {:?}", + request, + self.client_rx + ), + (pending @ AwaitingResponse { .. }, request) => panic!( + "tried to process new request: {:?} while awaiting a response: {:?}, client_receiver: {:?}", + request, + pending, + self.client_rx + ), (AwaitingRequest, Peers) => match self.peer_tx.send(Message::GetAddr).await { Ok(()) => Ok(( AwaitingResponse { @@ -875,7 +896,10 @@ where self.fail_with(PeerError::Overloaded); } else { // We could send a reject to the remote peer. - error!(%e); + error!(%e, + connection_state = ?self.state, + client_receiver = ?self.client_rx, + "error processing peer request"); } return; } From 5d481524dbba3c172bbd385cd0a5d2b50c25814d Mon Sep 17 00:00:00 2001 From: teor Date: Mon, 18 Jan 2021 14:53:23 +1000 Subject: [PATCH 4/4] Revert "Stop panicking when fail_with is called twice on a connection" But keep the extra error information. --- zebra-network/src/peer/connection.rs | 31 +++++++++------------------- 1 file changed, 10 insertions(+), 21 deletions(-) diff --git a/zebra-network/src/peer/connection.rs b/zebra-network/src/peer/connection.rs index 838f25d1d39..7fd535aa937 100644 --- a/zebra-network/src/peer/connection.rs +++ b/zebra-network/src/peer/connection.rs @@ -525,31 +525,20 @@ where .lock() .expect("mutex should be unpoisoned"); if let Some(original_error) = guard.clone() { - // A failed connection might experience further errors if we: - // 1. concurrently process two different messages - // 2. check for a failed state for the second message - // 3. fail the connection due to the first message - // 4. fail the connection due to the second message - // - // It's not clear: - // * if this is actually a bug, - // * how we can modify Zebra to avoid it. - // - // This warning can also happen due to these bugs: + // This panic typically happens due to these bugs: // * we mark a connection as failed without using fail_with // * we call fail_with without checking for a failed connection // state // - // See the original bug #1510, the initial fix #1531, and the later - // bug #1599. - warn!(?original_error, - new_error = ?e, - connection_state = ?self.state, - client_receiver = ?self.client_rx, - "calling fail_with on already-failed connection state: ignoring new error"); - // we don't need to clean up the connection, the original call to - // fail_with does that - return; + // See the original bug #1510 and PR #1531, and the later bug #1599 + // and PR #1600. + panic!( + "calling fail_with on already-failed connection state: failed connections must stop processing pending requests and responses, then close the connection. state: {:?} original error: {:?} new error: {:?} client receiver: {:?}", + self.state, + original_error, + e, + self.client_rx + ); } else { *guard = Some(e); }