From d264188c9cfa5fecd0852a7aaa7b822853ec61c5 Mon Sep 17 00:00:00 2001
From: teor <teor@riseup.net>
Date: Fri, 15 Jan 2021 16:23:19 +1000
Subject: [PATCH 1/4] Stop processing the request after failing an overloaded
 connection

zebra-network's Connection expects that `fail_with` is only called once
per connection, but the overload handling code continues to process the
current request after an overload error, potentially leading to further
failures.

Closes #1599
---
 zebra-network/src/peer/connection.rs | 1 +
 1 file changed, 1 insertion(+)

diff --git a/zebra-network/src/peer/connection.rs b/zebra-network/src/peer/connection.rs
index 36cdd06173c..d834ce877a6 100644
--- a/zebra-network/src/peer/connection.rs
+++ b/zebra-network/src/peer/connection.rs
@@ -840,6 +840,7 @@ where
         if self.svc.ready_and().await.is_err() {
             // Treat all service readiness errors as Overloaded
             self.fail_with(PeerError::Overloaded);
+            return;
         }
 
         let rsp = match self.svc.call(req).await {

From a16c5613d76f97978f586c3377d051c08a64a9c0 Mon Sep 17 00:00:00 2001
From: teor <teor@riseup.net>
Date: Fri, 15 Jan 2021 16:26:34 +1000
Subject: [PATCH 2/4] Stop panicking when fail_with is called twice on a
 connection

We can't rule out the connection state changing between the state checks
and any eventual failures, particularly in the presence of async code.

So we turn this panic into a warning.
---
 zebra-network/src/peer/connection.rs | 28 ++++++++++++++++++++++++++--
 1 file changed, 26 insertions(+), 2 deletions(-)

diff --git a/zebra-network/src/peer/connection.rs b/zebra-network/src/peer/connection.rs
index d834ce877a6..f94090d2998 100644
--- a/zebra-network/src/peer/connection.rs
+++ b/zebra-network/src/peer/connection.rs
@@ -511,8 +511,32 @@ where
             .0
             .lock()
             .expect("mutex should be unpoisoned");
-        if guard.is_some() {
-            panic!("called fail_with on already-failed connection state");
+        if let Some(original_error) = guard.clone() {
+            // A failed connection might experience further errors if we:
+            // 1. concurrently process two different messages
+            // 2. check for a failed state for the second message
+            // 3. fail the connection due to the first message
+            // 4. fail the connection due to the second message
+            //
+            // It's not clear:
+            // * if this is actually a bug,
+            // * how we can modify Zebra to avoid it.
+            //
+            // This warning can also happen due to these bugs:
+            // * we mark a connection as failed without using fail_with
+            // * we call fail_with without checking for a failed connection
+            //   state
+            //
+            // See the original bug #1510, the initial fix #1531, and the later
+            // bug #1599.
+            warn!(?original_error,
+                  new_error = ?e,
+                  connection_state = ?self.state,
+                  client_receiver = ?self.client_rx,
+                  "calling fail_with on already-failed connection state: ignoring new error");
+            // we don't need to clean up the connection, the original call to
+            // fail_with does that
+            return;
         } else {
             *guard = Some(e);
         }

From 73f40caf4130a25d5e20749a0e28cb7875592ef5 Mon Sep 17 00:00:00 2001
From: teor <teor@riseup.net>
Date: Fri, 15 Jan 2021 16:28:30 +1000
Subject: [PATCH 3/4] Improve logging for connection state errors

---
 zebra-network/src/peer/connection.rs | 42 ++++++++++++++++++++++------
 1 file changed, 33 insertions(+), 9 deletions(-)

diff --git a/zebra-network/src/peer/connection.rs b/zebra-network/src/peer/connection.rs
index f94090d2998..838f25d1d39 100644
--- a/zebra-network/src/peer/connection.rs
+++ b/zebra-network/src/peer/connection.rs
@@ -423,7 +423,11 @@ where
                                 State::AwaitingResponse {
                                     ref mut handler, ..
                                 } => span.in_scope(|| handler.process_message(peer_msg)),
-                                _ => unreachable!(),
+                                _ => unreachable!("unexpected state after AwaitingResponse: {:?}, peer_msg: {:?}, client_receiver: {:?}",
+                                                  self.state,
+                                                  peer_msg,
+                                                  self.client_rx,
+                                ),
                             };
                             // If the message was not consumed, check whether it
                             // should be handled as a request.
@@ -444,7 +448,10 @@ where
                                         State::AwaitingRequest
                                     }
                                     pending @ State::AwaitingResponse { .. } => pending,
-                                    _ => unreachable!(),
+                                    _ => unreachable!(
+                                        "unexpected failed connection state while AwaitingResponse: client_receiver: {:?}",
+                                        self.client_rx
+                                    ),
                                 };
                             }
                         }
@@ -465,7 +472,10 @@ where
                                     let _ = tx.send(Err(e.into()));
                                     State::AwaitingRequest
                                 }
-                                _ => unreachable!(),
+                                _ => unreachable!(
+                                    "unexpected failed connection state while AwaitingResponse: client_receiver: {:?}",
+                                    self.client_rx
+                                ),
                             };
                         }
                         Either::Right((Either::Right(_), _peer_fut)) => {
@@ -481,7 +491,7 @@ where
                         Some(InProgressClientRequest { tx, span, .. }) => {
                             trace!(
                                 parent: &span,
-                                "erroring pending request to failed connection"
+                                "sending an error response to a pending request on a failed connection"
                             );
                             let e = self
                                 .error_slot
@@ -504,7 +514,10 @@ where
         E: Into<SharedPeerError>,
     {
         let e = e.into();
-        debug!(%e, "failing peer service with error");
+        debug!(%e,
+               connection_state = ?self.state,
+               client_receiver = ?self.client_rx,
+               "failing peer service with error");
         // Update the shared error slot
         let mut guard = self
             .error_slot
@@ -575,9 +588,17 @@ where
 
         // These matches return a Result with (new_state, Option<Sender>) or an (error, Sender)
         let new_state_result = match (&self.state, request) {
-            (Failed, _) => panic!("failed connection cannot handle requests"),
-            (AwaitingResponse { .. }, _) => panic!("tried to update pending request"),
-
+            (Failed, request) => panic!(
+                "failed connection cannot handle new request: {:?}, client_receiver: {:?}",
+                request,
+                self.client_rx
+            ),
+            (pending @ AwaitingResponse { .. }, request) => panic!(
+                "tried to process new request: {:?} while awaiting a response: {:?}, client_receiver: {:?}",
+                request,
+                pending,
+                self.client_rx
+            ),
             (AwaitingRequest, Peers) => match self.peer_tx.send(Message::GetAddr).await {
                 Ok(()) => Ok((
                     AwaitingResponse {
@@ -875,7 +896,10 @@ where
                     self.fail_with(PeerError::Overloaded);
                 } else {
                     // We could send a reject to the remote peer.
-                    error!(%e);
+                    error!(%e,
+                           connection_state = ?self.state,
+                           client_receiver = ?self.client_rx,
+                           "error processing peer request");
                 }
                 return;
             }

From 5d481524dbba3c172bbd385cd0a5d2b50c25814d Mon Sep 17 00:00:00 2001
From: teor <teor@riseup.net>
Date: Mon, 18 Jan 2021 14:53:23 +1000
Subject: [PATCH 4/4] Revert "Stop panicking when fail_with is called twice on
 a connection"

But keep the extra error information.
---
 zebra-network/src/peer/connection.rs | 31 +++++++++-------------------
 1 file changed, 10 insertions(+), 21 deletions(-)

diff --git a/zebra-network/src/peer/connection.rs b/zebra-network/src/peer/connection.rs
index 838f25d1d39..7fd535aa937 100644
--- a/zebra-network/src/peer/connection.rs
+++ b/zebra-network/src/peer/connection.rs
@@ -525,31 +525,20 @@ where
             .lock()
             .expect("mutex should be unpoisoned");
         if let Some(original_error) = guard.clone() {
-            // A failed connection might experience further errors if we:
-            // 1. concurrently process two different messages
-            // 2. check for a failed state for the second message
-            // 3. fail the connection due to the first message
-            // 4. fail the connection due to the second message
-            //
-            // It's not clear:
-            // * if this is actually a bug,
-            // * how we can modify Zebra to avoid it.
-            //
-            // This warning can also happen due to these bugs:
+            // This panic typically happens due to these bugs:
             // * we mark a connection as failed without using fail_with
             // * we call fail_with without checking for a failed connection
             //   state
             //
-            // See the original bug #1510, the initial fix #1531, and the later
-            // bug #1599.
-            warn!(?original_error,
-                  new_error = ?e,
-                  connection_state = ?self.state,
-                  client_receiver = ?self.client_rx,
-                  "calling fail_with on already-failed connection state: ignoring new error");
-            // we don't need to clean up the connection, the original call to
-            // fail_with does that
-            return;
+            // See the original bug #1510 and PR #1531, and the later bug #1599
+            // and PR #1600.
+            panic!(
+                "calling fail_with on already-failed connection state: failed connections must stop processing pending requests and responses, then close the connection. state: {:?} original error: {:?} new error: {:?} client receiver: {:?}",
+                self.state,
+                original_error,
+                e,
+                self.client_rx
+            );
         } else {
             *guard = Some(e);
         }