@@ -23,6 +23,7 @@ import (
2323 "github.com/algorand/go-deadlock"
2424
2525 "github.com/algorand/go-algorand/crypto"
26+ "github.com/algorand/go-algorand/logging"
2627)
2728
2829//msgp:ignore pmStage
@@ -384,3 +385,127 @@ func (pm *connectionPerformanceMonitor) accumulateMessage(msg *IncomingMessage,
384385 delete (msgBucket .messages , msgDigest )
385386 }
386387}
388+
389+ type networkAdvanceMonitor struct {
390+ // lastNetworkAdvance contains the last timestamp where the agreement protocol was able to make a notable progress.
391+ // it used as a watchdog to help us detect connectivity issues ( such as cliques )
392+ lastNetworkAdvance time.Time
393+
394+ mu deadlock.Mutex
395+ }
396+
397+ func makeNetworkAdvanceMonitor () * networkAdvanceMonitor {
398+ return & networkAdvanceMonitor {
399+ lastNetworkAdvance : time .Now ().UTC (),
400+ }
401+ }
402+
403+ func (m * networkAdvanceMonitor ) lastAdvancedWithin (interval time.Duration ) bool {
404+ m .mu .Lock ()
405+ defer m .mu .Unlock ()
406+ // now < last + interval <=> now - last < interval
407+ return time .Now ().UTC ().Before (m .lastNetworkAdvance .Add (interval ))
408+ }
409+
410+ func (m * networkAdvanceMonitor ) updateLastAdvance () {
411+ m .mu .Lock ()
412+ defer m .mu .Unlock ()
413+ m .lastNetworkAdvance = time .Now ().UTC ()
414+ }
415+
416+ type outgoingConnsCloser struct {
417+ log logging.Logger
418+ net outgoingDisconnectable
419+ cliqueResolveInterval time.Duration
420+ connPerfMonitor * connectionPerformanceMonitor
421+ netAdvMonitor * networkAdvanceMonitor
422+ }
423+
424+ type outgoingDisconnectable interface {
425+ outgoingPeers () (peers []Peer )
426+ numOutgoingPending () int
427+ disconnect (badnode Peer , reason disconnectReason )
428+ OnNetworkAdvance ()
429+ }
430+
431+ func makeOutgoingConnsCloser (log logging.Logger , net outgoingDisconnectable , connPerfMonitor * connectionPerformanceMonitor , cliqueResolveInterval time.Duration ) * outgoingConnsCloser {
432+ return & outgoingConnsCloser {
433+ log : log ,
434+ net : net ,
435+ cliqueResolveInterval : cliqueResolveInterval ,
436+ connPerfMonitor : connPerfMonitor ,
437+ netAdvMonitor : makeNetworkAdvanceMonitor (),
438+ }
439+ }
440+
441+ // checkExistingConnectionsNeedDisconnecting check to see if existing connection need to be dropped due to
442+ // performance issues and/or network being stalled.
443+ func (cc * outgoingConnsCloser ) checkExistingConnectionsNeedDisconnecting (targetConnCount int ) bool {
444+ // we already connected ( or connecting.. ) to GossipFanout peers.
445+ // get the actual peers.
446+ outgoingPeers := cc .net .outgoingPeers ()
447+ if len (outgoingPeers ) < targetConnCount {
448+ // reset the performance monitor.
449+ cc .connPerfMonitor .Reset ([]Peer {})
450+ return cc .checkNetworkAdvanceDisconnect ()
451+ }
452+
453+ if ! cc .connPerfMonitor .ComparePeers (outgoingPeers ) {
454+ // different set of peers. restart monitoring.
455+ cc .connPerfMonitor .Reset (outgoingPeers )
456+ }
457+
458+ // same set of peers.
459+ peerStat := cc .connPerfMonitor .GetPeersStatistics ()
460+ if peerStat == nil {
461+ // performance metrics are not yet ready.
462+ return cc .checkNetworkAdvanceDisconnect ()
463+ }
464+
465+ // update peers with the performance metrics we've gathered.
466+ var leastPerformingPeer * wsPeer = nil
467+ for _ , stat := range peerStat .peerStatistics {
468+ wsPeer := stat .peer .(* wsPeer )
469+ wsPeer .peerMessageDelay = stat .peerDelay
470+ cc .log .Infof ("network performance monitor - peer '%s' delay %d first message portion %d%%" , wsPeer .GetAddress (), stat .peerDelay , int (stat .peerFirstMessage * 100 ))
471+ if wsPeer .throttledOutgoingConnection && leastPerformingPeer == nil {
472+ leastPerformingPeer = wsPeer
473+ }
474+ }
475+ if leastPerformingPeer == nil {
476+ return cc .checkNetworkAdvanceDisconnect ()
477+ }
478+ cc .net .disconnect (leastPerformingPeer , disconnectLeastPerformingPeer )
479+ cc .connPerfMonitor .Reset ([]Peer {})
480+
481+ return true
482+ }
483+
484+ // checkNetworkAdvanceDisconnect is using the lastNetworkAdvance indicator to see if the network is currently "stuck".
485+ // if it's seems to be "stuck", a randomly picked peer would be disconnected.
486+ func (cc * outgoingConnsCloser ) checkNetworkAdvanceDisconnect () bool {
487+ if cc .netAdvMonitor .lastAdvancedWithin (cc .cliqueResolveInterval ) {
488+ return false
489+ }
490+ outgoingPeers := cc .net .outgoingPeers ()
491+ if len (outgoingPeers ) == 0 {
492+ return false
493+ }
494+ if cc .net .numOutgoingPending () > 0 {
495+ // we're currently trying to extend the list of outgoing connections. no need to
496+ // disconnect any existing connection to free up room for another connection.
497+ return false
498+ }
499+ var peer * wsPeer
500+ disconnectPeerIdx := crypto .RandUint63 () % uint64 (len (outgoingPeers ))
501+ peer = outgoingPeers [disconnectPeerIdx ].(* wsPeer )
502+
503+ cc .net .disconnect (peer , disconnectCliqueResolve )
504+ cc .connPerfMonitor .Reset ([]Peer {})
505+ cc .net .OnNetworkAdvance ()
506+ return true
507+ }
508+
509+ func (cc * outgoingConnsCloser ) updateLastAdvance () {
510+ cc .netAdvMonitor .updateLastAdvance ()
511+ }
0 commit comments