From bfb050c9ccda64728fb9f751aa52c15ed2b44208 Mon Sep 17 00:00:00 2001 From: Matthias Radestock Date: Mon, 9 Nov 2015 14:58:33 +0000 Subject: [PATCH] better convergence when peers restart Previously when a peer restarted, information about the new incarnation (i.e. with a different UID) was not accepted by other peers (and connections would be dropped) unless all knowledge of the previous incarnation had been purged. This could result in a lot of connection churn and hence connectivity disruption, and, in some pathological cases, very slow convergence and hence acceptance of the new incarnation into the network. We now no longer drop connections when encountering different incarnations of a peer. There are two situations when that can happen: 1) on connection establishment we simply proceed 2) on receipt of gossip to ensure convergence we a) treat the UID as an additional discriminator when deciding whether we should update our information about a peer with that which was gossiped. Specifically, we update the information we hold when a) the gossiped version is greater, or b) is the same and the UID is greater. b) include the UID in the information we update c) move our own version number beyond any we receive for ourselves, if the received UID differs from ours. With (a) we establishes a total order of peer information across several incarnations of the same peer. i.e. we consider information to be fresher if it has a higher version, or the same version and higher UID. This may seem somewhat counter intutive, since it will generally treat information about new incarnations as older than old incarnations, since incarnations always start life with version 1. But to do better we'd need to establish a total order of incarnations that matches their temporal occurrence. Which requires some sort of durable state. So instead we have (c). Through that we learn the highest version number of any old incarnation of ourselves that other peers still hold, and then make sure that our version is greater than that. Essentially we continue where the old incarnations left off. It's as if instead of restarting we had simply changed UIDs. And due to (a) and (b) the information about the new incarnation of ourselves, now with a higher version, will supersede that of the old incarnations. Fixes #1554. --- router/connection.go | 4 --- router/local_peer.go | 10 ++++++ router/peers.go | 76 ++++++++++++++++++++------------------------ 3 files changed, 44 insertions(+), 46 deletions(-) diff --git a/router/connection.go b/router/connection.go index 7a1f878f4b..77a539ccb5 100644 --- a/router/connection.go +++ b/router/connection.go @@ -329,10 +329,6 @@ func (conn *LocalConnection) registerRemote(remote *Peer, acceptNewPeer bool) er } } - if conn.remote.UID != remote.UID { - return fmt.Errorf("Connection appears to be with different version of a peer we already know of") - } - if conn.remote == conn.local { return ErrConnectToSelf } diff --git a/router/local_peer.go b/router/local_peer.go index 80c5abcc22..ab54745a87 100644 --- a/router/local_peer.go +++ b/router/local_peer.go @@ -249,3 +249,13 @@ func (peer *LocalPeer) setShortID(shortID PeerShortID) { peer.ShortID = shortID peer.Version++ } + +func (peer *LocalPeer) setVersionBeyond(version uint64) bool { + peer.Lock() + defer peer.Unlock() + if version >= peer.Version { + peer.Version = version + 1 + return true + } + return false +} diff --git a/router/peers.go b/router/peers.go index 6082c754a7..9c30a4e292 100644 --- a/router/peers.go +++ b/router/peers.go @@ -40,14 +40,6 @@ func (upe UnknownPeerError) Error() string { return fmt.Sprint("Reference to unknown peer ", upe.Name) } -type NameCollisionError struct { - Name PeerName -} - -func (nce NameCollisionError) Error() string { - return fmt.Sprint("Multiple peers found with same name: ", nce.Name) -} - type PeerNameSet map[PeerName]struct{} type ConnectionSummary struct { @@ -68,6 +60,9 @@ type PeersPendingNotifications struct { // The local short ID needs reassigning due to a collision reassignLocalShortID bool + + // The local peer was modified + localPeerModified bool } func NewPeers(ourself *LocalPeer) *Peers { @@ -100,7 +95,8 @@ func (peers *Peers) OnInvalidateShortIDs(callback func()) { } func (peers *Peers) unlockAndNotify(pending *PeersPendingNotifications) { - broadcastLocalPeer := pending.reassignLocalShortID && peers.reassignLocalShortID(pending) + broadcastLocalPeer := (pending.reassignLocalShortID && peers.reassignLocalShortID(pending)) || + pending.localPeerModified onGC := peers.onGC onInvalidateShortIDs := peers.onInvalidateShortIDs peers.Unlock() @@ -436,12 +432,8 @@ func (peers *Peers) decodeUpdate(update []byte) (newPeers map[PeerName]*Peer, de newPeer := NewPeerFromSummary(peerSummary) decodedUpdate = append(decodedUpdate, newPeer) decodedConns = append(decodedConns, connSummaries) - existingPeer, found := peers.byName[newPeer.Name] - if !found { + if _, found := peers.byName[newPeer.Name]; !found { newPeers[newPeer.Name] = newPeer - } else if existingPeer.UID != newPeer.UID { - err = NameCollisionError{Name: newPeer.Name} - return } } @@ -469,36 +461,36 @@ func (peers *Peers) applyUpdate(decodedUpdate []*Peer, decodedConns [][]Connecti connSummaries := decodedConns[idx] name := newPeer.Name // guaranteed to find peer in the peers.byName - peer := peers.byName[name] - if peer != newPeer && - (peer == peers.ourself.Peer || peer.Version >= newPeer.Version) { - // Nobody but us updates us. And if we know more about a - // peer than what's in the the update, we ignore the - // latter. - continue - } - // If we're here, either it was a new peer, or the update has - // more info about the peer than we do. Either case, we need - // to set version and conns and include the updated peer in - // the outgoing update. - - // Can peer have been updated by anyone else in the mean time? - // No - we know that peer is not ourself, so the only prospect - // for an update would be someone else calling - // router.Peers.ApplyUpdate. But ApplyUpdate takes the Lock on - // the router.Peers, so there can be no race here. - peer.Version = newPeer.Version - peer.connections = makeConnsMap(peer, connSummaries, peers.byName) - - if newPeer.ShortID != peer.ShortID { - peers.deleteByShortID(peer, pending) - peer.ShortID = newPeer.ShortID - peers.addByShortID(peer, pending) + switch peer := peers.byName[name]; peer { + case peers.ourself.Peer: + if newPeer.UID != peer.UID { + // The update contains information about an old + // incarnation of ourselves. We increase our version + // number beyond that which we received, so our + // information supersedes the old one when it is + // received by other peers. + pending.localPeerModified = peers.ourself.setVersionBeyond(newPeer.Version) + } + case newPeer: + peer.connections = makeConnsMap(peer, connSummaries, peers.byName) + newUpdate[name] = void + default: // existing peer + if newPeer.Version < peer.Version || + (newPeer.Version == peer.Version && newPeer.UID <= peer.UID) { + continue + } + peer.Version = newPeer.Version + peer.UID = newPeer.UID + peer.connections = makeConnsMap(peer, connSummaries, peers.byName) + + if newPeer.ShortID != peer.ShortID { + peers.deleteByShortID(peer, pending) + peer.ShortID = newPeer.ShortID + peers.addByShortID(peer, pending) + } + newUpdate[name] = void } - - newUpdate[name] = void } - return newUpdate }