From b394338ea7039b0e247396fd21a86829f594fbf6 Mon Sep 17 00:00:00 2001 From: irfan sharif Date: Thu, 10 Sep 2020 16:54:34 -0400 Subject: [PATCH] kvserver: address migration concern with node liveness In #53842 we introduced a change to always persist a liveness record on start up. As part of that change, we refactored how the liveness heartbeat codepath dealt with missing liveness records: it knew to fetch it from KV given we were now maintaining the invariant that it would always be present. Except that wasn't necessarily true, as demonstrated by the following scenario: ``` // - v20.1 node gets added to v20.1 cluster, and is quickly removed // before being able to persist its liveness record. // - The cluster is upgraded to v20.2. // - The node from earlier is rolled into v20.2, and re-added to the // cluster. // - It's never able to successfully heartbeat (it didn't join // through the join rpc, bootstrap, or gossip). Welp. ``` Though admittedly unlikely, we should handle it all the same instead of simply erroring out. We'll just fall back to creating the liveness record in-place as we did in v20.1 code. We can remove this fallback in 21.1 code. Release note: None --- pkg/kv/kvserver/node_liveness.go | 62 +++++++++++++++++++++++--------- 1 file changed, 46 insertions(+), 16 deletions(-) diff --git a/pkg/kv/kvserver/node_liveness.go b/pkg/kv/kvserver/node_liveness.go index 5a1b688db873..06af2f312940 100644 --- a/pkg/kv/kvserver/node_liveness.go +++ b/pkg/kv/kvserver/node_liveness.go @@ -793,7 +793,12 @@ func (nl *NodeLiveness) heartbeatInternal( } } - if oldLiveness == (kvserverpb.Liveness{}) { + // Let's compute what our new liveness record should be. + var newLiveness kvserverpb.Liveness + if oldLiveness != (kvserverpb.Liveness{}) { + // Start off with our existing view of liveness. + newLiveness = oldLiveness + } else { // We don't yet know about our own liveness record (which does exist, we // maintain the invariant that there's always a liveness record for // every given node). Let's retrieve it from KV before proceeding. @@ -804,25 +809,50 @@ func (nl *NodeLiveness) heartbeatInternal( if err != nil { return errors.Wrap(err, "unable to get liveness") } - if kv.Value == nil { - return ErrNoLivenessRecord - } - if err := kv.Value.GetProto(&oldLiveness); err != nil { - return errors.Wrap(err, "invalid liveness record") - } - oldLivenessRec := LivenessRecord{ - Liveness: oldLiveness, - raw: kv.Value.TagAndDataBytes(), - } + if kv.Value != nil { + // This is the happy path. Let's unpack the liveness record we found + // within KV, and use that to inform what our new liveness should + // be. + if err := kv.Value.GetProto(&oldLiveness); err != nil { + return errors.Wrap(err, "invalid liveness record") + } - // Offer it to make sure that when we actually try to update the - // liveness, the previous view is correct. - nl.maybeUpdate(oldLivenessRec) + oldLivenessRec := LivenessRecord{ + Liveness: oldLiveness, + raw: kv.Value.TagAndDataBytes(), + } + + // Update our cache with the liveness record we just found. + nl.maybeUpdate(oldLivenessRec) + + newLiveness = oldLiveness + } else { + // This is a "should basically never happen" scenario given our + // invariant around always persisting liveness records on node + // startup. But that was a change we added in 20.2. Though unlikely, + // it's possible to get into the following scenario: + // + // - v20.1 node gets added to v20.1 cluster, and is quickly removed + // before being able to persist its liveness record. + // - The cluster is upgraded to v20.2. + // - The node from earlier is rolled into v20.2, and re-added to the + // cluster. + // - It's never able to successfully heartbeat (it didn't join + // through the join rpc, bootstrap, or gossip). Welp. + // + // Given this possibility, we'll just fall back to creating the + // liveness record here as we did in v20.1 code. + // + // TODO(irfansharif): Remove this once v20.2 is cut. + log.Warningf(ctx, "missing liveness record for n%d; falling back to creating it in-place", nodeID) + newLiveness = kvserverpb.Liveness{ + NodeID: nodeID, + Epoch: 0, // incremented to epoch=1 below as needed + } + } } - // Let's compute what our new liveness record should be. - newLiveness := oldLiveness if incrementEpoch { newLiveness.Epoch++ newLiveness.Draining = false // clear draining field