Skip to content

Commit f1062eb

Browse files
banksraskchanky
andcommitted
Fix AP upgrade version issue (#27277)
* Fix AP upgrade version issue * add heartbeat logging at trace level * add log to show when heartbeats resume * Test the plumbing * Revert "Test the plumbing" This reverts commit e25fcd8. * Add CHANGELOG * Add plumbing test * Update misleading comment --------- Co-authored-by: Josh Black <raskchanky@gmail.com>
1 parent c08825e commit f1062eb

File tree

6 files changed

+71
-11
lines changed

6 files changed

+71
-11
lines changed

changelog/27277.txt

+4
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
```release-note:bug
2+
storage/raft (enterprise): Fix a regression introduced in 1.15.8 that causes
3+
autopilot to fail to discover new server versions and so not trigger an upgrade.
4+
```

physical/raft/raft.go

+12-6
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ import (
3737
"github.com/hashicorp/vault/sdk/logical"
3838
"github.com/hashicorp/vault/sdk/physical"
3939
"github.com/hashicorp/vault/vault/cluster"
40+
"github.com/hashicorp/vault/version"
4041
etcdbolt "go.etcd.io/bbolt"
4142
)
4243

@@ -520,6 +521,7 @@ func NewRaftBackend(conf map[string]string, logger log.Logger) (physical.Backend
520521
nonVoter: nonVoter,
521522
upgradeVersion: upgradeVersion,
522523
failGetInTxn: new(uint32),
524+
effectiveSDKVersion: version.GetVersion().Version,
523525
}, nil
524526
}
525527

@@ -579,12 +581,6 @@ func (b *RaftBackend) FailGetInTxn(fail bool) {
579581
atomic.StoreUint32(b.failGetInTxn, val)
580582
}
581583

582-
func (b *RaftBackend) SetEffectiveSDKVersion(sdkVersion string) {
583-
b.l.Lock()
584-
b.effectiveSDKVersion = sdkVersion
585-
b.l.Unlock()
586-
}
587-
588584
func (b *RaftBackend) RedundancyZone() string {
589585
b.l.RLock()
590586
defer b.l.RUnlock()
@@ -877,6 +873,11 @@ type SetupOpts struct {
877873
// RecoveryModeConfig is the configuration for the raft cluster in recovery
878874
// mode.
879875
RecoveryModeConfig *raft.Configuration
876+
877+
// EffectiveSDKVersion is typically the version string baked into the binary.
878+
// We pass it in though because it can be overridden in tests or via ENV in
879+
// core.
880+
EffectiveSDKVersion string
880881
}
881882

882883
func (b *RaftBackend) StartRecoveryCluster(ctx context.Context, peer Peer) error {
@@ -920,6 +921,11 @@ func (b *RaftBackend) SetupCluster(ctx context.Context, opts SetupOpts) error {
920921
return errors.New("no local node id configured")
921922
}
922923

924+
if opts.EffectiveSDKVersion != "" {
925+
// Override the SDK version
926+
b.effectiveSDKVersion = opts.EffectiveSDKVersion
927+
}
928+
923929
// Setup the raft config
924930
raftConfig := raft.DefaultConfig()
925931
if err := b.applyConfigSettings(raftConfig); err != nil {

physical/raft/raft_autopilot.go

+4-1
Original file line numberDiff line numberDiff line change
@@ -421,7 +421,10 @@ func (d *Delegate) KnownServers() map[raft.ServerID]*autopilot.Server {
421421
}
422422
followerVersion = leaderVersion
423423
} else {
424-
delete(d.emptyVersionLogs, currentServerID)
424+
if _, ok := d.emptyVersionLogs[currentServerID]; ok {
425+
d.logger.Trace("received non-empty version in heartbeat state. no longer need to fake it", "id", id, "update_version", followerVersion)
426+
delete(d.emptyVersionLogs, currentServerID)
427+
}
425428
}
426429
d.dl.Unlock()
427430

vault/external_tests/raft/raft_autopilot_test.go

+38
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,44 @@ func TestRaft_Autopilot_Disable(t *testing.T) {
4444
require.Nil(t, nil, state)
4545
}
4646

47+
// TestRaft_Autopilot_BinaryVersionPlumbing is an apparently trivial test that
48+
// ensures that the default plumbing in Vault core to configure the binary
49+
// version of the raft library is working. Hopefully this will trivially pass
50+
// from now on, however it would have caught a regression in the past!
51+
func TestRaft_Autopilot_BinaryVersionPlumbing(t *testing.T) {
52+
t.Parallel()
53+
54+
coreCfg, clusterOpts := raftClusterBuilder(t, &RaftClusterOpts{
55+
EnableAutopilot: true,
56+
// We need 2 nodes because the code path that regressed was different on a
57+
// standby vs active node so we'd not detect the problem if we only test on
58+
// an active node.
59+
NumCores: 2,
60+
})
61+
62+
// Default options should not set EffectiveSDKVersion(Map) which would defeat
63+
// the point of this test by plumbing versions via config.
64+
require.Nil(t, clusterOpts.EffectiveSDKVersionMap)
65+
require.Empty(t, coreCfg.EffectiveSDKVersion)
66+
67+
c := vault.NewTestCluster(t, coreCfg, &clusterOpts)
68+
defer c.Cleanup()
69+
70+
// Wait for follower to be perf standby (in Ent, in CE it will only wait for
71+
// active). In either Enterprise or CE case, this should pass if we've plumbed
72+
// the version correctly so it's valuable for both.
73+
testhelpers.WaitForActiveNodeAndStandbys(t, c)
74+
for _, core := range c.Cores {
75+
be := core.UnderlyingRawStorage.(*raft.RaftBackend)
76+
require.Equal(t, version.GetVersion().Version, be.UpgradeVersion(),
77+
"expected raft upgrade version to default to Vault version for core %q",
78+
core.NodeID)
79+
}
80+
}
81+
82+
// TestRaft_Autopilot_Stabilization_And_State verifies that nodes get promoted
83+
// to be voters after the stabilization time has elapsed. Also checks that
84+
// the autopilot state is Healthy once all nodes are available.
4785
func TestRaft_Autopilot_Stabilization_And_State(t *testing.T) {
4886
t.Parallel()
4987
cluster, _ := raftCluster(t, &RaftClusterOpts{

vault/raft.go

+4-4
Original file line numberDiff line numberDiff line change
@@ -152,9 +152,10 @@ func (c *Core) startRaftBackend(ctx context.Context) (retErr error) {
152152
raftBackend.SetRestoreCallback(c.raftSnapshotRestoreCallback(true, true))
153153

154154
if err := raftBackend.SetupCluster(ctx, raft.SetupOpts{
155-
TLSKeyring: raftTLS,
156-
ClusterListener: c.getClusterListener(),
157-
StartAsLeader: creating,
155+
TLSKeyring: raftTLS,
156+
ClusterListener: c.getClusterListener(),
157+
StartAsLeader: creating,
158+
EffectiveSDKVersion: c.effectiveSDKVersion,
158159
}); err != nil {
159160
return err
160161
}
@@ -312,7 +313,6 @@ func (c *Core) setupRaftActiveNode(ctx context.Context) error {
312313
}
313314

314315
c.logger.Info("starting raft active node")
315-
raftBackend.SetEffectiveSDKVersion(c.effectiveSDKVersion)
316316

317317
c.pendingRaftPeers = &sync.Map{}
318318

vault/request_forwarding_rpc.go

+9
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,15 @@ func (s *forwardedRequestRPCServer) Echo(ctx context.Context, in *EchoRequest) (
9292
}
9393

9494
if in.RaftAppliedIndex > 0 && len(in.RaftNodeID) > 0 && s.raftFollowerStates != nil {
95+
s.core.logger.Trace("forwarding RPC: echo received",
96+
"node_id", in.RaftNodeID,
97+
"applied_index", in.RaftAppliedIndex,
98+
"term", in.RaftTerm,
99+
"desired_suffrage", in.RaftDesiredSuffrage,
100+
"sdk_version", in.SdkVersion,
101+
"upgrade_version", in.RaftUpgradeVersion,
102+
"redundancy_zone", in.RaftRedundancyZone)
103+
95104
s.raftFollowerStates.Update(&raft.EchoRequestUpdate{
96105
NodeID: in.RaftNodeID,
97106
AppliedIndex: in.RaftAppliedIndex,

0 commit comments

Comments
 (0)