Skip to content

Commit ac64b00

Browse files
banksraskchanky
andcommitted
Fix AP upgrade version issue (#27277)
* Fix AP upgrade version issue * add heartbeat logging at trace level * add log to show when heartbeats resume * Test the plumbing * Revert "Test the plumbing" This reverts commit e25fcd8. * Add CHANGELOG * Add plumbing test * Update misleading comment --------- Co-authored-by: Josh Black <raskchanky@gmail.com>
1 parent c78c37b commit ac64b00

File tree

6 files changed

+68
-11
lines changed

6 files changed

+68
-11
lines changed

changelog/27277.txt

+4
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
```release-note:bug
2+
storage/raft (enterprise): Fix a regression introduced in 1.15.8 that causes
3+
autopilot to fail to discover new server versions and so not trigger an upgrade.
4+
```

physical/raft/raft.go

+12-6
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ import (
4141
"github.com/hashicorp/vault/sdk/logical"
4242
"github.com/hashicorp/vault/sdk/physical"
4343
"github.com/hashicorp/vault/vault/cluster"
44+
"github.com/hashicorp/vault/version"
4445
etcdbolt "go.etcd.io/bbolt"
4546
)
4647

@@ -590,6 +591,7 @@ func NewRaftBackend(conf map[string]string, logger log.Logger) (physical.Backend
590591
failGetInTxn: new(uint32),
591592
raftLogVerifierEnabled: backendConfig.RaftLogVerifierEnabled,
592593
raftLogVerificationInterval: backendConfig.RaftLogVerificationInterval,
594+
effectiveSDKVersion: version.GetVersion().Version,
593595
}, nil
594596
}
595597

@@ -646,12 +648,6 @@ func (b *RaftBackend) FailGetInTxn(fail bool) {
646648
atomic.StoreUint32(b.failGetInTxn, val)
647649
}
648650

649-
func (b *RaftBackend) SetEffectiveSDKVersion(sdkVersion string) {
650-
b.l.Lock()
651-
b.effectiveSDKVersion = sdkVersion
652-
b.l.Unlock()
653-
}
654-
655651
func (b *RaftBackend) RedundancyZone() string {
656652
b.l.RLock()
657653
defer b.l.RUnlock()
@@ -1075,6 +1071,11 @@ type SetupOpts struct {
10751071
// RecoveryModeConfig is the configuration for the raft cluster in recovery
10761072
// mode.
10771073
RecoveryModeConfig *raft.Configuration
1074+
1075+
// EffectiveSDKVersion is typically the version string baked into the binary.
1076+
// We pass it in though because it can be overridden in tests or via ENV in
1077+
// core.
1078+
EffectiveSDKVersion string
10781079
}
10791080

10801081
func (b *RaftBackend) StartRecoveryCluster(ctx context.Context, peer Peer) error {
@@ -1118,6 +1119,11 @@ func (b *RaftBackend) SetupCluster(ctx context.Context, opts SetupOpts) error {
11181119
return errors.New("no local node id configured")
11191120
}
11201121

1122+
if opts.EffectiveSDKVersion != "" {
1123+
// Override the SDK version
1124+
b.effectiveSDKVersion = opts.EffectiveSDKVersion
1125+
}
1126+
11211127
// Setup the raft config
11221128
raftConfig := raft.DefaultConfig()
11231129
if err := b.applyConfigSettings(raftConfig); err != nil {

physical/raft/raft_autopilot.go

+4-1
Original file line numberDiff line numberDiff line change
@@ -421,7 +421,10 @@ func (d *Delegate) KnownServers() map[raft.ServerID]*autopilot.Server {
421421
}
422422
followerVersion = leaderVersion
423423
} else {
424-
delete(d.emptyVersionLogs, currentServerID)
424+
if _, ok := d.emptyVersionLogs[currentServerID]; ok {
425+
d.logger.Trace("received non-empty version in heartbeat state. no longer need to fake it", "id", id, "update_version", followerVersion)
426+
delete(d.emptyVersionLogs, currentServerID)
427+
}
425428
}
426429
d.dl.Unlock()
427430

vault/external_tests/raft/raft_autopilot_test.go

+35
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,41 @@ func TestRaft_Autopilot_Disable(t *testing.T) {
4141
require.Nil(t, nil, state)
4242
}
4343

44+
// TestRaft_Autopilot_BinaryVersionPlumbing is an apparently trivial test that
45+
// ensures that the default plumbing in Vault core to configure the binary
46+
// version of the raft library is working. Hopefully this will trivially pass
47+
// from now on, however it would have caught a regression in the past!
48+
func TestRaft_Autopilot_BinaryVersionPlumbing(t *testing.T) {
49+
t.Parallel()
50+
51+
coreCfg, clusterOpts := raftClusterBuilder(t, &RaftClusterOpts{
52+
EnableAutopilot: true,
53+
// We need 2 nodes because the code path that regressed was different on a
54+
// standby vs active node so we'd not detect the problem if we only test on
55+
// an active node.
56+
NumCores: 2,
57+
})
58+
59+
// Default options should not set EffectiveSDKVersion(Map) which would defeat
60+
// the point of this test by plumbing versions via config.
61+
require.Nil(t, clusterOpts.EffectiveSDKVersionMap)
62+
require.Empty(t, coreCfg.EffectiveSDKVersion)
63+
64+
c := vault.NewTestCluster(t, coreCfg, &clusterOpts)
65+
defer c.Cleanup()
66+
67+
// Wait for follower to be perf standby (in Ent, in CE it will only wait for
68+
// active). In either Enterprise or CE case, this should pass if we've plumbed
69+
// the version correctly so it's valuable for both.
70+
testhelpers.WaitForActiveNodeAndStandbys(t, c)
71+
for _, core := range c.Cores {
72+
be := core.UnderlyingRawStorage.(*raft.RaftBackend)
73+
require.Equal(t, version.GetVersion().Version, be.UpgradeVersion(),
74+
"expected raft upgrade version to default to Vault version for core %q",
75+
core.NodeID)
76+
}
77+
}
78+
4479
// TestRaft_Autopilot_Stabilization_And_State verifies that nodes get promoted
4580
// to be voters after the stabilization time has elapsed. Also checks that
4681
// the autopilot state is Healthy once all nodes are available.

vault/raft.go

+4-4
Original file line numberDiff line numberDiff line change
@@ -149,9 +149,10 @@ func (c *Core) startRaftBackend(ctx context.Context) (retErr error) {
149149
raftBackend.SetRestoreCallback(c.raftSnapshotRestoreCallback(true, true))
150150

151151
if err := raftBackend.SetupCluster(ctx, raft.SetupOpts{
152-
TLSKeyring: raftTLS,
153-
ClusterListener: c.getClusterListener(),
154-
StartAsLeader: creating,
152+
TLSKeyring: raftTLS,
153+
ClusterListener: c.getClusterListener(),
154+
StartAsLeader: creating,
155+
EffectiveSDKVersion: c.effectiveSDKVersion,
155156
}); err != nil {
156157
return err
157158
}
@@ -309,7 +310,6 @@ func (c *Core) setupRaftActiveNode(ctx context.Context) error {
309310
}
310311

311312
c.logger.Info("starting raft active node")
312-
raftBackend.SetEffectiveSDKVersion(c.effectiveSDKVersion)
313313

314314
c.pendingRaftPeers = &sync.Map{}
315315

vault/request_forwarding_rpc.go

+9
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,15 @@ func (s *forwardedRequestRPCServer) Echo(ctx context.Context, in *EchoRequest) (
100100
}
101101

102102
if in.RaftAppliedIndex > 0 && len(in.RaftNodeID) > 0 && s.raftFollowerStates != nil {
103+
s.core.logger.Trace("forwarding RPC: echo received",
104+
"node_id", in.RaftNodeID,
105+
"applied_index", in.RaftAppliedIndex,
106+
"term", in.RaftTerm,
107+
"desired_suffrage", in.RaftDesiredSuffrage,
108+
"sdk_version", in.SdkVersion,
109+
"upgrade_version", in.RaftUpgradeVersion,
110+
"redundancy_zone", in.RaftRedundancyZone)
111+
103112
s.raftFollowerStates.Update(&raft.EchoRequestUpdate{
104113
NodeID: in.RaftNodeID,
105114
AppliedIndex: in.RaftAppliedIndex,

0 commit comments

Comments
 (0)