Skip to content

Commit

Permalink
acl: use the presence of a management policy in the state store as a …
Browse files Browse the repository at this point in the history
…sign that we already migrated to v2 acls (#9505)

This way we only have to wait for the serf barrier to pass once before
we can upgrade to v2 acls. Without this patch every restart needs to
re-compute the change, and potentially if a stray older node joins after
a migration it might regress back to v1 mode which would be problematic.
  • Loading branch information
rboyer authored Jan 5, 2021
1 parent 2d11fc6 commit 19baf4b
Show file tree
Hide file tree
Showing 3 changed files with 107 additions and 0 deletions.
3 changes: 3 additions & 0 deletions .changelog/9505.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
```release-note:improvement
acl: use the presence of a management policy in the state store as a sign that we already migrated to v2 acls
```
10 changes: 10 additions & 0 deletions agent/consul/acl_server.go
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,16 @@ func (s *Server) canUpgradeToNewACLs(isLeader bool) bool {
return false
}

// Check to see if we already upgraded the last time we ran by seeing if we
// have a copy of any global management policy stored locally. This should
// always be true because policies always replicate.
_, mgmtPolicy, err := s.fsm.State().ACLPolicyGetByID(nil, structs.ACLPolicyGlobalManagementID, structs.DefaultEnterpriseMeta())
if err != nil {
s.logger.Warn("Failed to get the builtin global-management policy to check for a completed ACL upgrade; skipping this optimization", "error", err)
} else if mgmtPolicy != nil {
return true
}

if !s.InACLDatacenter() {
foundServers, mode, _ := ServersGetACLMode(s, "", s.config.ACLDatacenter)
if mode != structs.ACLModeEnabled || !foundServers {
Expand Down
94 changes: 94 additions & 0 deletions agent/consul/leader_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import (
"time"

"github.com/hashicorp/consul/agent/structs"
tokenStore "github.com/hashicorp/consul/agent/token"
"github.com/hashicorp/consul/api"
"github.com/hashicorp/consul/sdk/testutil"
"github.com/hashicorp/consul/sdk/testutil/retry"
Expand Down Expand Up @@ -1272,6 +1273,99 @@ func TestLeader_ACLUpgrade(t *testing.T) {
})
}

func TestLeader_ACLUpgrade_IsStickyEvenIfSerfTagsRegress(t *testing.T) {
if testing.Short() {
t.Skip("too slow for testing.Short")
}

t.Parallel()

// We test this by having two datacenters with one server each. They
// initially come up and complete the migration, then we power them both
// off. We leave the primary off permanently, and then we stand up the
// secondary. Hopefully it should transition to ENABLED instead of being
// stuck in LEGACY.

dir1, s1 := testServerWithConfig(t, func(c *Config) {
c.Datacenter = "dc1"
c.ACLDatacenter = "dc1"
c.ACLsEnabled = true
c.ACLMasterToken = "root"
})
defer os.RemoveAll(dir1)
defer s1.Shutdown()
codec := rpcClient(t, s1)
defer codec.Close()

waitForLeaderEstablishment(t, s1)

dir2, s2 := testServerWithConfig(t, func(c *Config) {
c.Datacenter = "dc2"
c.ACLDatacenter = "dc1"
c.ACLsEnabled = true
c.ACLTokenReplication = false
c.ACLReplicationRate = 100
c.ACLReplicationBurst = 100
c.ACLReplicationApplyLimit = 1000000
})
defer os.RemoveAll(dir2)
defer s2.Shutdown()
codec2 := rpcClient(t, s2)
defer codec2.Close()

s2.tokens.UpdateReplicationToken("root", tokenStore.TokenSourceConfig)

testrpc.WaitForLeader(t, s2.RPC, "dc2")
waitForLeaderEstablishment(t, s2)

// Create the WAN link
joinWAN(t, s2, s1)
waitForLeaderEstablishment(t, s1)
waitForLeaderEstablishment(t, s2)

waitForNewACLs(t, s1)
waitForNewACLs(t, s2)
waitForNewACLReplication(t, s2, structs.ACLReplicatePolicies, 1, 0, 0)

// Everybody has the management policy.
retry.Run(t, func(r *retry.R) {
_, policy1, err := s1.fsm.State().ACLPolicyGetByID(nil, structs.ACLPolicyGlobalManagementID, structs.DefaultEnterpriseMeta())
require.NoError(r, err)
require.NotNil(r, policy1)

_, policy2, err := s2.fsm.State().ACLPolicyGetByID(nil, structs.ACLPolicyGlobalManagementID, structs.DefaultEnterpriseMeta())
require.NoError(r, err)
require.NotNil(r, policy2)
})

// Shutdown s1 and s2.
s1.Shutdown()
s2.Shutdown()

// Restart just s2

dir2new, s2new := testServerWithConfig(t, func(c *Config) {
c.Datacenter = "dc2"
c.ACLDatacenter = "dc1"
c.ACLsEnabled = true
c.ACLTokenReplication = false
c.ACLReplicationRate = 100
c.ACLReplicationBurst = 100
c.ACLReplicationApplyLimit = 1000000

c.DataDir = s2.config.DataDir
c.NodeName = s2.config.NodeName
c.NodeID = s2.config.NodeID
})
defer os.RemoveAll(dir2new)
defer s2new.Shutdown()

waitForLeaderEstablishment(t, s2new)

// It should be able to transition without connectivity to the primary.
waitForNewACLs(t, s2new)
}

func TestLeader_ConfigEntryBootstrap(t *testing.T) {
if testing.Short() {
t.Skip("too slow for testing.Short")
Expand Down

0 comments on commit 19baf4b

Please sign in to comment.