acl: use the presence of a management policy in the state store as a sign that we already migrated to v2 acls #9505

Merged · 3 commits · Jan 5, 2021
3 changes: 3 additions & 0 deletions .changelog/9505.txt
@@ -0,0 +1,3 @@
```release-note:improvement
acl: use the presence of a management policy in the state store as a sign that we already migrated to v2 acls
```
10 changes: 10 additions & 0 deletions agent/consul/acl_server.go
@@ -108,6 +108,16 @@ func (s *Server) canUpgradeToNewACLs(isLeader bool) bool {
		return false
	}

	// Check to see if we already upgraded the last time we ran by seeing if we
	// have a copy of any global management policy stored locally. This should
	// always be true because policies always replicate.
	_, mgmtPolicy, err := s.fsm.State().ACLPolicyGetByID(nil, structs.ACLPolicyGlobalManagementID, structs.DefaultEnterpriseMeta())
	if err != nil {
		s.logger.Warn("Failed to get the builtin global-management policy to check for a completed ACL upgrade; skipping this optimization", "error", err)
	} else if mgmtPolicy != nil {
		return true
	}

	if !s.InACLDatacenter() {
		foundServers, mode, _ := ServersGetACLMode(s, "", s.config.ACLDatacenter)
		if mode != structs.ACLModeEnabled || !foundServers {
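To make the new control flow concrete, here is a minimal, self-contained sketch of the pattern this hunk introduces. Everything in it (`fakeStateStore`, `policyExists`, `canUpgrade`, the constant's value) is invented for illustration and is not Consul's actual API; only the overall shape, checking a durable local marker before falling back to cluster-wide discovery, comes from the diff above.

```go
// Illustrative code only, not Consul's implementation.
package main

import "fmt"

// fakeStateStore stands in for Consul's FSM-backed state store; the type and
// its method are hypothetical.
type fakeStateStore struct {
	policies map[string]bool
}

// policyExists reports whether a policy with the given ID is present locally.
func (s *fakeStateStore) policyExists(id string) bool {
	return s.policies[id]
}

// globalManagementID stands in for structs.ACLPolicyGlobalManagementID.
const globalManagementID = "00000000-0000-0000-0000-000000000001"

// canUpgrade follows the shape of canUpgradeToNewACLs after this change:
// consult the durable local marker first, and only fall back to cluster-wide
// discovery (which can regress after a restart) when the marker is absent.
func canUpgrade(store *fakeStateStore, clusterSaysEnabled bool) bool {
	if store.policyExists(globalManagementID) {
		// The global-management policy is only written once the v2 ACL
		// upgrade has completed, so its presence makes the upgrade sticky.
		return true
	}
	return clusterSaysEnabled
}

func main() {
	store := &fakeStateStore{policies: map[string]bool{globalManagementID: true}}

	// Even if gossip currently claims the cluster is in legacy mode (for
	// example, because the primary is offline), the local marker wins.
	fmt.Println(canUpgrade(store, false)) // prints: true
}
```

The test added below exercises exactly this property: the secondary server restarts with no connectivity to the primary, and the persisted policy lets it re-enter ENABLED on its own.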
94 changes: 94 additions & 0 deletions agent/consul/leader_test.go
@@ -10,6 +10,7 @@ import (
"time"

"github.com/hashicorp/consul/agent/structs"
tokenStore "github.com/hashicorp/consul/agent/token"
"github.com/hashicorp/consul/api"
"github.com/hashicorp/consul/sdk/testutil"
"github.com/hashicorp/consul/sdk/testutil/retry"
@@ -1272,6 +1273,99 @@ func TestLeader_ACLUpgrade(t *testing.T) {
	})
}

func TestLeader_ACLUpgrade_IsStickyEvenIfSerfTagsRegress(t *testing.T) {
	if testing.Short() {
		t.Skip("too slow for testing.Short")
	}

	t.Parallel()

	// We test this by having two datacenters with one server each. They
	// initially come up and complete the migration, then we power them both
	// off. We leave the primary off permanently, and then we stand up the
	// secondary. It should transition to ENABLED instead of being stuck in
	// LEGACY.

> **Review comment (Member Author):** This scenario is actually the thing I was running into while trying to fix #9342. Without this fix, the patches on the other PR to prevent dialing the primary's gateway when the local datacenter is known to have a gateway were blocked waiting for the mesh gateway Envoy to start in the secondary. That in turn was blocked on the "I can't handle non-legacy tokens yet until I talk to the primary" circular dependency.

	dir1, s1 := testServerWithConfig(t, func(c *Config) {
		c.Datacenter = "dc1"
		c.ACLDatacenter = "dc1"
		c.ACLsEnabled = true
		c.ACLMasterToken = "root"
	})
	defer os.RemoveAll(dir1)
	defer s1.Shutdown()
	codec := rpcClient(t, s1)
	defer codec.Close()

	waitForLeaderEstablishment(t, s1)

	dir2, s2 := testServerWithConfig(t, func(c *Config) {
		c.Datacenter = "dc2"
		c.ACLDatacenter = "dc1"
		c.ACLsEnabled = true
		c.ACLTokenReplication = false
		c.ACLReplicationRate = 100
		c.ACLReplicationBurst = 100
		c.ACLReplicationApplyLimit = 1000000
	})
	defer os.RemoveAll(dir2)
	defer s2.Shutdown()
	codec2 := rpcClient(t, s2)
	defer codec2.Close()

	s2.tokens.UpdateReplicationToken("root", tokenStore.TokenSourceConfig)

	testrpc.WaitForLeader(t, s2.RPC, "dc2")
	waitForLeaderEstablishment(t, s2)

	// Create the WAN link
	joinWAN(t, s2, s1)
	waitForLeaderEstablishment(t, s1)
	waitForLeaderEstablishment(t, s2)

	waitForNewACLs(t, s1)
	waitForNewACLs(t, s2)
	waitForNewACLReplication(t, s2, structs.ACLReplicatePolicies, 1, 0, 0)

	// Everybody has the management policy.
	retry.Run(t, func(r *retry.R) {
		_, policy1, err := s1.fsm.State().ACLPolicyGetByID(nil, structs.ACLPolicyGlobalManagementID, structs.DefaultEnterpriseMeta())
		require.NoError(r, err)
		require.NotNil(r, policy1)

		_, policy2, err := s2.fsm.State().ACLPolicyGetByID(nil, structs.ACLPolicyGlobalManagementID, structs.DefaultEnterpriseMeta())
		require.NoError(r, err)
		require.NotNil(r, policy2)
	})

	// Shutdown s1 and s2.
	s1.Shutdown()
	s2.Shutdown()

	// Restart just s2, reusing its data directory and node identity.
	dir2new, s2new := testServerWithConfig(t, func(c *Config) {
		c.Datacenter = "dc2"
		c.ACLDatacenter = "dc1"
		c.ACLsEnabled = true
		c.ACLTokenReplication = false
		c.ACLReplicationRate = 100
		c.ACLReplicationBurst = 100
		c.ACLReplicationApplyLimit = 1000000

		c.DataDir = s2.config.DataDir
		c.NodeName = s2.config.NodeName
		c.NodeID = s2.config.NodeID
	})
	defer os.RemoveAll(dir2new)
	defer s2new.Shutdown()

	waitForLeaderEstablishment(t, s2new)

	// It should be able to transition without connectivity to the primary.
	waitForNewACLs(t, s2new)
}
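To run just this test locally from the repository root, something like `go test -run TestLeader_ACLUpgrade_IsStickyEvenIfSerfTagsRegress ./agent/consul/` should work; note that it skips itself under `-short`, and since it stands up two full servers it takes a while.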

func TestLeader_ConfigEntryBootstrap(t *testing.T) {
if testing.Short() {
t.Skip("too slow for testing.Short")
Expand Down