From d146e9491f878407883b269152b7c1aeb00ccee3 Mon Sep 17 00:00:00 2001
From: "R.B. Boyer"
Date: Mon, 4 Jan 2021 17:24:27 -0600
Subject: [PATCH 1/3] acl: use the presence of a management policy in the
 state store as a sign that we already migrated to v2 acls

This way we only have to wait for the serf barrier to pass once before we
can upgrade to v2 acls. Without this patch, every restart needs to
re-compute the change, and if a stray older node joins after a migration,
the datacenter might regress back to v1 mode, which would be problematic.
---
 agent/consul/acl_server.go  | 10 ++++
 agent/consul/leader_test.go | 94 +++++++++++++++++++++++++++++++++++++
 2 files changed, 104 insertions(+)

diff --git a/agent/consul/acl_server.go b/agent/consul/acl_server.go
index a495a312b520..56627a4874f5 100644
--- a/agent/consul/acl_server.go
+++ b/agent/consul/acl_server.go
@@ -108,6 +108,16 @@ func (s *Server) canUpgradeToNewACLs(isLeader bool) bool {
 		return false
 	}
 
+	// Check to see if we already upgraded the last time we ran by seeing if we
+	// have a copy of any global management policy stored locally. This should
+	// always be true because policies always replicate.
+	_, mgmtPolicy, err := s.fsm.State().ACLPolicyGetByID(nil, structs.ACLPolicyGlobalManagementID, structs.DefaultEnterpriseMeta())
+	if err != nil {
+		s.logger.Warn("Failed to get the builtin global-management policy to check for a completed ACL upgrade; skipping this optimization", "error", err)
+	} else if mgmtPolicy != nil {
+		return true
+	}
+
 	if !s.InACLDatacenter() {
 		foundServers, mode, _ := ServersGetACLMode(s, "", s.config.ACLDatacenter)
 		if mode != structs.ACLModeEnabled || !foundServers {
diff --git a/agent/consul/leader_test.go b/agent/consul/leader_test.go
index dfe9ffe39590..36cf8c7f15e0 100644
--- a/agent/consul/leader_test.go
+++ b/agent/consul/leader_test.go
@@ -10,6 +10,7 @@ import (
 	"time"
 
 	"github.com/hashicorp/consul/agent/structs"
+	tokenStore "github.com/hashicorp/consul/agent/token"
 	"github.com/hashicorp/consul/api"
 	"github.com/hashicorp/consul/sdk/testutil"
 	"github.com/hashicorp/consul/sdk/testutil/retry"
@@ -1272,6 +1273,99 @@ func TestLeader_ACLUpgrade(t *testing.T) {
 	})
 }
 
+func TestLeader_ACLUpgrade_IsStickyEvenIfSerfTagsRegress(t *testing.T) {
+	if testing.Short() {
+		t.Skip("too slow for testing.Short")
+	}
+
+	// t.Parallel()
+
+	// We test this by having two datacenters with one server each. They
+	// initially come up and complete the migration, then we power them both
+	// off. We leave the primary off permanently, and then we stand up the
+	// secondary. It should transition to ENABLED instead of being stuck in
+	// LEGACY.
+
+	dir1, s1 := testServerWithConfig(t, func(c *Config) {
+		c.Datacenter = "dc1"
+		c.ACLDatacenter = "dc1"
+		c.ACLsEnabled = true
+		c.ACLMasterToken = "root"
+	})
+	defer os.RemoveAll(dir1)
+	defer s1.Shutdown()
+	codec := rpcClient(t, s1)
+	defer codec.Close()
+
+	waitForLeaderEstablishment(t, s1)
+
+	dir2, s2 := testServerWithConfig(t, func(c *Config) {
+		c.Datacenter = "dc2"
+		c.ACLDatacenter = "dc1"
+		c.ACLsEnabled = true
+		c.ACLTokenReplication = false
+		c.ACLReplicationRate = 100
+		c.ACLReplicationBurst = 100
+		c.ACLReplicationApplyLimit = 1000000
+	})
+	defer os.RemoveAll(dir2)
+	defer s2.Shutdown()
+	codec2 := rpcClient(t, s2)
+	defer codec2.Close()
+
+	s2.tokens.UpdateReplicationToken("root", tokenStore.TokenSourceConfig)
+
+	testrpc.WaitForLeader(t, s2.RPC, "dc2")
+	waitForLeaderEstablishment(t, s2)
+
+	// Create the WAN link.
+	joinWAN(t, s2, s1)
+	waitForLeaderEstablishment(t, s1)
+	waitForLeaderEstablishment(t, s2)
+
+	waitForNewACLs(t, s1)
+	waitForNewACLs(t, s2)
+	waitForNewACLReplication(t, s2, structs.ACLReplicatePolicies, 1, 0, 0)
+
+	// Everybody has the management policy.
+	retry.Run(t, func(r *retry.R) {
+		_, policy1, err := s1.fsm.State().ACLPolicyGetByID(nil, structs.ACLPolicyGlobalManagementID, structs.DefaultEnterpriseMeta())
+		require.NoError(r, err)
+		require.NotNil(r, policy1)
+
+		_, policy2, err := s2.fsm.State().ACLPolicyGetByID(nil, structs.ACLPolicyGlobalManagementID, structs.DefaultEnterpriseMeta())
+		require.NoError(r, err)
+		require.NotNil(r, policy2)
+	})
+
+	// Shut down s1 and s2.
+	s1.Shutdown()
+	s2.Shutdown()
+
+	// Restart just s2.
+
+	dir2new, s2new := testServerWithConfig(t, func(c *Config) {
+		c.Datacenter = "dc2"
+		c.ACLDatacenter = "dc1"
+		c.ACLsEnabled = true
+		c.ACLTokenReplication = false
+		c.ACLReplicationRate = 100
+		c.ACLReplicationBurst = 100
+		c.ACLReplicationApplyLimit = 1000000
+
+		c.DataDir = s2.config.DataDir
+		c.NodeName = s2.config.NodeName
+		c.NodeID = s2.config.NodeID
+	})
+	defer os.RemoveAll(dir2new)
+	defer s2new.Shutdown()
+
+	waitForLeaderEstablishment(t, s2new)
+
+	// It should be able to transition without connectivity to the primary.
+	waitForNewACLs(t, s2new)
+}
+
 func TestLeader_ConfigEntryBootstrap(t *testing.T) {
 	if testing.Short() {
 		t.Skip("too slow for testing.Short")

From ab5b412c3d414d82da87427a5e745ca273051bdf Mon Sep 17 00:00:00 2001
From: "R.B. Boyer"
Date: Tue, 5 Jan 2021 16:41:43 -0600
Subject: [PATCH 2/3] changelog

---
 .changelog/9505.txt | 3 +++
 1 file changed, 3 insertions(+)
 create mode 100644 .changelog/9505.txt

diff --git a/.changelog/9505.txt b/.changelog/9505.txt
new file mode 100644
index 000000000000..51320af7bd1c
--- /dev/null
+++ b/.changelog/9505.txt
@@ -0,0 +1,3 @@
+```release-note:improvement
+acl: use the presence of a management policy in the state store as a sign that we already migrated to v2 acls
+```

From e94b0cea5140b85e257ab7d3b0bcafc62d0de293 Mon Sep 17 00:00:00 2001
From: "R.B. Boyer"
Date: Tue, 5 Jan 2021 16:43:08 -0600
Subject: [PATCH 3/3] remove comment

---
 agent/consul/leader_test.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/agent/consul/leader_test.go b/agent/consul/leader_test.go
index 36cf8c7f15e0..08466c840ffc 100644
--- a/agent/consul/leader_test.go
+++ b/agent/consul/leader_test.go
@@ -1278,7 +1278,7 @@ func TestLeader_ACLUpgrade_IsStickyEvenIfSerfTagsRegress(t *testing.T) {
 		t.Skip("too slow for testing.Short")
 	}
 
-	// t.Parallel()
+	t.Parallel()
 
 	// We test this by having two datacenters with one server each. They
 	// initially come up and complete the migration, then we power them both
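
For readers skimming the series: the functional change in patch 1 boils down to one early-exit check in `canUpgradeToNewACLs`. Below is a minimal standalone sketch of that pattern, assuming hypothetical `StateStore` and `ACLPolicy` stand-ins for Consul's real state store and `structs.ACLPolicy` types; the policy ID value is likewise an assumption for illustration.

```go
package acl

import "fmt"

// ACLPolicy and StateStore are hypothetical stand-ins for the types the
// patch actually uses via s.fsm.State().ACLPolicyGetByID.
type ACLPolicy struct{ ID string }

type StateStore interface {
	// ACLPolicyGetByID returns the policy if it is stored locally,
	// nil if it is absent, or an error if the lookup itself failed.
	ACLPolicyGetByID(id string) (*ACLPolicy, error)
}

// Assumed value of the builtin global-management policy ID.
const aclPolicyGlobalManagementID = "00000000-0000-0000-0000-000000000001"

// alreadyUpgradedToV2ACLs mirrors the optimization in the patch: ACL
// policies only exist after the v2 migration and always replicate, so
// finding the builtin global-management policy locally proves a previous
// run completed the upgrade, letting the server skip the serf barrier wait.
func alreadyUpgradedToV2ACLs(state StateStore) bool {
	policy, err := state.ACLPolicyGetByID(aclPolicyGlobalManagementID)
	if err != nil {
		// Like the patch, treat a read error as "skip the optimization"
		// and fall back to the slower serf-tag inspection.
		fmt.Println("warn: failed to read global-management policy:", err)
		return false
	}
	return policy != nil
}
```

Note the failure mode: an error reading the policy only logs a warning and falls through to the existing checks, so the optimization can delay the early exit but never makes the upgrade decision less safe.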