diff --git a/CHANGELOG.md b/CHANGELOG.md index 0bd710d7b1..23f5982de3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,10 @@ IMPROVEMENTS: * CLI * Pre-check in the `install` command to verify the correct license secret exists when using an enterprise Consul image. [[GH-875](https://github.com/hashicorp/consul-k8s/pull/875)] +BUG FIXES: +* Control Plane: + * Add a workaround to check that the ACL token is replicated to other Consul servers. [[GH-862](https://github.com/hashicorp/consul-k8s/issues/862)] + ## 0.37.0 (November 18, 2021) BREAKING CHANGES: diff --git a/control-plane/subcommand/connect-init/command.go b/control-plane/subcommand/connect-init/command.go index e7487fb9e1..18271e6b05 100644 --- a/control-plane/subcommand/connect-init/command.go +++ b/control-plane/subcommand/connect-init/command.go @@ -26,6 +26,9 @@ const ( numLoginRetries = 3 // The number of times to attempt to read this service (120s). defaultServicePollingRetries = 120 + + raftReplicationTimeout = 2 * time.Second + tokenReadPollingInterval = 100 * time.Millisecond ) type Command struct { @@ -155,6 +158,49 @@ func (c *Command) Run(args []string) int { return 1 } c.logger.Info("Consul login complete") + + // A workaround to check that the ACL token is replicated to other Consul servers. + // + // A consul client may reach out to a follower instead of a leader to resolve the token during the + // call to get services below. This is because clients talk to servers in the stale consistency mode + // to decrease the load on the servers (see https://www.consul.io/docs/architecture/consensus#stale). + // In that case, it's possible that the token isn't replicated + // to that server instance yet. The client will then get an "ACL not found" error + // and subsequently cache this not found response. Then our call below + // to get services from the agent will keep hitting the same "ACL not found" error + // until the cache entry expires (determined by the `acl_token_ttl` which defaults to 30 seconds). + // This is not great because it will delay app start up time by 30 seconds in most cases + // (if you are running 3 servers, then the probability of ending up on a follower is close to 2/3). + // + // To help with that, we try to first read the token in the stale consistency mode until we + // get a successful response. This should not take more than 100ms because raft replication + // should in most cases take less than that (see https://www.consul.io/docs/install/performance#read-write-tuning) + // but we set the timeout to 2s to be sure. + // + // Note though that this workaround does not eliminate this problem completely. It's still possible + // for this call and the next call to reach different servers and those servers to have different + // states from each other. + // For example, this call can reach a leader and succeed, while the call below can go to a follower + // that is still behind the leader and get an "ACL not found" error. + // However, this is a pretty unlikely case because + // clients have sticky connections to a server, and those connections get rebalanced only every 2-3min. + // And so, this workaround should work in a vast majority of cases. + c.logger.Info("Checking that the ACL token exists when reading it in the stale consistency mode") + // Use raft timeout and polling interval to determine the number of retries. + numTokenReadRetries := uint64(raftReplicationTimeout.Milliseconds() / tokenReadPollingInterval.Milliseconds()) + err = backoff.Retry(func() error { + _, _, err := consulClient.ACL().TokenReadSelf(&api.QueryOptions{AllowStale: true}) + if err != nil { + c.logger.Error("Unable to read ACL token; retrying", "err", err) + } + return err + }, backoff.WithMaxRetries(backoff.NewConstantBackOff(tokenReadPollingInterval), numTokenReadRetries)) + if err != nil { + c.logger.Error("Unable to read ACL token from a Consul server; "+ + "please check that your server cluster is healthy", "err", err) + return 1 + } + c.logger.Info("Successfully read ACL token from the server") } // Now wait for the service to be registered. Do this by querying the Agent for a service diff --git a/control-plane/subcommand/connect-init/command_test.go b/control-plane/subcommand/connect-init/command_test.go index 704c7356a6..ff847bf0fb 100644 --- a/control-plane/subcommand/connect-init/command_test.go +++ b/control-plane/subcommand/connect-init/command_test.go @@ -592,6 +592,10 @@ func TestRun_FailsWithBadServerResponses(t *testing.T) { if r != nil && r.URL.Path == "/v1/acl/login" && r.Method == "POST" { w.Write([]byte(c.loginResponse)) } + // Token read request. + if r != nil && r.URL.Path == "/v1/acl/token/self" && r.Method == "GET" { + w.Write([]byte(testTokenReadSelfResponse)) + } // Agent Services get. if r != nil && r.URL.Path == "/v1/agent/services" && r.Method == "GET" { servicesGetCounter++ @@ -600,7 +604,7 @@ func TestRun_FailsWithBadServerResponses(t *testing.T) { })) defer consulServer.Close() - // Setup the Command. + // Set up the Command. ui := cli.NewMockUi() cmd := Command{ UI: ui, @@ -663,6 +667,10 @@ func TestRun_LoginWithRetries(t *testing.T) { w.Write([]byte(testLoginResponse)) } } + // Token read request. + if r != nil && r.URL.Path == "/v1/acl/token/self" && r.Method == "GET" { + w.Write([]byte(testTokenReadSelfResponse)) + } // Agent Services get. if r != nil && r.URL.Path == "/v1/agent/services" && r.Method == "GET" { w.Write([]byte(testServiceListResponse)) @@ -702,6 +710,79 @@ func TestRun_LoginWithRetries(t *testing.T) { } } +// Test that we check token exists when reading it in the stale consistency mode. +func TestRun_EnsureTokenExists(t *testing.T) { + t.Parallel() + + cases := map[string]struct { + neverSucceed bool + }{ + "succeed after first retry": {neverSucceed: false}, + "never succeed": {neverSucceed: true}, + } + + for name, c := range cases { + t.Run(name, func(t *testing.T) { + // Create a fake input bearer token file and an output file. + bearerFile := common.WriteTempFile(t, "bearerTokenFile") + tokenFile := common.WriteTempFile(t, "") + proxyFile := common.WriteTempFile(t, "") + + // Start the mock Consul server. + counter := 0 + consulServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + // ACL Login. + if r != nil && r.URL.Path == "/v1/acl/login" && r.Method == "POST" { + w.Write([]byte(testLoginResponse)) + } + // Token read request. + if r != nil && + r.URL.Path == "/v1/acl/token/self" && + r.Method == "GET" && + r.URL.Query().Has("stale") { + + // Fail the first request but succeed on the next. + if counter == 0 || c.neverSucceed { + counter++ + w.WriteHeader(http.StatusForbidden) + w.Write([]byte("ACL not found")) + } else { + w.Write([]byte(testTokenReadSelfResponse)) + } + } + // Agent Services get. + if r != nil && r.URL.Path == "/v1/agent/services" && r.Method == "GET" { + w.Write([]byte(testServiceListResponse)) + } + })) + defer consulServer.Close() + + serverURL, err := url.Parse(consulServer.URL) + require.NoError(t, err) + + ui := cli.NewMockUi() + cmd := Command{ + UI: ui, + tokenSinkFile: tokenFile, + bearerTokenFile: bearerFile, + proxyIDFile: proxyFile, + } + code := cmd.Run([]string{ + "-pod-name", testPodName, + "-pod-namespace", testPodNamespace, + "-acl-auth-method", test.AuthMethod, + "-service-account-name", testServiceAccountName, + "-http-addr", serverURL.String()}) + if c.neverSucceed { + require.Equal(t, 1, code) + } else { + require.Equal(t, 0, code) + require.Equal(t, 1, counter) + } + }) + } +} + const ( metaKeyPodName = "pod-name" metaKeyKubeNS = "k8s-namespace" @@ -710,7 +791,7 @@ const ( testPodName = "counting-pod" testServiceAccountName = "counting" - // sample response from https://consul.io/api-docs/acl#sample-response + // Sample response from https://consul.io/api-docs/acl#sample-response. testLoginResponse = `{ "AccessorID": "926e2bd2-b344-d91b-0c83-ae89f372cd9b", "SecretID": "b78d37c7-0ca7-5f4d-99ee-6d9975ce4586", @@ -734,6 +815,30 @@ const ( "ModifyIndex": 36 }` + // Sample response from https://www.consul.io/api-docs/acl/tokens#read-self-token. + testTokenReadSelfResponse = ` +{ + "AccessorID": "6a1253d2-1785-24fd-91c2-f8e78c745511", + "SecretID": "45a3bd52-07c7-47a4-52fd-0745e0cfe967", + "Description": "Agent token for 'node1'", + "Policies": [ + { + "ID": "165d4317-e379-f732-ce70-86278c4558f7", + "Name": "node1-write" + }, + { + "ID": "e359bd81-baca-903e-7e64-1ccd9fdc78f5", + "Name": "node-read" + } + ], + "Local": false, + "CreateTime": "2018-10-24T12:25:06.921933-04:00", + "Hash": "UuiRkOQPRCvoRZHRtUxxbrmwZ5crYrOdZ0Z1FTFbTbA=", + "CreateIndex": 59, + "ModifyIndex": 59 +} +` + testServiceListResponse = `{ "counting-counting": { "ID": "counting-counting",