From a60e6f6e8f207d7ff24624825612724024df21ba Mon Sep 17 00:00:00 2001 From: Iryna Shustava Date: Tue, 30 Nov 2021 21:47:44 -0700 Subject: [PATCH] Add a workaround to check that the ACL token is replicated to other Consul servers Fixes #862 A consul client may reach out to a follower instead of a leader to resolve the token during the call to get services below. This is because clients talk to servers in the stale consistency mode to decrease the load on the servers (see https://www.consul.io/docs/architecture/consensus#stale). In that case, it's possible that the token isn't replicated to that server instance yet. The client will then get an "ACL not found" error and subsequently cache this not found response. Then our call below to get services from the agent will keep hitting the same "ACL not found" error until the cache entry expires (determined by the `acl_token_ttl` which defaults to 30 seconds). This is not great because it will delay app start up time by 30 seconds in most cases (if you are running 3 servers, then the probability of ending up on a follower is close to 2/3). To help with that, we try to first read the token in the stale consistency mode until we get a successful response. This should not take more than 100ms because raft replication should in most cases take less than that (see https://www.consul.io/docs/install/performance#read-write-tuning) but we set the timeout to 2s to be sure. Note though that this workaround does not eliminate this problem completely. It's still possible for this call and the next call to reach different servers and those servers to have different states from each other. For example, this call can reach a leader and succeed, while the call below can go to a follower that is still behind the leader and get an "ACL not found" error. However, this is a pretty unlikely case because clients have sticky connections to a server, and those connections get rebalanced only every 2-3min. And so, this workaround should work in a vast majority of cases. --- CHANGELOG.md | 4 + .../subcommand/connect-init/command.go | 46 ++++++++ .../subcommand/connect-init/command_test.go | 109 +++++++++++++++++- 3 files changed, 157 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0bd710d7b1..23f5982de3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,10 @@ IMPROVEMENTS: * CLI * Pre-check in the `install` command to verify the correct license secret exists when using an enterprise Consul image. [[GH-875](https://github.com/hashicorp/consul-k8s/pull/875)] +BUG FIXES: +* Control Plane: + * Add a workaround to check that the ACL token is replicated to other Consul servers. [[GH-862](https://github.com/hashicorp/consul-k8s/issues/862)] + ## 0.37.0 (November 18, 2021) BREAKING CHANGES: diff --git a/control-plane/subcommand/connect-init/command.go b/control-plane/subcommand/connect-init/command.go index e7487fb9e1..18271e6b05 100644 --- a/control-plane/subcommand/connect-init/command.go +++ b/control-plane/subcommand/connect-init/command.go @@ -26,6 +26,9 @@ const ( numLoginRetries = 3 // The number of times to attempt to read this service (120s). defaultServicePollingRetries = 120 + + raftReplicationTimeout = 2 * time.Second + tokenReadPollingInterval = 100 * time.Millisecond ) type Command struct { @@ -155,6 +158,49 @@ func (c *Command) Run(args []string) int { return 1 } c.logger.Info("Consul login complete") + + // A workaround to check that the ACL token is replicated to other Consul servers. + // + // A consul client may reach out to a follower instead of a leader to resolve the token during the + // call to get services below. This is because clients talk to servers in the stale consistency mode + // to decrease the load on the servers (see https://www.consul.io/docs/architecture/consensus#stale). + // In that case, it's possible that the token isn't replicated + // to that server instance yet. The client will then get an "ACL not found" error + // and subsequently cache this not found response. Then our call below + // to get services from the agent will keep hitting the same "ACL not found" error + // until the cache entry expires (determined by the `acl_token_ttl` which defaults to 30 seconds). + // This is not great because it will delay app start up time by 30 seconds in most cases + // (if you are running 3 servers, then the probability of ending up on a follower is close to 2/3). + // + // To help with that, we try to first read the token in the stale consistency mode until we + // get a successful response. This should not take more than 100ms because raft replication + // should in most cases take less than that (see https://www.consul.io/docs/install/performance#read-write-tuning) + // but we set the timeout to 2s to be sure. + // + // Note though that this workaround does not eliminate this problem completely. It's still possible + // for this call and the next call to reach different servers and those servers to have different + // states from each other. + // For example, this call can reach a leader and succeed, while the call below can go to a follower + // that is still behind the leader and get an "ACL not found" error. + // However, this is a pretty unlikely case because + // clients have sticky connections to a server, and those connections get rebalanced only every 2-3min. + // And so, this workaround should work in a vast majority of cases. + c.logger.Info("Checking that the ACL token exists when reading it in the stale consistency mode") + // Use raft timeout and polling interval to determine the number of retries. + numTokenReadRetries := uint64(raftReplicationTimeout.Milliseconds() / tokenReadPollingInterval.Milliseconds()) + err = backoff.Retry(func() error { + _, _, err := consulClient.ACL().TokenReadSelf(&api.QueryOptions{AllowStale: true}) + if err != nil { + c.logger.Error("Unable to read ACL token; retrying", "err", err) + } + return err + }, backoff.WithMaxRetries(backoff.NewConstantBackOff(tokenReadPollingInterval), numTokenReadRetries)) + if err != nil { + c.logger.Error("Unable to read ACL token from a Consul server; "+ + "please check that your server cluster is healthy", "err", err) + return 1 + } + c.logger.Info("Successfully read ACL token from the server") } // Now wait for the service to be registered. Do this by querying the Agent for a service diff --git a/control-plane/subcommand/connect-init/command_test.go b/control-plane/subcommand/connect-init/command_test.go index 704c7356a6..ff847bf0fb 100644 --- a/control-plane/subcommand/connect-init/command_test.go +++ b/control-plane/subcommand/connect-init/command_test.go @@ -592,6 +592,10 @@ func TestRun_FailsWithBadServerResponses(t *testing.T) { if r != nil && r.URL.Path == "/v1/acl/login" && r.Method == "POST" { w.Write([]byte(c.loginResponse)) } + // Token read request. + if r != nil && r.URL.Path == "/v1/acl/token/self" && r.Method == "GET" { + w.Write([]byte(testTokenReadSelfResponse)) + } // Agent Services get. if r != nil && r.URL.Path == "/v1/agent/services" && r.Method == "GET" { servicesGetCounter++ @@ -600,7 +604,7 @@ func TestRun_FailsWithBadServerResponses(t *testing.T) { })) defer consulServer.Close() - // Setup the Command. + // Set up the Command. ui := cli.NewMockUi() cmd := Command{ UI: ui, @@ -663,6 +667,10 @@ func TestRun_LoginWithRetries(t *testing.T) { w.Write([]byte(testLoginResponse)) } } + // Token read request. + if r != nil && r.URL.Path == "/v1/acl/token/self" && r.Method == "GET" { + w.Write([]byte(testTokenReadSelfResponse)) + } // Agent Services get. if r != nil && r.URL.Path == "/v1/agent/services" && r.Method == "GET" { w.Write([]byte(testServiceListResponse)) @@ -702,6 +710,79 @@ func TestRun_LoginWithRetries(t *testing.T) { } } +// Test that we check token exists when reading it in the stale consistency mode. +func TestRun_EnsureTokenExists(t *testing.T) { + t.Parallel() + + cases := map[string]struct { + neverSucceed bool + }{ + "succeed after first retry": {neverSucceed: false}, + "never succeed": {neverSucceed: true}, + } + + for name, c := range cases { + t.Run(name, func(t *testing.T) { + // Create a fake input bearer token file and an output file. + bearerFile := common.WriteTempFile(t, "bearerTokenFile") + tokenFile := common.WriteTempFile(t, "") + proxyFile := common.WriteTempFile(t, "") + + // Start the mock Consul server. + counter := 0 + consulServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + // ACL Login. + if r != nil && r.URL.Path == "/v1/acl/login" && r.Method == "POST" { + w.Write([]byte(testLoginResponse)) + } + // Token read request. + if r != nil && + r.URL.Path == "/v1/acl/token/self" && + r.Method == "GET" && + r.URL.Query().Has("stale") { + + // Fail the first request but succeed on the next. + if counter == 0 || c.neverSucceed { + counter++ + w.WriteHeader(http.StatusForbidden) + w.Write([]byte("ACL not found")) + } else { + w.Write([]byte(testTokenReadSelfResponse)) + } + } + // Agent Services get. + if r != nil && r.URL.Path == "/v1/agent/services" && r.Method == "GET" { + w.Write([]byte(testServiceListResponse)) + } + })) + defer consulServer.Close() + + serverURL, err := url.Parse(consulServer.URL) + require.NoError(t, err) + + ui := cli.NewMockUi() + cmd := Command{ + UI: ui, + tokenSinkFile: tokenFile, + bearerTokenFile: bearerFile, + proxyIDFile: proxyFile, + } + code := cmd.Run([]string{ + "-pod-name", testPodName, + "-pod-namespace", testPodNamespace, + "-acl-auth-method", test.AuthMethod, + "-service-account-name", testServiceAccountName, + "-http-addr", serverURL.String()}) + if c.neverSucceed { + require.Equal(t, 1, code) + } else { + require.Equal(t, 0, code) + require.Equal(t, 1, counter) + } + }) + } +} + const ( metaKeyPodName = "pod-name" metaKeyKubeNS = "k8s-namespace" @@ -710,7 +791,7 @@ const ( testPodName = "counting-pod" testServiceAccountName = "counting" - // sample response from https://consul.io/api-docs/acl#sample-response + // Sample response from https://consul.io/api-docs/acl#sample-response. testLoginResponse = `{ "AccessorID": "926e2bd2-b344-d91b-0c83-ae89f372cd9b", "SecretID": "b78d37c7-0ca7-5f4d-99ee-6d9975ce4586", @@ -734,6 +815,30 @@ const ( "ModifyIndex": 36 }` + // Sample response from https://www.consul.io/api-docs/acl/tokens#read-self-token. + testTokenReadSelfResponse = ` +{ + "AccessorID": "6a1253d2-1785-24fd-91c2-f8e78c745511", + "SecretID": "45a3bd52-07c7-47a4-52fd-0745e0cfe967", + "Description": "Agent token for 'node1'", + "Policies": [ + { + "ID": "165d4317-e379-f732-ce70-86278c4558f7", + "Name": "node1-write" + }, + { + "ID": "e359bd81-baca-903e-7e64-1ccd9fdc78f5", + "Name": "node-read" + } + ], + "Local": false, + "CreateTime": "2018-10-24T12:25:06.921933-04:00", + "Hash": "UuiRkOQPRCvoRZHRtUxxbrmwZ5crYrOdZ0Z1FTFbTbA=", + "CreateIndex": 59, + "ModifyIndex": 59 +} +` + testServiceListResponse = `{ "counting-counting": { "ID": "counting-counting",