Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add a workaround to check that the ACL token is replicated to other Consul servers #887

Merged
merged 1 commit into from
Dec 1, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,10 @@ IMPROVEMENTS:
* CLI
* Pre-check in the `install` command to verify the correct license secret exists when using an enterprise Consul image. [[GH-875](https://github.com/hashicorp/consul-k8s/pull/875)]

BUG FIXES:
* Control Plane:
* Add a workaround to check that the ACL token is replicated to other Consul servers. [[GH-862](https://github.com/hashicorp/consul-k8s/issues/862)]

## 0.37.0 (November 18, 2021)

BREAKING CHANGES:
Expand Down
46 changes: 46 additions & 0 deletions control-plane/subcommand/connect-init/command.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,9 @@ const (
numLoginRetries = 3
// The number of times to attempt to read this service (120s).
defaultServicePollingRetries = 120

raftReplicationTimeout = 2 * time.Second
tokenReadPollingInterval = 100 * time.Millisecond
)

type Command struct {
Expand Down Expand Up @@ -155,6 +158,49 @@ func (c *Command) Run(args []string) int {
return 1
}
c.logger.Info("Consul login complete")

// A workaround to check that the ACL token is replicated to other Consul servers.
//
// A consul client may reach out to a follower instead of a leader to resolve the token during the
// call to get services below. This is because clients talk to servers in the stale consistency mode
// to decrease the load on the servers (see https://www.consul.io/docs/architecture/consensus#stale).
// In that case, it's possible that the token isn't replicated
// to that server instance yet. The client will then get an "ACL not found" error
// and subsequently cache this not found response. Then our call below
// to get services from the agent will keep hitting the same "ACL not found" error
// until the cache entry expires (determined by the `acl_token_ttl` which defaults to 30 seconds).
// This is not great because it will delay app start up time by 30 seconds in most cases
// (if you are running 3 servers, then the probability of ending up on a follower is close to 2/3).
//
// To help with that, we try to first read the token in the stale consistency mode until we
// get a successful response. This should not take more than 100ms because raft replication
// should in most cases take less than that (see https://www.consul.io/docs/install/performance#read-write-tuning)
// but we set the timeout to 2s to be sure.
//
// Note though that this workaround does not eliminate this problem completely. It's still possible
// for this call and the next call to reach different servers and those servers to have different
// states from each other.
// For example, this call can reach a leader and succeed, while the call below can go to a follower
// that is still behind the leader and get an "ACL not found" error.
// However, this is a pretty unlikely case because
// clients have sticky connections to a server, and those connections get rebalanced only every 2-3min.
// And so, this workaround should work in a vast majority of cases.
c.logger.Info("Checking that the ACL token exists when reading it in the stale consistency mode")
// Use raft timeout and polling interval to determine the number of retries.
numTokenReadRetries := uint64(raftReplicationTimeout.Milliseconds() / tokenReadPollingInterval.Milliseconds())
err = backoff.Retry(func() error {
_, _, err := consulClient.ACL().TokenReadSelf(&api.QueryOptions{AllowStale: true})
if err != nil {
c.logger.Error("Unable to read ACL token; retrying", "err", err)
}
return err
}, backoff.WithMaxRetries(backoff.NewConstantBackOff(tokenReadPollingInterval), numTokenReadRetries))
if err != nil {
c.logger.Error("Unable to read ACL token from a Consul server; "+
"please check that your server cluster is healthy", "err", err)
return 1
}
c.logger.Info("Successfully read ACL token from the server")
}

// Now wait for the service to be registered. Do this by querying the Agent for a service
Expand Down
109 changes: 107 additions & 2 deletions control-plane/subcommand/connect-init/command_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -592,6 +592,10 @@ func TestRun_FailsWithBadServerResponses(t *testing.T) {
if r != nil && r.URL.Path == "/v1/acl/login" && r.Method == "POST" {
w.Write([]byte(c.loginResponse))
}
// Token read request.
if r != nil && r.URL.Path == "/v1/acl/token/self" && r.Method == "GET" {
w.Write([]byte(testTokenReadSelfResponse))
}
// Agent Services get.
if r != nil && r.URL.Path == "/v1/agent/services" && r.Method == "GET" {
servicesGetCounter++
Expand All @@ -600,7 +604,7 @@ func TestRun_FailsWithBadServerResponses(t *testing.T) {
}))
defer consulServer.Close()

// Setup the Command.
// Set up the Command.
ui := cli.NewMockUi()
cmd := Command{
UI: ui,
Expand Down Expand Up @@ -663,6 +667,10 @@ func TestRun_LoginWithRetries(t *testing.T) {
w.Write([]byte(testLoginResponse))
}
}
// Token read request.
if r != nil && r.URL.Path == "/v1/acl/token/self" && r.Method == "GET" {
w.Write([]byte(testTokenReadSelfResponse))
}
// Agent Services get.
if r != nil && r.URL.Path == "/v1/agent/services" && r.Method == "GET" {
w.Write([]byte(testServiceListResponse))
Expand Down Expand Up @@ -702,6 +710,79 @@ func TestRun_LoginWithRetries(t *testing.T) {
}
}

// Test that we check token exists when reading it in the stale consistency mode.
func TestRun_EnsureTokenExists(t *testing.T) {
t.Parallel()

cases := map[string]struct {
neverSucceed bool
}{
"succeed after first retry": {neverSucceed: false},
"never succeed": {neverSucceed: true},
}

for name, c := range cases {
t.Run(name, func(t *testing.T) {
// Create a fake input bearer token file and an output file.
bearerFile := common.WriteTempFile(t, "bearerTokenFile")
tokenFile := common.WriteTempFile(t, "")
proxyFile := common.WriteTempFile(t, "")

// Start the mock Consul server.
counter := 0
consulServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
// ACL Login.
if r != nil && r.URL.Path == "/v1/acl/login" && r.Method == "POST" {
w.Write([]byte(testLoginResponse))
}
// Token read request.
if r != nil &&
r.URL.Path == "/v1/acl/token/self" &&
r.Method == "GET" &&
r.URL.Query().Has("stale") {

// Fail the first request but succeed on the next.
if counter == 0 || c.neverSucceed {
counter++
w.WriteHeader(http.StatusForbidden)
w.Write([]byte("ACL not found"))
} else {
w.Write([]byte(testTokenReadSelfResponse))
}
}
// Agent Services get.
if r != nil && r.URL.Path == "/v1/agent/services" && r.Method == "GET" {
w.Write([]byte(testServiceListResponse))
}
}))
defer consulServer.Close()

serverURL, err := url.Parse(consulServer.URL)
require.NoError(t, err)

ui := cli.NewMockUi()
cmd := Command{
UI: ui,
tokenSinkFile: tokenFile,
bearerTokenFile: bearerFile,
proxyIDFile: proxyFile,
}
code := cmd.Run([]string{
"-pod-name", testPodName,
"-pod-namespace", testPodNamespace,
"-acl-auth-method", test.AuthMethod,
"-service-account-name", testServiceAccountName,
"-http-addr", serverURL.String()})
if c.neverSucceed {
require.Equal(t, 1, code)
} else {
require.Equal(t, 0, code)
require.Equal(t, 1, counter)
}
})
}
}

const (
metaKeyPodName = "pod-name"
metaKeyKubeNS = "k8s-namespace"
Expand All @@ -710,7 +791,7 @@ const (
testPodName = "counting-pod"
testServiceAccountName = "counting"

// sample response from https://consul.io/api-docs/acl#sample-response
// Sample response from https://consul.io/api-docs/acl#sample-response.
testLoginResponse = `{
"AccessorID": "926e2bd2-b344-d91b-0c83-ae89f372cd9b",
"SecretID": "b78d37c7-0ca7-5f4d-99ee-6d9975ce4586",
Expand All @@ -734,6 +815,30 @@ const (
"ModifyIndex": 36
}`

// Sample response from https://www.consul.io/api-docs/acl/tokens#read-self-token.
testTokenReadSelfResponse = `
{
"AccessorID": "6a1253d2-1785-24fd-91c2-f8e78c745511",
"SecretID": "45a3bd52-07c7-47a4-52fd-0745e0cfe967",
"Description": "Agent token for 'node1'",
"Policies": [
{
"ID": "165d4317-e379-f732-ce70-86278c4558f7",
"Name": "node1-write"
},
{
"ID": "e359bd81-baca-903e-7e64-1ccd9fdc78f5",
"Name": "node-read"
}
],
"Local": false,
"CreateTime": "2018-10-24T12:25:06.921933-04:00",
"Hash": "UuiRkOQPRCvoRZHRtUxxbrmwZ5crYrOdZ0Z1FTFbTbA=",
"CreateIndex": 59,
"ModifyIndex": 59
}
`

testServiceListResponse = `{
"counting-counting": {
"ID": "counting-counting",
Expand Down