From 465b095d353ca0c708f2b56a2bf26bb483ae9e68 Mon Sep 17 00:00:00 2001 From: "Chris S. Kim" Date: Fri, 20 Oct 2023 11:03:27 -0400 Subject: [PATCH] Vault CA bugfixes (#19285) * Re-add retry logic to Vault token renewal * Fix goroutine leak * Add test for detecting goroutine leak * Add changelog * Rename tests * Add comment --- .changelog/19285.txt | 7 +++ agent/connect/ca/provider_vault.go | 29 ++++++++++- agent/connect/ca/provider_vault_test.go | 66 ++++++++++++++++++++++++- 3 files changed, 100 insertions(+), 2 deletions(-) create mode 100644 .changelog/19285.txt diff --git a/.changelog/19285.txt b/.changelog/19285.txt new file mode 100644 index 0000000000000..52320702edb32 --- /dev/null +++ b/.changelog/19285.txt @@ -0,0 +1,7 @@ +```release-note:bug +ca: Fix bug with Vault CA provider where token renewal goroutines could leak if CA failed to initialize. +``` + +```release-note:bug +ca: Fix bug with Vault CA provider where renewing a retracted token would cause retries in a tight loop, degrading performance. +``` diff --git a/agent/connect/ca/provider_vault.go b/agent/connect/ca/provider_vault.go index cafb350f940f2..8f3edd76efdda 100644 --- a/agent/connect/ca/provider_vault.go +++ b/agent/connect/ca/provider_vault.go @@ -20,6 +20,7 @@ import ( "github.com/hashicorp/consul/agent/structs" "github.com/hashicorp/consul/lib" "github.com/hashicorp/consul/lib/decode" + "github.com/hashicorp/consul/lib/retry" ) const ( @@ -175,11 +176,17 @@ func (v *VaultProvider) Configure(cfg ProviderConfig) error { v.stopWatcher() } v.stopWatcher = cancel + // NOTE: Any codepaths after v.renewToken(...) which return an error + // _must_ call v.stopWatcher() to prevent the renewal goroutine from + // leaking when the CA initialization fails and gets retried later. go v.renewToken(ctx, lifetimeWatcher) } // Update the intermediate (managed) PKI mount and role if err := v.setupIntermediatePKIPath(); err != nil { + if v.stopWatcher != nil { + v.stopWatcher() + } return err } @@ -221,6 +228,16 @@ func (v *VaultProvider) renewToken(ctx context.Context, watcher *vaultapi.Lifeti go watcher.Start() defer watcher.Stop() + // These values are chosen to start the exponential backoff + // immediately. Since the Vault client implements its own + // retries, this retry is mostly to avoid resource contention + // and log spam. + retrier := retry.Waiter{ + MinFailures: 1, + MinWait: 1 * time.Second, + Jitter: retry.NewJitter(20), + } + for { select { case <-ctx.Done(): @@ -229,7 +246,16 @@ func (v *VaultProvider) renewToken(ctx context.Context, watcher *vaultapi.Lifeti case err := <-watcher.DoneCh(): // Watcher has stopped if err != nil { - v.logger.Error("Error renewing token for Vault provider", "error", err) + v.logger.Error("Error renewing token for Vault provider", "error", err, "retries", retrier.Failures()) + } + + // Although the vault watcher has its own retry logic, we have encountered + // issues when passing an invalid Vault token which would send an error to + // watcher.DoneCh() immediately, causing us to start the watcher over and + // over again in a very tight loop. + if err := retrier.Wait(ctx); err != nil { + // only possible error is when context is cancelled + return } // If the watcher has exited and auth method is enabled, @@ -263,6 +289,7 @@ func (v *VaultProvider) renewToken(ctx context.Context, watcher *vaultapi.Lifeti go watcher.Start() case <-watcher.RenewCh(): + retrier.Reset() v.logger.Info("Successfully renewed token for Vault provider") } } diff --git a/agent/connect/ca/provider_vault_test.go b/agent/connect/ca/provider_vault_test.go index 67cefdcac1163..ccb3a0da17647 100644 --- a/agent/connect/ca/provider_vault_test.go +++ b/agent/connect/ca/provider_vault_test.go @@ -5,6 +5,9 @@ import ( "encoding/json" "fmt" "io" + "runtime/pprof" + "strconv" + "strings" "sync/atomic" "testing" "time" @@ -220,8 +223,69 @@ func TestVaultCAProvider_Configure(t *testing.T) { testcase.expectedValue(t, provider) }) } +} + +// This test must not run in parallel +func TestVaultCAProvider_ConfigureFailureGoroutineLeakCheck(t *testing.T) { + if testing.Short() { + t.Skip("too slow for testing.Short") + } + SkipIfVaultNotPresent(t) + + testVault := NewTestVaultServer(t) + + attr := &VaultTokenAttributes{ + RootPath: "pki-root", + IntermediatePath: "pki-intermediate", + ConsulManaged: true, + } + token := CreateVaultTokenWithAttrs(t, testVault.client, attr) + + provider := NewVaultProvider(hclog.New(&hclog.LoggerOptions{Name: "ca.vault"})) + + t.Run("error on Configure does not leak renewal routine", func(t *testing.T) { + config := map[string]any{ + "RootPKIPath": "pki-root/", + "IntermediatePKIPath": "badbadbad/", + } + cfg := vaultProviderConfig(t, testVault.Addr, token, config) + + err := provider.Configure(cfg) + require.Error(t, err) + + retry.RunWith(retry.TwoSeconds(), t, func(r *retry.R) { + profile := pprof.Lookup("goroutine") + sb := strings.Builder{} + require.NoError(r, profile.WriteTo(&sb, 2)) + require.NotContains(r, sb.String(), + "created by github.com/hashicorp/consul/agent/connect/ca.(*VaultProvider).Configure", + "found renewal goroutine leak") + // If this test is failing because you added a new goroutine to + // (*VaultProvider).Configure AND that goroutine should persist + // even if Configure errored, then you should change the checked + // string to (*VaultProvider).renewToken. + }) + }) - return + t.Run("successful Configure starts renewal routine", func(t *testing.T) { + config := map[string]any{ + "RootPKIPath": "pki-root/", + "IntermediatePKIPath": "pki-intermediate/", + } + cfg := vaultProviderConfig(t, testVault.Addr, token, config) + + require.NoError(t, provider.Configure(cfg)) + + retry.RunWith(retry.TwoSeconds(), t, func(r *retry.R) { + profile := pprof.Lookup("goroutine") + sb := strings.Builder{} + require.NoError(r, profile.WriteTo(&sb, 2)) + t.Log(sb.String()) + require.Contains(r, sb.String(), + "created by github.com/hashicorp/consul/agent/connect/ca.(*VaultProvider).Configure", + "expected renewal goroutine, got none") + }) + }) } func TestVaultCAProvider_SecondaryActiveIntermediate(t *testing.T) {