Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Backport of Vault CA bugfixes into release/1.17.x #19309

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions .changelog/19285.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
```release-note:bug
ca: Fix bug with Vault CA provider where token renewal goroutines could leak if CA failed to initialize.
```

```release-note:bug
ca: Fix bug with Vault CA provider where renewing a retracted token would cause retries in a tight loop, degrading performance.
```
29 changes: 28 additions & 1 deletion agent/connect/ca/provider_vault.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ import (
"github.com/hashicorp/consul/agent/structs"
"github.com/hashicorp/consul/lib"
"github.com/hashicorp/consul/lib/decode"
"github.com/hashicorp/consul/lib/retry"
)

const (
Expand Down Expand Up @@ -177,11 +178,17 @@ func (v *VaultProvider) Configure(cfg ProviderConfig) error {
v.stopWatcher()
}
v.stopWatcher = cancel
// NOTE: Any codepaths after v.renewToken(...) which return an error
// _must_ call v.stopWatcher() to prevent the renewal goroutine from
// leaking when the CA initialization fails and gets retried later.
go v.renewToken(ctx, lifetimeWatcher)
}

// Update the intermediate (managed) PKI mount and role
if err := v.setupIntermediatePKIPath(); err != nil {
if v.stopWatcher != nil {
v.stopWatcher()
}
return err
}

Expand Down Expand Up @@ -223,6 +230,16 @@ func (v *VaultProvider) renewToken(ctx context.Context, watcher *vaultapi.Lifeti
go watcher.Start()
defer watcher.Stop()

// These values are chosen to start the exponential backoff
// immediately. Since the Vault client implements its own
// retries, this retry is mostly to avoid resource contention
// and log spam.
retrier := retry.Waiter{
MinFailures: 1,
MinWait: 1 * time.Second,
Jitter: retry.NewJitter(20),
}

for {
select {
case <-ctx.Done():
Expand All @@ -231,7 +248,16 @@ func (v *VaultProvider) renewToken(ctx context.Context, watcher *vaultapi.Lifeti
case err := <-watcher.DoneCh():
// Watcher has stopped
if err != nil {
v.logger.Error("Error renewing token for Vault provider", "error", err)
v.logger.Error("Error renewing token for Vault provider", "error", err, "retries", retrier.Failures())
}

// Although the vault watcher has its own retry logic, we have encountered
// issues when passing an invalid Vault token which would send an error to
// watcher.DoneCh() immediately, causing us to start the watcher over and
// over again in a very tight loop.
if err := retrier.Wait(ctx); err != nil {
// only possible error is when context is cancelled
return
}

// If the watcher has exited and auth method is enabled,
Expand Down Expand Up @@ -265,6 +291,7 @@ func (v *VaultProvider) renewToken(ctx context.Context, watcher *vaultapi.Lifeti
go watcher.Start()

case <-watcher.RenewCh():
retrier.Reset()
v.logger.Info("Successfully renewed token for Vault provider")
}
}
Expand Down
64 changes: 63 additions & 1 deletion agent/connect/ca/provider_vault_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import (
"encoding/json"
"fmt"
"io"
"runtime/pprof"
"strconv"
"strings"
"sync/atomic"
Expand Down Expand Up @@ -237,8 +238,69 @@ func TestVaultCAProvider_Configure(t *testing.T) {
testcase.expectedValue(t, provider)
})
}
}

// This test must not run in parallel
func TestVaultCAProvider_ConfigureFailureGoroutineLeakCheck(t *testing.T) {
	// The assertions below inspect the process-wide goroutine profile, so a
	// concurrently running test that configures a Vault provider could make
	// them flaky. The two subtests also share one provider and run in order:
	// first a failing Configure, then a successful one.
	if testing.Short() {
		t.Skip("too slow for testing.Short")
	}
	SkipIfVaultNotPresent(t)

	testVault := NewTestVaultServer(t)

	attr := &VaultTokenAttributes{
		RootPath:         "pki-root",
		IntermediatePath: "pki-intermediate",
		ConsulManaged:    true,
	}
	token := CreateVaultTokenWithAttrs(t, testVault.client, attr)

	provider := NewVaultProvider(hclog.New(&hclog.LoggerOptions{Name: "ca.vault"}))

	// Negative case: Configure is given an intermediate PKI path that does not
	// match the token attributes above and is expected to fail. After the
	// failure no goroutine created by Configure may remain in the profile.
	t.Run("error on Configure does not leak renewal routine", func(t *testing.T) {
		config := map[string]any{
			"RootPKIPath":         "pki-root/",
			"IntermediatePKIPath": "badbadbad/",
		}
		cfg := vaultProviderConfig(t, testVault.Addr, token, config)

		err := provider.Configure(cfg)
		require.Error(t, err)

		// Retried because the renewal goroutine may need a moment to observe
		// the cancellation and exit.
		retry.RunWith(retry.TwoSeconds(), t, func(r *retry.R) {
			profile := pprof.Lookup("goroutine")
			sb := strings.Builder{}
			// debug=2 writes full goroutine stacks, including the
			// "created by ..." line matched below.
			require.NoError(r, profile.WriteTo(&sb, 2))
			require.NotContains(r, sb.String(),
				"created by github.com/hashicorp/consul/agent/connect/ca.(*VaultProvider).Configure",
				"found renewal goroutine leak")
			// If this test is failing because you added a new goroutine to
			// (*VaultProvider).Configure AND that goroutine should persist
			// even if Configure errored, then you should change the checked
			// string to (*VaultProvider).renewToken.
		})
	})

	// Positive control: with a valid configuration the renewal goroutine must
	// show up in the profile. This proves that the string checked in the
	// negative case can actually detect a renewal goroutine. A stray early
	// `return` previously made this subtest unreachable.
	t.Run("successful Configure starts renewal routine", func(t *testing.T) {
		config := map[string]any{
			"RootPKIPath":         "pki-root/",
			"IntermediatePKIPath": "pki-intermediate/",
		}
		cfg := vaultProviderConfig(t, testVault.Addr, token, config)

		require.NoError(t, provider.Configure(cfg))

		retry.RunWith(retry.TwoSeconds(), t, func(r *retry.R) {
			profile := pprof.Lookup("goroutine")
			sb := strings.Builder{}
			require.NoError(r, profile.WriteTo(&sb, 2))
			t.Log(sb.String())
			require.Contains(r, sb.String(),
				"created by github.com/hashicorp/consul/agent/connect/ca.(*VaultProvider).Configure",
				"expected renewal goroutine, got none")
		})
	})
}

func TestVaultCAProvider_SecondaryActiveIntermediate(t *testing.T) {
Expand Down
Loading