Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix bug in leaf-cert cache type where multiple client tokens collide #4736

Merged
merged 2 commits into from
Oct 3, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 34 additions & 3 deletions agent/cache-types/connect_ca_leaf.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package cachetype

import (
"crypto/sha256"
"errors"
"fmt"
"sync"
Expand All @@ -27,6 +28,19 @@ type ConnectCALeaf struct {
Cache *cache.Cache // Cache that has CA root certs via ConnectCARoot
}

// issuedKey returns the issuedCerts cache key for a given service and token. We
// use a hash rather than concatenating strings to provide resilience against
// user input containing our separator - both service name and token ID can be
// freely manipulated by user so may contain any delimiter we choose. It also
// has the benefit of not leaking the ACL token to a new place in memory it
// might get accidentally dumped etc.
func issuedKey(service, token string) string {
hash := sha256.New()
hash.Write([]byte(service))
hash.Write([]byte(token))
return fmt.Sprintf("%x", hash.Sum(nil))
}

func (c *ConnectCALeaf) Fetch(opts cache.FetchOptions, req cache.Request) (cache.FetchResult, error) {
var result cache.FetchResult

Expand All @@ -48,11 +62,15 @@ func (c *ConnectCALeaf) Fetch(opts cache.FetchOptions, req cache.Request) (cache
newRootCACh := make(chan error, 1)
go c.waitNewRootCA(reqReal.Datacenter, newRootCACh, opts.Timeout)

// Generate a cache key to lookup/store the cert. We MUST generate a new cert
// per token used to ensure revocation by ACL token is robust.
issuedKey := issuedKey(reqReal.Service, reqReal.Token)

// Get our prior cert (if we had one) and use that to determine our
// expiration time. If no cert exists, we expire immediately since we
// need to generate.
c.issuedCertsLock.RLock()
lastCert := c.issuedCerts[reqReal.Service]
lastCert := c.issuedCerts[issuedKey]
c.issuedCertsLock.RUnlock()

var leafExpiryCh <-chan time.Time
Expand All @@ -62,6 +80,19 @@ func (c *ConnectCALeaf) Fetch(opts cache.FetchOptions, req cache.Request) (cache
if expiryDur := lastCert.ValidBefore.Sub(time.Now()); expiryDur > 0 {
leafExpiryCh = time.After(expiryDur - 1*time.Hour)
// TODO(mitchellh): 1 hour buffer is hardcoded above

// We should not depend on the cache package de-duplicating requests for
// the same service/token (which is all we care about keying our local
// issued cert cache on) since it might later make sense to partition
// clients for other reasons too. So if the request has a 0 MinIndex, and
// the cached cert is still valid, then the client is expecting an
// immediate response and hasn't already seen the cached cert, return it
// now.
if opts.MinIndex == 0 {
result.Value = lastCert
result.Index = lastCert.ModifyIndex
return result, nil
}
}
}

Expand Down Expand Up @@ -149,13 +180,13 @@ func (c *ConnectCALeaf) Fetch(opts cache.FetchOptions, req cache.Request) (cache
// check just in case.
c.issuedCertsLock.Lock()
defer c.issuedCertsLock.Unlock()
lastCert = c.issuedCerts[reqReal.Service]
lastCert = c.issuedCerts[issuedKey]
if lastCert == nil || lastCert.ModifyIndex < reply.ModifyIndex {
if c.issuedCerts == nil {
c.issuedCerts = make(map[string]*structs.IssuedCert)
}

c.issuedCerts[reqReal.Service] = &reply
c.issuedCerts[issuedKey] = &reply
lastCert = &reply
}

Expand Down
184 changes: 184 additions & 0 deletions agent/cache-types/connect_ca_leaf_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ func TestConnectCALeaf_changingRoots(t *testing.T) {
}

// Second fetch should block with set index
opts.MinIndex = 1
fetchCh = TestFetchCh(t, typ, opts, req)
select {
case result := <-fetchCh:
Expand Down Expand Up @@ -156,6 +157,7 @@ func TestConnectCALeaf_expiringLeaf(t *testing.T) {

// Third fetch should block since the cert is not expiring and
// we also didn't update CA certs.
opts.MinIndex = 2
fetchCh = TestFetchCh(t, typ, opts, req)
select {
case result := <-fetchCh:
Expand All @@ -164,6 +166,188 @@ func TestConnectCALeaf_expiringLeaf(t *testing.T) {
}
}

// Test that once one client (e.g. the proxycfg.Manager) has fetched a cert,
// that subsequent clients get it returned immediately and don't block until it
// expires or their request times out. Note that typically FEtches at this level
// are de-duped by the cache higher up, but if the two clients are using
// different ACL tokens for example (common) that may not be the case, and we
// should wtill deliver correct blocking semantics to both.
//
// Additionally, we want to make sure that clients with different tokens
// generate distinct certs since we might later want to revoke all certs fetched
// with a given token but can't if a client using that token was served a cert
// generated under a different token (say the agent token).
func TestConnectCALeaf_multipleClientsDifferentTokens(t *testing.T) {
t.Parallel()

require := require.New(t)
rpc := TestRPC(t)
defer rpc.AssertExpectations(t)

typ, rootsCh := testCALeafType(t, rpc)
defer close(rootsCh)
rootsCh <- structs.IndexedCARoots{
ActiveRootID: "1",
TrustDomain: "fake-trust-domain.consul",
QueryMeta: structs.QueryMeta{Index: 1},
}

// Instrument ConnectCA.Sign to
var resp *structs.IssuedCert
var idx uint64
rpc.On("RPC", "ConnectCA.Sign", mock.Anything, mock.Anything).Return(nil).
Run(func(args mock.Arguments) {
reply := args.Get(2).(*structs.IssuedCert)
reply.CreateIndex = atomic.AddUint64(&idx, 1)
reply.ModifyIndex = reply.CreateIndex
reply.ValidBefore = time.Now().Add(12 * time.Hour)
resp = reply
})

// We'll reuse the fetch options and request
opts := cache.FetchOptions{MinIndex: 0, Timeout: 10 * time.Minute}
reqA := &ConnectCALeafRequest{Datacenter: "dc1", Service: "web", Token: "A-token"}
reqB := &ConnectCALeafRequest{Datacenter: "dc1", Service: "web", Token: "B-token"}

// First fetch (Client A, MinIndex = 0) should return immediately
fetchCh := TestFetchCh(t, typ, opts, reqA)
var certA *structs.IssuedCert
select {
case <-time.After(100 * time.Millisecond):
t.Fatal("shouldn't block waiting for fetch")
case result := <-fetchCh:
require.Equal(cache.FetchResult{
Value: resp,
Index: 1,
}, result)
certA = result.(cache.FetchResult).Value.(*structs.IssuedCert)
}

// Second fetch (Client B, MinIndex = 0) should return immediately
fetchCh = TestFetchCh(t, typ, opts, reqB)
select {
case <-time.After(100 * time.Millisecond):
t.Fatal("shouldn't block waiting for fetch")
case result := <-fetchCh:
require.Equal(cache.FetchResult{
Value: resp,
Index: 2,
}, result)
// Different tokens should result in different certs. Note that we don't
// actually generate and sign real certs in this test with our mock RPC but
// this is enough to be sure we actually generated a different Private Key
// for each one and aren't just differnt due to index values.
require.NotEqual(certA.PrivateKeyPEM,
result.(cache.FetchResult).Value.(*structs.IssuedCert).PrivateKeyPEM)
}

// Third fetch (Client A, MinIndex = > 0) should block
opts.MinIndex = 2
fetchCh = TestFetchCh(t, typ, opts, reqA)
select {
case result := <-fetchCh:
t.Fatalf("should not return: %#v", result)
case <-time.After(100 * time.Millisecond):
}

// Fourth fetch (Client B, MinIndex = > 0) should block
fetchCh = TestFetchCh(t, typ, opts, reqB)
select {
case result := <-fetchCh:
t.Fatalf("should not return: %#v", result)
case <-time.After(100 * time.Millisecond):
}
}

// Test that once one client (e.g. the proxycfg.Manager) has fetched a cert,
// that subsequent clients get it returned immediately and don't block until it
// expires or their request times out. Note that typically Fetches at this level
// are de-duped by the cache higher up, the test above explicitly tests the case
// where two clients with different tokens request the same cert. However two
// clients sharing a token _may_ share the certificate, but the cachetype should
// not implicitly depend on the cache mechanism de-duping these clients.
//
// Genrally we _shouldn't_ rely on implementation details in the cache package
// about partitioning to behave correctly as that is likely to lead to subtle
// errors later when the implementation there changes, so this test ensures that
// even if the cache for some reason decides to not share an existing cache
// entry with a second client despite using the same token, that we don't block
// it's initial request assuming that it's already recieved the in-memory and
// still valid cert.
func TestConnectCALeaf_multipleClientsSameToken(t *testing.T) {
t.Parallel()

require := require.New(t)
rpc := TestRPC(t)
defer rpc.AssertExpectations(t)

typ, rootsCh := testCALeafType(t, rpc)
defer close(rootsCh)
rootsCh <- structs.IndexedCARoots{
ActiveRootID: "1",
TrustDomain: "fake-trust-domain.consul",
QueryMeta: structs.QueryMeta{Index: 1},
}

// Instrument ConnectCA.Sign to
var resp *structs.IssuedCert
var idx uint64
rpc.On("RPC", "ConnectCA.Sign", mock.Anything, mock.Anything).Return(nil).
Run(func(args mock.Arguments) {
reply := args.Get(2).(*structs.IssuedCert)
reply.CreateIndex = atomic.AddUint64(&idx, 1)
reply.ModifyIndex = reply.CreateIndex
reply.ValidBefore = time.Now().Add(12 * time.Hour)
resp = reply
})

// We'll reuse the fetch options and request
opts := cache.FetchOptions{MinIndex: 0, Timeout: 10 * time.Minute}
reqA := &ConnectCALeafRequest{Datacenter: "dc1", Service: "web", Token: "shared-token"}
reqB := &ConnectCALeafRequest{Datacenter: "dc1", Service: "web", Token: "shared-token"}

// First fetch (Client A, MinIndex = 0) should return immediately
fetchCh := TestFetchCh(t, typ, opts, reqA)
select {
case <-time.After(100 * time.Millisecond):
t.Fatal("shouldn't block waiting for fetch")
case result := <-fetchCh:
require.Equal(cache.FetchResult{
Value: resp,
Index: 1,
}, result)
}

// Second fetch (Client B, MinIndex = 0) should return immediately
fetchCh = TestFetchCh(t, typ, opts, reqB)
select {
case <-time.After(100 * time.Millisecond):
t.Fatal("shouldn't block waiting for fetch")
case result := <-fetchCh:
require.Equal(cache.FetchResult{
Value: resp,
Index: 1, // Same result as last fetch
}, result)
}

// Third fetch (Client A, MinIndex = > 0) should block
opts.MinIndex = 1
fetchCh = TestFetchCh(t, typ, opts, reqA)
select {
case result := <-fetchCh:
t.Fatalf("should not return: %#v", result)
case <-time.After(100 * time.Millisecond):
}

// Fourth fetch (Client B, MinIndex = > 0) should block
fetchCh = TestFetchCh(t, typ, opts, reqB)
select {
case result := <-fetchCh:
t.Fatalf("should not return: %#v", result)
case <-time.After(100 * time.Millisecond):
}
}

// testCALeafType returns a *ConnectCALeaf that is pre-configured to
// use the given RPC implementation for "ConnectCA.Sign" operations.
func testCALeafType(t *testing.T, rpc RPC) (*ConnectCALeaf, chan structs.IndexedCARoots) {
Expand Down