diff --git a/agent/cache-types/connect_ca_leaf.go b/agent/cache-types/connect_ca_leaf.go
index 6fc88a847714..b85beb4c2632 100644
--- a/agent/cache-types/connect_ca_leaf.go
+++ b/agent/cache-types/connect_ca_leaf.go
@@ -1,6 +1,7 @@
 package cachetype
 
 import (
+    "crypto/sha256"
     "errors"
     "fmt"
     "sync"
@@ -27,6 +28,19 @@ type ConnectCALeaf struct {
     Cache *cache.Cache // Cache that has CA root certs via ConnectCARoot
 }
 
+// issuedKey returns the issuedCerts cache key for a given service and token.
+// We use a hash rather than concatenating strings to provide resilience
+// against user input containing our separator - both the service name and the
+// token ID can be freely chosen by the user, so either may contain any
+// delimiter we pick. Hashing also avoids leaking the ACL token into a new
+// place in memory where it might accidentally be dumped, logged, etc.
+func issuedKey(service, token string) string {
+    hash := sha256.New()
+    hash.Write([]byte(service))
+    hash.Write([]byte(token))
+    return fmt.Sprintf("%x", hash.Sum(nil))
+}
+
 func (c *ConnectCALeaf) Fetch(opts cache.FetchOptions, req cache.Request) (cache.FetchResult, error) {
     var result cache.FetchResult
 
@@ -48,11 +62,15 @@ func (c *ConnectCALeaf) Fetch(opts cache.FetchOptions, req cache.Request) (cache
     newRootCACh := make(chan error, 1)
     go c.waitNewRootCA(reqReal.Datacenter, newRootCACh, opts.Timeout)
 
+    // Generate a cache key to look up/store the cert. We MUST generate a new
+    // cert per token used so that revocation by ACL token is robust.
+    issuedKey := issuedKey(reqReal.Service, reqReal.Token)
+
     // Get our prior cert (if we had one) and use that to determine our
     // expiration time. If no cert exists, we expire immediately since we
     // need to generate.
     c.issuedCertsLock.RLock()
-    lastCert := c.issuedCerts[reqReal.Service]
+    lastCert := c.issuedCerts[issuedKey]
     c.issuedCertsLock.RUnlock()
 
     var leafExpiryCh <-chan time.Time
@@ -62,6 +80,19 @@ func (c *ConnectCALeaf) Fetch(opts cache.FetchOptions, req cache.Request) (cache
         if expiryDur := lastCert.ValidBefore.Sub(time.Now()); expiryDur > 0 {
             leafExpiryCh = time.After(expiryDur - 1*time.Hour)
             // TODO(mitchellh): 1 hour buffer is hardcoded above
+
+            // We should not depend on the cache package de-duplicating requests
+            // for the same service/token (which is all we care about keying our
+            // local issued cert cache on) since it might later make sense to
+            // partition clients for other reasons too. So if the request has a
+            // zero MinIndex and the cached cert is still valid, the client is
+            // expecting an immediate response and hasn't already seen the
+            // cached cert; return it now.
+            if opts.MinIndex == 0 {
+                result.Value = lastCert
+                result.Index = lastCert.ModifyIndex
+                return result, nil
+            }
         }
     }
@@ -149,13 +180,13 @@ func (c *ConnectCALeaf) Fetch(opts cache.FetchOptions, req cache.Request) (cache
     // check just in case.
     c.issuedCertsLock.Lock()
     defer c.issuedCertsLock.Unlock()
-    lastCert = c.issuedCerts[reqReal.Service]
+    lastCert = c.issuedCerts[issuedKey]
     if lastCert == nil || lastCert.ModifyIndex < reply.ModifyIndex {
         if c.issuedCerts == nil {
             c.issuedCerts = make(map[string]*structs.IssuedCert)
         }
-        c.issuedCerts[reqReal.Service] = &reply
+        c.issuedCerts[issuedKey] = &reply
         lastCert = &reply
     }
diff --git a/agent/cache-types/connect_ca_leaf_test.go b/agent/cache-types/connect_ca_leaf_test.go
index d55caf408dce..c52c8ce02009 100644
--- a/agent/cache-types/connect_ca_leaf_test.go
+++ b/agent/cache-types/connect_ca_leaf_test.go
@@ -58,6 +58,7 @@ func TestConnectCALeaf_changingRoots(t *testing.T) {
     }
 
     // Second fetch should block with set index
+    opts.MinIndex = 1
     fetchCh = TestFetchCh(t, typ, opts, req)
     select {
     case result := <-fetchCh:
@@ -156,6 +157,7 @@ func TestConnectCALeaf_expiringLeaf(t *testing.T) {
 
     // Third fetch should block since the cert is not expiring and
     // we also didn't update CA certs.
+    opts.MinIndex = 2
     fetchCh = TestFetchCh(t, typ, opts, req)
     select {
     case result := <-fetchCh:
@@ -164,6 +166,188 @@ func TestConnectCALeaf_expiringLeaf(t *testing.T) {
     }
 }
 
+// Test that once one client (e.g. the proxycfg.Manager) has fetched a cert,
+// subsequent clients get it returned immediately and don't block until it
+// expires or their request times out. Note that typically Fetches at this
+// level are de-duped by the cache higher up, but if the two clients are using
+// different ACL tokens, for example (which is common), that may not be the
+// case, and we should still deliver correct blocking semantics to both.
+//
+// Additionally, we want to make sure that clients with different tokens
+// generate distinct certs, since we might later want to revoke all certs
+// fetched with a given token but can't if a client using that token was
+// served a cert generated under a different token (say the agent token).
+func TestConnectCALeaf_multipleClientsDifferentTokens(t *testing.T) {
+    t.Parallel()
+
+    require := require.New(t)
+    rpc := TestRPC(t)
+    defer rpc.AssertExpectations(t)
+
+    typ, rootsCh := testCALeafType(t, rpc)
+    defer close(rootsCh)
+    rootsCh <- structs.IndexedCARoots{
+        ActiveRootID: "1",
+        TrustDomain:  "fake-trust-domain.consul",
+        QueryMeta:    structs.QueryMeta{Index: 1},
+    }
+
+    // Instrument ConnectCA.Sign to return a fresh cert with an incrementing index.
+    var resp *structs.IssuedCert
+    var idx uint64
+    rpc.On("RPC", "ConnectCA.Sign", mock.Anything, mock.Anything).Return(nil).
+        Run(func(args mock.Arguments) {
+            reply := args.Get(2).(*structs.IssuedCert)
+            reply.CreateIndex = atomic.AddUint64(&idx, 1)
+            reply.ModifyIndex = reply.CreateIndex
+            reply.ValidBefore = time.Now().Add(12 * time.Hour)
+            resp = reply
+        })
+
+    // We'll reuse the fetch options and requests
+    opts := cache.FetchOptions{MinIndex: 0, Timeout: 10 * time.Minute}
+    reqA := &ConnectCALeafRequest{Datacenter: "dc1", Service: "web", Token: "A-token"}
+    reqB := &ConnectCALeafRequest{Datacenter: "dc1", Service: "web", Token: "B-token"}
+
+    // First fetch (Client A, MinIndex = 0) should return immediately
+    fetchCh := TestFetchCh(t, typ, opts, reqA)
+    var certA *structs.IssuedCert
+    select {
+    case <-time.After(100 * time.Millisecond):
+        t.Fatal("shouldn't block waiting for fetch")
+    case result := <-fetchCh:
+        require.Equal(cache.FetchResult{
+            Value: resp,
+            Index: 1,
+        }, result)
+        certA = result.(cache.FetchResult).Value.(*structs.IssuedCert)
+    }
+
+    // Second fetch (Client B, MinIndex = 0) should return immediately
+    fetchCh = TestFetchCh(t, typ, opts, reqB)
+    select {
+    case <-time.After(100 * time.Millisecond):
+        t.Fatal("shouldn't block waiting for fetch")
+    case result := <-fetchCh:
+        require.Equal(cache.FetchResult{
+            Value: resp,
+            Index: 2,
+        }, result)
+        // Different tokens should result in different certs. Note that we don't
+        // actually generate and sign real certs in this test with our mock RPC,
+        // but this is enough to be sure we actually generated a different private
+        // key for each one and that they aren't just different due to index values.
+        require.NotEqual(certA.PrivateKeyPEM,
+            result.(cache.FetchResult).Value.(*structs.IssuedCert).PrivateKeyPEM)
+    }
+
+    // Third fetch (Client A, MinIndex > 0) should block
+    opts.MinIndex = 2
+    fetchCh = TestFetchCh(t, typ, opts, reqA)
+    select {
+    case result := <-fetchCh:
+        t.Fatalf("should not return: %#v", result)
+    case <-time.After(100 * time.Millisecond):
+    }
+
+    // Fourth fetch (Client B, MinIndex > 0) should block
+    fetchCh = TestFetchCh(t, typ, opts, reqB)
+    select {
+    case result := <-fetchCh:
+        t.Fatalf("should not return: %#v", result)
+    case <-time.After(100 * time.Millisecond):
+    }
+}
+
+// Test that once one client (e.g. the proxycfg.Manager) has fetched a cert,
+// subsequent clients get it returned immediately and don't block until it
+// expires or their request times out. Note that typically Fetches at this
+// level are de-duped by the cache higher up; the test above explicitly tests
+// the case where two clients with different tokens request the same cert. Two
+// clients sharing a token _may_ share the certificate, but the cachetype
+// should not implicitly depend on the cache mechanism de-duping these clients.
+//
+// Generally we _shouldn't_ rely on implementation details of the cache
+// package's partitioning to behave correctly, as that is likely to lead to
+// subtle errors later when the implementation there changes. So this test
+// ensures that even if the cache for some reason decides not to share an
+// existing cache entry with a second client despite the same token being
+// used, we don't block that client's initial request on the assumption that
+// it has already received the in-memory, still-valid cert.
+func TestConnectCALeaf_multipleClientsSameToken(t *testing.T) {
+    t.Parallel()
+
+    require := require.New(t)
+    rpc := TestRPC(t)
+    defer rpc.AssertExpectations(t)
+
+    typ, rootsCh := testCALeafType(t, rpc)
+    defer close(rootsCh)
+    rootsCh <- structs.IndexedCARoots{
+        ActiveRootID: "1",
+        TrustDomain:  "fake-trust-domain.consul",
+        QueryMeta:    structs.QueryMeta{Index: 1},
+    }
+
+    // Instrument ConnectCA.Sign to return a fresh cert with an incrementing index.
+    var resp *structs.IssuedCert
+    var idx uint64
+    rpc.On("RPC", "ConnectCA.Sign", mock.Anything, mock.Anything).Return(nil).
+        Run(func(args mock.Arguments) {
+            reply := args.Get(2).(*structs.IssuedCert)
+            reply.CreateIndex = atomic.AddUint64(&idx, 1)
+            reply.ModifyIndex = reply.CreateIndex
+            reply.ValidBefore = time.Now().Add(12 * time.Hour)
+            resp = reply
+        })
+
+    // We'll reuse the fetch options and requests
+    opts := cache.FetchOptions{MinIndex: 0, Timeout: 10 * time.Minute}
+    reqA := &ConnectCALeafRequest{Datacenter: "dc1", Service: "web", Token: "shared-token"}
+    reqB := &ConnectCALeafRequest{Datacenter: "dc1", Service: "web", Token: "shared-token"}
+
+    // First fetch (Client A, MinIndex = 0) should return immediately
+    fetchCh := TestFetchCh(t, typ, opts, reqA)
+    select {
+    case <-time.After(100 * time.Millisecond):
+        t.Fatal("shouldn't block waiting for fetch")
+    case result := <-fetchCh:
+        require.Equal(cache.FetchResult{
+            Value: resp,
+            Index: 1,
+        }, result)
+    }
+
+    // Second fetch (Client B, MinIndex = 0) should return immediately
+    fetchCh = TestFetchCh(t, typ, opts, reqB)
+    select {
+    case <-time.After(100 * time.Millisecond):
+        t.Fatal("shouldn't block waiting for fetch")
+    case result := <-fetchCh:
+        require.Equal(cache.FetchResult{
+            Value: resp,
+            Index: 1, // Same result as last fetch
+        }, result)
+    }
+
+    // Third fetch (Client A, MinIndex > 0) should block
+    opts.MinIndex = 1
+    fetchCh = TestFetchCh(t, typ, opts, reqA)
+    select {
+    case result := <-fetchCh:
+        t.Fatalf("should not return: %#v", result)
+    case <-time.After(100 * time.Millisecond):
+    }
+
+    // Fourth fetch (Client B, MinIndex > 0) should block
+    fetchCh = TestFetchCh(t, typ, opts, reqB)
+    select {
+    case result := <-fetchCh:
+        t.Fatalf("should not return: %#v", result)
+    case <-time.After(100 * time.Millisecond):
+    }
+}
+
 // testCALeafType returns a *ConnectCALeaf that is pre-configured to
 // use the given RPC implementation for "ConnectCA.Sign" operations.
 func testCALeafType(t *testing.T, rpc RPC) (*ConnectCALeaf, chan structs.IndexedCARoots) {
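
A minimal standalone sketch (not part of the patch above) of the keying behavior the new tests rely on: issuedKey here is copied from the helper added in connect_ca_leaf.go. The same service requested under two different ACL tokens hashes to two distinct cache keys, which is what forces a distinct cert per token, while clients sharing a token map to a single key and so may share the cached cert.

// Hypothetical standalone illustration; not part of the Consul codebase.
package main

import (
    "crypto/sha256"
    "fmt"
)

// issuedKey mirrors the helper added in connect_ca_leaf.go above.
func issuedKey(service, token string) string {
    hash := sha256.New()
    hash.Write([]byte(service))
    hash.Write([]byte(token))
    return fmt.Sprintf("%x", hash.Sum(nil))
}

func main() {
    // Same service, different tokens: distinct keys, so distinct cached certs.
    fmt.Println(issuedKey("web", "A-token") != issuedKey("web", "B-token")) // true
    // Same service and token: the same key, so the cached cert can be reused.
    fmt.Println(issuedKey("web", "shared-token") == issuedKey("web", "shared-token")) // true
    // The key is a fixed-length hex digest, so the raw token never appears in the map.
    fmt.Println(len(issuedKey("web", "A-token"))) // 64
}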