Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

connect: connect CA Roots in secondary datacenters should use a SigningKeyID derived from their local intermediate #6513

Merged
merged 2 commits into from
Sep 26, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
205 changes: 200 additions & 5 deletions agent/agent_endpoint_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ import (
"os"
"reflect"
"sort"
"strconv"
"strings"
"testing"
"time"
Expand Down Expand Up @@ -4717,15 +4718,209 @@ func TestAgentConnectCALeafCert_goodNotLocal(t *testing.T) {
}
}

func requireLeafValidUnderCA(t *testing.T, issued *structs.IssuedCert,
ca *structs.CARoot) {
// TestAgentConnectCALeafCert_secondaryDC_good exercises the leaf cert endpoint
// of an agent in a secondary datacenter (dc2) that is WAN-joined to its
// primary (dc1). It verifies that:
//
//   - the first request is a cache MISS and yields a leaf that validates
//     under the active CA root,
//   - a repeated request is a cache HIT returning the same object,
//   - a blocking query at the leaf's current index does not return while the
//     cluster is idle (the leaf-churn regression fixed by PR-6513), and
//   - after a new CA config is set, the cached leaf is replaced in the
//     background with a new cert and key that validate under the new root.
func TestAgentConnectCALeafCert_secondaryDC_good(t *testing.T) {
	t.Parallel()

	assert := assert.New(t)
	require := require.New(t)

	a1 := NewTestAgent(t, t.Name()+"-dc1", `
datacenter = "dc1"
primary_datacenter = "dc1"
`)
	defer a1.Shutdown()
	testrpc.WaitForTestAgent(t, a1.RPC, "dc1")

	a2 := NewTestAgent(t, t.Name()+"-dc2", `
datacenter = "dc2"
primary_datacenter = "dc1"
`)
	defer a2.Shutdown()
	testrpc.WaitForTestAgent(t, a2.RPC, "dc2")

	// Join dc2 to dc1 over the WAN, then wait until dc1 sees both members.
	addr := fmt.Sprintf("127.0.0.1:%d", a1.Config.SerfPortWAN)
	_, err := a2.JoinWAN([]string{addr})
	require.NoError(err)

	testrpc.WaitForLeader(t, a1.RPC, "dc1")
	testrpc.WaitForLeader(t, a2.RPC, "dc2")
	retry.Run(t, func(r *retry.R) {
		if got, want := len(a1.WANMembers()), 2; got < want {
			r.Fatalf("got %d WAN members want at least %d", got, want)
		}
	})

	// CA already setup by default by NewTestAgent but force a new one so we can
	// verify it was signed easily.
	dc1CA1 := connect.TestCAConfigSet(t, a1, nil)

	// Wait until root is updated in both dcs.
	waitForActiveCARoot(t, a1.srv, dc1CA1)
	waitForActiveCARoot(t, a2.srv, dc1CA1)

	{
		// Register a local service in the SECONDARY
		args := &structs.ServiceDefinition{
			ID:      "foo",
			Name:    "test",
			Address: "127.0.0.1",
			Port:    8000,
			Check: structs.CheckType{
				TTL: 15 * time.Second,
			},
		}
		req, _ := http.NewRequest("PUT", "/v1/agent/service/register", jsonReader(args))
		resp := httptest.NewRecorder()
		_, err := a2.srv.AgentRegisterService(resp, req)
		require.NoError(err)
		if !assert.Equal(200, resp.Code) {
			t.Log("Body: ", resp.Body.String())
		}
	}

	// Request a leaf cert for the "test" service from the secondary's agent.
	req, _ := http.NewRequest("GET", "/v1/agent/connect/ca/leaf/test", nil)
	resp := httptest.NewRecorder()
	obj, err := a2.srv.AgentConnectCALeafCert(resp, req)
	require.NoError(err)
	require.Equal("MISS", resp.Header().Get("X-Cache"))

	// Get the issued cert. This must abort the test on failure: everything
	// below dereferences issued, which is nil if the type assertion fails.
	issued, ok := obj.(*structs.IssuedCert)
	require.True(ok, "response is wrong type %T", obj)

	// Verify that the cert is signed by the CA
	requireLeafValidUnderCA(t, issued, dc1CA1)

	// Verify blocking index. The index is formatted as an unsigned value so
	// it is never narrowed through int.
	leafIndex := strconv.FormatUint(issued.ModifyIndex, 10)
	assert.True(issued.ModifyIndex > 0)
	assert.Equal(leafIndex, resp.Header().Get("X-Consul-Index"))

	// Test caching
	{
		// Fetch it again
		resp := httptest.NewRecorder()
		obj2, err := a2.srv.AgentConnectCALeafCert(resp, req)
		require.NoError(err)
		require.Equal(obj, obj2)

		// Should cache hit this time and not make request
		require.Equal("HIT", resp.Header().Get("X-Cache"))
	}

	// Test that we aren't churning leaves for no reason at idle.
	{
		// Buffered so the goroutine can always deliver its result and exit,
		// even if that happens after the select below has timed out.
		ch := make(chan error, 1)
		go func() {
			req, _ := http.NewRequest("GET", "/v1/agent/connect/ca/leaf/test?index="+leafIndex, nil)
			resp := httptest.NewRecorder()
			obj, err := a2.srv.AgentConnectCALeafCert(resp, req)
			if err != nil {
				ch <- err
				return
			}

			// Any return from the blocking query is a failure; the message
			// only records which flavour of wake-up was seen. The assertion
			// is checked because a panic here would take down the whole
			// test binary rather than fail this test.
			issued2, ok := obj.(*structs.IssuedCert)
			switch {
			case !ok:
				ch <- fmt.Errorf("leaf woke up unexpectedly with response of wrong type %T", obj)
			case issued.CertPEM == issued2.CertPEM:
				ch <- fmt.Errorf("leaf woke up unexpectedly with same cert")
			default:
				ch <- fmt.Errorf("leaf woke up unexpectedly with new cert")
			}
		}()

		start := time.Now()

		// Before applying the fix from PR-6513 this would reliably wake up
		// after ~20ms with a new cert. Since this test is necessarily a bit
		// timing dependent we'll chill out for 5 seconds which should be enough
		// time to disprove the original bug.
		select {
		case <-time.After(5 * time.Second):
		case err := <-ch:
			dur := time.Since(start)
			t.Fatalf("unexpected return from blocking query; leaf churned during idle period, took %s: %v", dur, err)
		}
	}

	// Set a new CA (the request is issued through the dc2 agent).
	dc1CA2 := connect.TestCAConfigSet(t, a2, nil)

	// Wait until root is updated in both dcs.
	waitForActiveCARoot(t, a1.srv, dc1CA2)
	waitForActiveCARoot(t, a2.srv, dc1CA2)

	// Test that caching is updated in the background
	retry.Run(t, func(r *retry.R) {
		resp := httptest.NewRecorder()
		// Try and sign again (note no index/wait arg since cache should update in
		// background even if we aren't actively blocking)
		obj, err := a2.srv.AgentConnectCALeafCert(resp, req)
		r.Check(err)

		issued2, ok := obj.(*structs.IssuedCert)
		if !ok {
			r.Fatalf("response is wrong type %T", obj)
		}
		if issued.CertPEM == issued2.CertPEM {
			r.Fatalf("leaf has not updated")
		}

		// Got a new leaf. Sanity check it's a whole new key as well as different
		// cert.
		if issued.PrivateKeyPEM == issued2.PrivateKeyPEM {
			r.Fatalf("new leaf has same private key as before")
		}

		// Verify that the cert is signed by the new CA
		requireLeafValidUnderCA(t, issued2, dc1CA2)

		// Should be a cache hit! The data should've updated in the cache
		// in the background so this should've been fetched directly from
		// the cache.
		if resp.Header().Get("X-Cache") != "HIT" {
			r.Fatalf("should be a cache hit")
		}
	})
}

// waitForActiveCARoot polls the agent's CA roots endpoint via retry.Run until
// the root marked active in the response has the same ID as expect. Each
// attempt fails (and is left to retry.Run to handle) when the request errors,
// the response is not a structs.IndexedCARoots, no root matches the active
// root ID, or the active root is not the expected one yet.
func waitForActiveCARoot(t *testing.T, srv *HTTPServer, expect *structs.CARoot) {
	retry.Run(t, func(r *retry.R) {
		rec := httptest.NewRecorder()
		req, _ := http.NewRequest("GET", "/v1/agent/connect/ca/roots", nil)

		obj, err := srv.AgentConnectCARoots(rec, req)
		if err != nil {
			r.Fatalf("err: %v", err)
		}

		indexed, isRoots := obj.(structs.IndexedCARoots)
		if !isRoots {
			r.Fatalf("response is wrong type %T", obj)
		}

		// Locate the entry the response designates as active.
		var active *structs.CARoot
		for _, candidate := range indexed.Roots {
			if candidate.ID == indexed.ActiveRootID {
				active = candidate
				break
			}
		}

		switch {
		case active == nil:
			r.Fatal("no active root")
		case active.ID != expect.ID:
			r.Fatalf("current active root is %s; waiting for %s", active.ID, expect.ID)
		}
	})
}

func requireLeafValidUnderCA(t *testing.T, issued *structs.IssuedCert, ca *structs.CARoot) {
leaf, intermediates, err := connect.ParseLeafCerts(issued.CertPEM)
require.NoError(t, err)

roots := x509.NewCertPool()
require.True(t, roots.AppendCertsFromPEM([]byte(ca.RootCert)))
leaf, err := connect.ParseCert(issued.CertPEM)
require.NoError(t, err)

_, err = leaf.Verify(x509.VerifyOptions{
Roots: roots,
Roots: roots,
Intermediates: intermediates,
})
require.NoError(t, err)

Expand Down
8 changes: 4 additions & 4 deletions agent/cache-types/connect_ca_leaf.go
Original file line number Diff line number Diff line change
Expand Up @@ -96,9 +96,9 @@ type ConnectCALeaf struct {
// since all times we get from our wall clock should point to the same Location
// anyway.
type fetchState struct {
// authorityKeyID is the key ID of the CA root that signed the current cert.
// This is just to save parsing the whole cert everytime we have to check if
// the root changed.
// authorityKeyID is the ID of the CA key (whether root or intermediate) that signed
// the current cert. This is just to save parsing the whole cert every time
// we have to check if the root changed.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
// we have to check if the root changed.
// we have to check if the signing key changed.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🤔 actually I'm not sure this is quite right.

The only reason we care about rotating the certificate in this case is if the actual root changes. If for some reason a new intermediate was generated but for the same root, we don't need to rotate certs signed by the old intermediate because they will still validate against the same root cert.

We only care about rotating if the actual root key is different.

So this was kinda more correct before, but I think we misguidedly built the logic to look at the (intermediate) signing key assuming that was the right thing to do for intermediates and so broke this assumption.

So I think this PR is more correct than before it because it at least behaves well and has the same set of assumptions in the logic and the data we are propagating.

But actually in some sense it's not necessary to rotate if only the intermediate is changing and it would be more correct to only compare against the actual root key even when an intermediate is being used.

I think that boils down to SigningKeyID being misnamed in the CARoot since the thing we care about is more like RootKeyID.

What do you think? This is better than no change for sure but should we go back and make this all correct and optimal instead? Or am I missing a reason you would need to rotate if intermediate changed too?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To play devil's advocate... the only time intermediates should change in Connect is if roots do too currently which possibly makes this argument moot from an optimality point of view...

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we need to rotate the cert even when the intermediate changes. This could use verification but I am pretty sure we have to push both the root and intermediate cert into Envoy for verification. It has to have the whole chain and if the leaf cert was signed by an unknown intermediate (an old one that we have since regenerated) then I don't think the certs will validate and connections will be dropped.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So unless there's something non obvious here, I can't see a field on a leaf cert that expresses a link back to the primary root, only a link to the intermediate that signed it.

We could walk the linked list of intermediates back to the primary root in the leaf caching code and compare the leaf.SigningKeyID->intermediate, intermediate->root, root=?=prevRoot but that would require unpacking the whole chain of certs every time the leaf cache Fetch is called.

But as you said given that intermediates rotate only when the primary root rotates (or possibly in rare other circumstances), we can just rely upon waiting until the intermediates rotate first because they detect that the primary root rotated.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@mkeeler the intermediate is delivered by the client Envoy along with the leaf. As long as the intermediate is still valid (i.e. not revoked and we don't have revocation checking yet..) then it's fine as long as it was signed by the same root as the current intermediate.

That said, this will be more important if we allow revocation of intermediates in future.

So unless there's something non obvious here, I can't see a field on a leaf cert that expresses a link back to the primary root, only a link to the intermediate that signed it.

Yep that is the root (pardon the pun) of the confusion here - it's my mistake as I thought the signing key was the important thing but then only verified against the root. We could instead have stored the key ID of the root in that struct and just compared that to the current root.

That said, on the basis of future-proofing against intermediate revocation, and my indication that it makes no actual difference currently, I think this is good as it is.

authorityKeyID string

// forceExpireAfter is used to coordinate renewing certs after a CA rotation
Expand Down Expand Up @@ -362,7 +362,7 @@ func (c *ConnectCALeaf) Fetch(opts cache.FetchOptions, req cache.Request) (cache
expiresAt = state.forceExpireAfter
}

if expiresAt == now || expiresAt.Before(now) {
if expiresAt.Equal(now) || expiresAt.Before(now) {
// Already expired, just make a new one right away
return c.generateNewLeaf(reqReal, lastResultWithNewState())
}
Expand Down
61 changes: 61 additions & 0 deletions agent/connect/parsing.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,67 @@ func ParseCert(pemValue string) (*x509.Certificate, error) {
return x509.ParseCertificate(block.Bytes)
}

// ParseLeafCerts decodes every x509 certificate in a PEM-encoded bundle and
// splits it into a leaf and its intermediates. The first certificate must be
// a leaf (non-CA) certificate; every certificate after it must be a CA
// certificate, and those are returned together as a cert pool.
//
// An error is returned if the bundle fails to parse, holds no certificates,
// starts with a CA certificate, or has a non-CA certificate anywhere after
// the first position.
func ParseLeafCerts(pemValue string) (*x509.Certificate, *x509.CertPool, error) {
	chain, err := parseCerts(pemValue)
	if err != nil {
		return nil, nil, err
	}

	// parseCerts reports an error rather than returning an empty slice, so
	// taking the head of the chain here is safe.
	leaf, rest := chain[0], chain[1:]
	if leaf.IsCA {
		return nil, nil, fmt.Errorf("first PEM-block should be a leaf cert")
	}

	pool := x509.NewCertPool()
	for i := range rest {
		if !rest[i].IsCA {
			return nil, nil, fmt.Errorf("found an unexpected leaf cert after the first PEM-block")
		}
		pool.AddCert(rest[i])
	}

	return leaf, pool, nil
}

// parseCerts decodes every PEM block in pemValue, in order, into an x509
// certificate. Decoding stops at the first point where no further PEM block
// can be found; whatever bytes remain at that point are ignored.
//
// It returns an error if a block is not of type CERTIFICATE, if a block's
// contents fail to parse as a certificate, or if no certificate was decoded
// at all. It makes no distinction between leaf and CA certificates.
func parseCerts(pemValue string) ([]*x509.Certificate, error) {
	var certs []*x509.Certificate

	// pem.Decode yields the next block plus the bytes that follow it, and a
	// nil block once nothing more can be decoded.
	for block, next := pem.Decode([]byte(pemValue)); block != nil; block, next = pem.Decode(next) {
		if block.Type != "CERTIFICATE" {
			return nil, fmt.Errorf("PEM-block should be CERTIFICATE type")
		}

		cert, err := x509.ParseCertificate(block.Bytes)
		if err != nil {
			return nil, err
		}
		certs = append(certs, cert)
	}

	if len(certs) == 0 {
		return nil, fmt.Errorf("no PEM-encoded data found")
	}

	return certs, nil
}

// CalculateCertFingerprint parses the x509 certificate from a PEM-encoded value
// and calculates the SHA-1 fingerprint.
func CalculateCertFingerprint(pemValue string) (string, error) {
Expand Down
30 changes: 28 additions & 2 deletions agent/consul/leader_connect.go
Original file line number Diff line number Diff line change
Expand Up @@ -290,7 +290,10 @@ func (s *Server) initializeSecondaryCA(provider ca.Provider, roots structs.Index
return err
}

var storedRootID string
var (
storedRootID string
expectedSigningKeyID string
)
if activeIntermediate != "" {
storedRoot, err := provider.ActiveRoot()
if err != nil {
Expand All @@ -301,6 +304,12 @@ func (s *Server) initializeSecondaryCA(provider ca.Provider, roots structs.Index
if err != nil {
return fmt.Errorf("error parsing root fingerprint: %v, %#v", err, roots)
}

intermediateCert, err := connect.ParseCert(activeIntermediate)
if err != nil {
return fmt.Errorf("error parsing active intermediate cert: %v", err)
}
expectedSigningKeyID = connect.EncodeSigningKeyID(intermediateCert.SubjectKeyId)
}

var newActiveRoot *structs.CARoot
Expand All @@ -314,10 +323,21 @@ func (s *Server) initializeSecondaryCA(provider ca.Provider, roots structs.Index
return fmt.Errorf("primary datacenter does not have an active root CA for Connect")
}

newIntermediate := false
// Get a signed intermediate from the primary DC if the provider
// hasn't been initialized yet or if the primary's root has changed.
needsNewIntermediate := false
if activeIntermediate == "" || storedRootID != roots.ActiveRootID {
needsNewIntermediate = true
}

// Also we take this opportunity to correct an incorrectly persisted SigningKeyID
// in secondary datacenters (see PR-6513).
if expectedSigningKeyID != "" && newActiveRoot.SigningKeyID != expectedSigningKeyID {
needsNewIntermediate = true
}

newIntermediate := false
if needsNewIntermediate {
csr, err := provider.GenerateIntermediateCSR()
if err != nil {
return err
Expand All @@ -334,8 +354,14 @@ func (s *Server) initializeSecondaryCA(provider ca.Provider, roots structs.Index
return fmt.Errorf("Failed to set the intermediate certificate with the CA provider: %v", err)
}

intermediateCert, err := connect.ParseCert(intermediatePEM)
if err != nil {
return fmt.Errorf("error parsing intermediate cert: %v", err)
}

// Append the new intermediate to our local active root entry.
newActiveRoot.IntermediateCerts = append(newActiveRoot.IntermediateCerts, intermediatePEM)
newActiveRoot.SigningKeyID = connect.EncodeSigningKeyID(intermediateCert.SubjectKeyId)
newIntermediate = true

s.logger.Printf("[INFO] connect: received new intermediate certificate from primary datacenter")
Expand Down
Loading