diff --git a/agent/agent.go b/agent/agent.go index 4a8f431e5023..ab6bfac98d39 100644 --- a/agent/agent.go +++ b/agent/agent.go @@ -840,7 +840,7 @@ func (a *Agent) setupClientAutoEncrypt(ctx context.Context) (*structs.SignedResp } addrs = append(addrs, retryJoinAddrs(disco, retryJoinSerfVariant, "LAN", a.config.RetryJoinLAN, a.logger)...) - reply, priv, err := client.RequestAutoEncryptCerts(ctx, addrs, a.config.ServerPort, a.tokens.AgentToken()) + reply, priv, err := client.RequestAutoEncryptCerts(ctx, addrs, a.config.ServerPort, a.tokens.AgentToken(), a.config.AutoEncryptDNSSAN, a.config.AutoEncryptIPSAN) if err != nil { return nil, err } @@ -877,7 +877,17 @@ func (a *Agent) setupClientAutoEncryptCache(reply *structs.SignedResponse) (*str } // prepolutate leaf cache - certRes := cache.FetchResult{Value: &reply.IssuedCert, Index: reply.ConnectCARoots.QueryMeta.Index} + certRes := cache.FetchResult{ + Value: &reply.IssuedCert, + Index: reply.ConnectCARoots.QueryMeta.Index, + } + + for _, ca := range reply.ConnectCARoots.Roots { + if ca.ID == reply.ConnectCARoots.ActiveRootID { + certRes.State = cachetype.ConnectCALeafSuccess(ca.SigningKeyID) + break + } + } if err := a.cache.Prepopulate(cachetype.ConnectCALeafName, certRes, a.config.Datacenter, a.tokens.AgentToken(), leafReq.Key()); err != nil { return nil, nil, err } diff --git a/agent/agent_test.go b/agent/agent_test.go index a311c42d7202..a55250539468 100644 --- a/agent/agent_test.go +++ b/agent/agent_test.go @@ -4,6 +4,7 @@ import ( "bytes" "context" "crypto/tls" + "crypto/x509" "encoding/base64" "encoding/json" "fmt" @@ -4634,7 +4635,7 @@ func TestAutoConfig_Integration(t *testing.T) { }) require.NoError(t, err) - client := StartTestAgent(t, TestAgent{Name: "test-client", HCL: TestACLConfigWithParams(nil) + ` + client := StartTestAgent(t, TestAgent{Name: "test-client", HCL: ` bootstrap = false server = false ca_file = "` + caFile + `" @@ -4660,3 +4661,78 @@ func TestAutoConfig_Integration(t *testing.T) { // spot check that we now have an ACL token require.NotEmpty(t, client.tokens.AgentToken()) } + +func TestAgent_AutoEncrypt(t *testing.T) { + // eventually this test should really live with integration tests + // the goal here is to have one test server and another test client + // spin up both agents and allow the server to authorize the auto encrypt + // request and then see the client get a TLS certificate + cfgDir := testutil.TempDir(t, "auto-encrypt") + + // write some test TLS certificates out to the cfg dir + cert, key, cacert, err := testTLSCertificates("server.dc1.consul") + require.NoError(t, err) + + certFile := filepath.Join(cfgDir, "cert.pem") + caFile := filepath.Join(cfgDir, "cacert.pem") + keyFile := filepath.Join(cfgDir, "key.pem") + + require.NoError(t, ioutil.WriteFile(certFile, []byte(cert), 0600)) + require.NoError(t, ioutil.WriteFile(caFile, []byte(cacert), 0600)) + require.NoError(t, ioutil.WriteFile(keyFile, []byte(key), 0600)) + + hclConfig := TestACLConfigWithParams(nil) + ` + verify_incoming = true + verify_outgoing = true + verify_server_hostname = true + ca_file = "` + caFile + `" + cert_file = "` + certFile + `" + key_file = "` + keyFile + `" + connect { enabled = true } + auto_encrypt { allow_tls = true } + ` + + srv := StartTestAgent(t, TestAgent{Name: "test-server", HCL: hclConfig}) + defer srv.Shutdown() + + testrpc.WaitForTestAgent(t, srv.RPC, "dc1", testrpc.WithToken(TestDefaultMasterToken)) + + client := StartTestAgent(t, TestAgent{Name: "test-client", HCL: TestACLConfigWithParams(nil) + ` + bootstrap = false + server = false + ca_file = "` + caFile + `" + verify_outgoing = true + verify_server_hostname = true + node_name = "test-client" + auto_encrypt { + tls = true + } + ports { + server = ` + strconv.Itoa(srv.Config.RPCBindAddr.Port) + ` + } + retry_join = ["` + srv.Config.SerfBindAddrLAN.String() + `"]`, + UseTLS: true, + }) + + defer client.Shutdown() + + // when this is successful we managed to get a TLS certificate and are using it for + // encrypted RPC connections. + testrpc.WaitForTestAgent(t, client.RPC, "dc1", testrpc.WithToken(TestDefaultMasterToken)) + + // now we need to validate that our certificate has the correct CN + aeCert := client.tlsConfigurator.Cert() + require.NotNil(t, aeCert) + + id := connect.SpiffeIDAgent{ + Host: connect.TestClusterID + ".consul", + Datacenter: "dc1", + Agent: "test-client", + } + expectedCN := connect.AgentCN("test-client", connect.TestClusterID) + x509Cert, err := x509.ParseCertificate(aeCert.Certificate[0]) + require.NoError(t, err) + require.Equal(t, expectedCN, x509Cert.Subject.CommonName) + require.Len(t, x509Cert.URIs, 1) + require.Equal(t, id.URI(), x509Cert.URIs[0]) +} diff --git a/agent/cache-types/connect_ca_leaf.go b/agent/cache-types/connect_ca_leaf.go index afda5a7822b5..c177804893a6 100644 --- a/agent/cache-types/connect_ca_leaf.go +++ b/agent/cache-types/connect_ca_leaf.go @@ -121,6 +121,15 @@ type fetchState struct { consecutiveRateLimitErrs int } +func ConnectCALeafSuccess(authorityKeyID string) interface{} { + return fetchState{ + authorityKeyID: authorityKeyID, + forceExpireAfter: time.Time{}, + consecutiveRateLimitErrs: 0, + activeRootRotationStart: time.Time{}, + } +} + // fetchStart is called on each fetch that is about to block and wait for // changes to the leaf. It subscribes a chan to receive updates from the shared // root watcher and triggers root watcher if it's not already running. @@ -532,7 +541,7 @@ func (c *ConnectCALeaf) generateNewLeaf(req *ConnectCALeafRequest, } commonName = connect.AgentCN(req.Agent, roots.TrustDomain) dnsNames = append([]string{"localhost"}, req.DNSSAN...) - ipAddresses = append([]net.IP{net.ParseIP("127.0.0.1"), net.ParseIP("::")}, req.IPSAN...) + ipAddresses = append([]net.IP{net.ParseIP("127.0.0.1"), net.ParseIP("::1")}, req.IPSAN...) } else { return result, errors.New("URI must be either service or agent") } diff --git a/agent/consul/auto_encrypt.go b/agent/consul/auto_encrypt.go index 4f176933ff15..e4bca49005a9 100644 --- a/agent/consul/auto_encrypt.go +++ b/agent/consul/auto_encrypt.go @@ -19,23 +19,7 @@ const ( retryJitterWindow = 30 * time.Second ) -func (c *Client) RequestAutoEncryptCerts(ctx context.Context, servers []string, port int, token string) (*structs.SignedResponse, string, error) { - errFn := func(err error) (*structs.SignedResponse, string, error) { - return nil, "", err - } - - // Check if we know about a server already through gossip. Depending on - // how the agent joined, there might already be one. Also in case this - // gets called because the cert expired. - server := c.routers.FindServer() - if server != nil { - servers = []string{server.Addr.String()} - } - - if len(servers) == 0 { - return errFn(fmt.Errorf("No servers to request AutoEncrypt.Sign")) - } - +func (c *Client) autoEncryptCSR(extraDNSSANs []string, extraIPSANs []net.IP) (string, string, error) { // We don't provide the correct host here, because we don't know any // better at this point. Apart from the domain, we would need the // ClusterID, which we don't have. This is why we go with @@ -49,7 +33,7 @@ func (c *Client) RequestAutoEncryptCerts(ctx context.Context, servers []string, conf, err := c.config.CAConfig.GetCommonConfig() if err != nil { - return errFn(err) + return "", "", err } if conf.PrivateKeyType == "" { @@ -62,11 +46,11 @@ func (c *Client) RequestAutoEncryptCerts(ctx context.Context, servers []string, // Create a new private key pk, pkPEM, err := connect.GeneratePrivateKeyWithConfig(conf.PrivateKeyType, conf.PrivateKeyBits) if err != nil { - return errFn(err) + return "", "", err } - dnsNames := []string{"localhost"} - ipAddresses := []net.IP{net.ParseIP("127.0.0.1"), net.ParseIP("::")} + dnsNames := append([]string{"localhost"}, extraDNSSANs...) + ipAddresses := append([]net.IP{net.ParseIP("127.0.0.1"), net.ParseIP("::1")}, extraIPSANs...) // Create a CSR. // @@ -74,6 +58,31 @@ func (c *Client) RequestAutoEncryptCerts(ctx context.Context, servers []string, // override this when it is signed anyway so it's OK. cn := connect.AgentCN(c.config.NodeName, dummyTrustDomain) csr, err := connect.CreateCSR(id, cn, pk, dnsNames, ipAddresses) + if err != nil { + return "", "", err + } + + return pkPEM, csr, nil +} + +func (c *Client) RequestAutoEncryptCerts(ctx context.Context, servers []string, port int, token string, extraDNSSANs []string, extraIPSANs []net.IP) (*structs.SignedResponse, string, error) { + errFn := func(err error) (*structs.SignedResponse, string, error) { + return nil, "", err + } + + // Check if we know about a server already through gossip. Depending on + // how the agent joined, there might already be one. Also in case this + // gets called because the cert expired. + server := c.routers.FindServer() + if server != nil { + servers = []string{server.Addr.String()} + } + + if len(servers) == 0 { + return errFn(fmt.Errorf("No servers to request AutoEncrypt.Sign")) + } + + pkPEM, csr, err := c.autoEncryptCSR(extraDNSSANs, extraIPSANs) if err != nil { return errFn(err) } diff --git a/agent/consul/auto_encrypt_test.go b/agent/consul/auto_encrypt_test.go index 85ed253f4e88..fd8725725f21 100644 --- a/agent/consul/auto_encrypt_test.go +++ b/agent/consul/auto_encrypt_test.go @@ -2,11 +2,17 @@ package consul import ( "context" + "crypto/x509" + "crypto/x509/pkix" + "encoding/asn1" "net" + "net/url" "os" "testing" "time" + "github.com/hashicorp/consul/agent/connect" + "github.com/hashicorp/consul/agent/structs" "github.com/hashicorp/consul/sdk/testutil" "github.com/hashicorp/go-hclog" "github.com/stretchr/testify/require" @@ -98,7 +104,7 @@ func TestAutoEncrypt_RequestAutoEncryptCerts(t *testing.T) { doneCh := make(chan struct{}) var err error go func() { - _, _, err = c1.RequestAutoEncryptCerts(ctx, servers, port, token) + _, _, err = c1.RequestAutoEncryptCerts(ctx, servers, port, token, nil, nil) close(doneCh) }() select { @@ -113,3 +119,87 @@ func TestAutoEncrypt_RequestAutoEncryptCerts(t *testing.T) { // try to request certs. } } + +func TestAutoEncrypt_autoEncryptCSR(t *testing.T) { + type testCase struct { + conf *Config + extraDNSSANs []string + extraIPSANs []net.IP + err string + + // to validate the csr + expectedSubject pkix.Name + expectedSigAlg x509.SignatureAlgorithm + expectedPubAlg x509.PublicKeyAlgorithm + expectedDNSNames []string + expectedIPs []net.IP + expectedURIs []*url.URL + } + + cases := map[string]testCase{ + "sans": { + conf: &Config{ + Datacenter: "dc1", + NodeName: "test-node", + CAConfig: &structs.CAConfiguration{}, + }, + extraDNSSANs: []string{"foo.local", "bar.local"}, + extraIPSANs: []net.IP{net.IPv4(198, 18, 0, 1), net.IPv4(198, 18, 0, 2)}, + expectedSubject: pkix.Name{ + CommonName: connect.AgentCN("test-node", dummyTrustDomain), + Names: []pkix.AttributeTypeAndValue{ + { + // 2,5,4,3 is the CommonName type ASN1 identifier + Type: asn1.ObjectIdentifier{2, 5, 4, 3}, + Value: "testnode.agnt.dummy.tr.consul", + }, + }, + }, + expectedSigAlg: x509.ECDSAWithSHA256, + expectedPubAlg: x509.ECDSA, + expectedDNSNames: []string{ + "localhost", + "foo.local", + "bar.local", + }, + expectedIPs: []net.IP{ + {127, 0, 0, 1}, + net.ParseIP("::1"), + {198, 18, 0, 1}, + {198, 18, 0, 2}, + }, + expectedURIs: []*url.URL{ + { + Scheme: "spiffe", + Host: dummyTrustDomain, + Path: "/agent/client/dc/dc1/id/test-node", + }, + }, + }, + } + + for name, tcase := range cases { + t.Run(name, func(t *testing.T) { + client := Client{config: tcase.conf} + + _, csr, err := client.autoEncryptCSR(tcase.extraDNSSANs, tcase.extraIPSANs) + if tcase.err == "" { + require.NoError(t, err) + + request, err := connect.ParseCSR(csr) + require.NoError(t, err) + require.NotNil(t, request) + + require.Equal(t, tcase.expectedSubject, request.Subject) + require.Equal(t, tcase.expectedSigAlg, request.SignatureAlgorithm) + require.Equal(t, tcase.expectedPubAlg, request.PublicKeyAlgorithm) + require.Equal(t, tcase.expectedDNSNames, request.DNSNames) + require.Equal(t, tcase.expectedIPs, request.IPAddresses) + require.Equal(t, tcase.expectedURIs, request.URIs) + } else { + require.Error(t, err) + require.Empty(t, csr) + } + }) + } +} diff --git a/agent/consul/connect_ca_endpoint.go b/agent/consul/connect_ca_endpoint.go index 81a511ea7382..995cdd3f3dd6 100644 --- a/agent/consul/connect_ca_endpoint.go +++ b/agent/consul/connect_ca_endpoint.go @@ -4,12 +4,11 @@ import ( "context" "errors" "fmt" + "net/url" "reflect" "strings" - "sync" "time" - "github.com/hashicorp/consul/lib/semaphore" "github.com/hashicorp/go-hclog" "golang.org/x/time/rate" @@ -53,57 +52,6 @@ type ConnectCA struct { srv *Server logger hclog.Logger - - // csrRateLimiter limits the rate of signing new certs if configured. Lazily - // initialized from current config to support dynamic changes. - // csrRateLimiterMu must be held while dereferencing the pointer or storing a - // new one, but methods can be called on the limiter object outside of the - // locked section. This is done only in the getCSRRateLimiterWithLimit method. - csrRateLimiter *rate.Limiter - csrRateLimiterMu sync.RWMutex - - // csrConcurrencyLimiter is a dynamically resizable semaphore used to limit - // Sign RPC concurrency if configured. The zero value is usable as soon as - // SetSize is called which we do dynamically in the RPC handler to avoid - // having to hook elaborate synchronization mechanisms through the CA config - // endpoint and config reload etc. - csrConcurrencyLimiter semaphore.Dynamic -} - -// getCSRRateLimiterWithLimit returns a rate.Limiter with the desired limit set. -// It uses the shared server-wide limiter unless the limit has been changed in -// config or the limiter has not been setup yet in which case it just-in-time -// configures the new limiter. We assume that limit changes are relatively rare -// and that all callers (there is currently only one) use the same config value -// as the limit. There might be some flapping if there are multiple concurrent -// requests in flight at the time the config changes where A sees the new value -// and updates, B sees the old but then gets this lock second and changes back. -// Eventually though and very soon (once all current RPCs are complete) we are -// guaranteed to have the correct limit set by the next RPC that comes in so I -// assume this is fine. If we observe strange behavior because of it, we could -// add hysteresis that prevents changes too soon after a previous change but -// that seems unnecessary for now. -func (s *ConnectCA) getCSRRateLimiterWithLimit(limit rate.Limit) *rate.Limiter { - s.csrRateLimiterMu.RLock() - lim := s.csrRateLimiter - s.csrRateLimiterMu.RUnlock() - - // If there is a current limiter with the same limit, return it. This should - // be the common case. - if lim != nil && lim.Limit() == limit { - return lim - } - - // Need to change limiter, get write lock - s.csrRateLimiterMu.Lock() - defer s.csrRateLimiterMu.Unlock() - // No limiter yet, or limit changed in CA config, reconfigure a new limiter. - // We use burst of 1 for a hard limit. Note that either bursting or waiting is - // necessary to get expected behavior in fact of random arrival times, but we - // don't need both and we use Wait with a small delay to smooth noise. See - // https://github.com/banks/sim-rate-limit-backoff/blob/master/README.md. - s.csrRateLimiter = rate.NewLimiter(limit, 1) - return s.csrRateLimiter } // ConfigurationGet returns the configuration for the CA. @@ -480,6 +428,30 @@ func (s *ConnectCA) Sign( return fmt.Errorf("SPIFFE ID in CSR from a different trust domain: %s, "+ "we are %s", serviceID.Host, signingID.Host()) } + } else { + // isAgent - if we support more ID types then this would need to be an else if + // here we are just automatically fixing the trust domain. For auto-encrypt and + // auto-config they make certificate requests before learning about the roots + // so they will have a dummy trust domain in the CSR. + trustDomain := signingID.Host() + if agentID.Host != trustDomain { + originalURI := agentID.URI() + + agentID.Host = trustDomain + csr.Subject.CommonName = connect.AgentCN(agentID.Agent, trustDomain) + + // recreate the URIs list + uris := make([]*url.URL, len(csr.URIs)) + for i, uri := range csr.URIs { + if originalURI.String() == uri.String() { + uris[i] = agentID.URI() + } else { + uris[i] = uri + } + } + + csr.URIs = uris + } } // Verify that the ACL token provided has permission to act as this service @@ -514,7 +486,7 @@ func (s *ConnectCA) Sign( return err } if commonCfg.CSRMaxPerSecond > 0 { - lim := s.getCSRRateLimiterWithLimit(rate.Limit(commonCfg.CSRMaxPerSecond)) + lim := s.srv.caLeafLimiter.getCSRRateLimiterWithLimit(rate.Limit(commonCfg.CSRMaxPerSecond)) // Wait up to the small threshold we allow for a token. ctx, cancel := context.WithTimeout(context.Background(), csrLimitWait) defer cancel() @@ -522,13 +494,13 @@ func (s *ConnectCA) Sign( return ErrRateLimited } } else if commonCfg.CSRMaxConcurrent > 0 { - s.csrConcurrencyLimiter.SetSize(int64(commonCfg.CSRMaxConcurrent)) + s.srv.caLeafLimiter.csrConcurrencyLimiter.SetSize(int64(commonCfg.CSRMaxConcurrent)) ctx, cancel := context.WithTimeout(context.Background(), csrLimitWait) defer cancel() - if err := s.csrConcurrencyLimiter.Acquire(ctx); err != nil { + if err := s.srv.caLeafLimiter.csrConcurrencyLimiter.Acquire(ctx); err != nil { return ErrRateLimited } - defer s.csrConcurrencyLimiter.Release() + defer s.srv.caLeafLimiter.csrConcurrencyLimiter.Release() } // All seems to be in order, actually sign it. diff --git a/agent/consul/server.go b/agent/consul/server.go index 33c5a37d28c6..d95da2718119 100644 --- a/agent/consul/server.go +++ b/agent/consul/server.go @@ -153,6 +153,9 @@ type Server struct { caProviderRoot *structs.CARoot caProviderLock sync.RWMutex + // rate limiter to use when signing leaf certificates + caLeafLimiter connectSignRateLimiter + // Consul configuration config *Config diff --git a/agent/consul/server_connect.go b/agent/consul/server_connect.go new file mode 100644 index 000000000000..2dc2bed2d1fd --- /dev/null +++ b/agent/consul/server_connect.go @@ -0,0 +1,61 @@ +package consul + +import ( + "sync" + + "github.com/hashicorp/consul/lib/semaphore" + "golang.org/x/time/rate" +) + +type connectSignRateLimiter struct { + // csrRateLimiter limits the rate of signing new certs if configured. Lazily + // initialized from current config to support dynamic changes. + // csrRateLimiterMu must be held while dereferencing the pointer or storing a + // new one, but methods can be called on the limiter object outside of the + // locked section. This is done only in the getCSRRateLimiterWithLimit method. + csrRateLimiter *rate.Limiter + csrRateLimiterMu sync.RWMutex + + // csrConcurrencyLimiter is a dynamically resizable semaphore used to limit + // Sign RPC concurrency if configured. The zero value is usable as soon as + // SetSize is called which we do dynamically in the RPC handler to avoid + // having to hook elaborate synchronization mechanisms through the CA config + // endpoint and config reload etc. + csrConcurrencyLimiter semaphore.Dynamic +} + +// getCSRRateLimiterWithLimit returns a rate.Limiter with the desired limit set. +// It uses the shared server-wide limiter unless the limit has been changed in +// config or the limiter has not been setup yet in which case it just-in-time +// configures the new limiter. We assume that limit changes are relatively rare +// and that all callers (there is currently only one) use the same config value +// as the limit. There might be some flapping if there are multiple concurrent +// requests in flight at the time the config changes where A sees the new value +// and updates, B sees the old but then gets this lock second and changes back. +// Eventually though and very soon (once all current RPCs are complete) we are +// guaranteed to have the correct limit set by the next RPC that comes in so I +// assume this is fine. If we observe strange behavior because of it, we could +// add hysteresis that prevents changes too soon after a previous change but +// that seems unnecessary for now. +func (l *connectSignRateLimiter) getCSRRateLimiterWithLimit(limit rate.Limit) *rate.Limiter { + l.csrRateLimiterMu.RLock() + lim := l.csrRateLimiter + l.csrRateLimiterMu.RUnlock() + + // If there is a current limiter with the same limit, return it. This should + // be the common case. + if lim != nil && lim.Limit() == limit { + return lim + } + + // Need to change limiter, get write lock + l.csrRateLimiterMu.Lock() + defer l.csrRateLimiterMu.Unlock() + // No limiter yet, or limit changed in CA config, reconfigure a new limiter. + // We use burst of 1 for a hard limit. Note that either bursting or waiting is + // necessary to get expected behavior in fact of random arrival times, but we + // don't need both and we use Wait with a small delay to smooth noise. See + // https://github.com/banks/sim-rate-limit-backoff/blob/master/README.md. + l.csrRateLimiter = rate.NewLimiter(limit, 1) + return l.csrRateLimiter +} diff --git a/tlsutil/config.go b/tlsutil/config.go index 240bb1f15aee..5d782d42ad30 100644 --- a/tlsutil/config.go +++ b/tlsutil/config.go @@ -505,6 +505,12 @@ func (c *Configurator) commonTLSConfig(verifyIncoming bool) *tls.Config { cert = c.manual.cert } + if cert == nil { + // the return value MUST not be nil but an empty certificate will be + // treated the same as having no client certificate + cert = &tls.Certificate{} + } + return cert, nil } diff --git a/tlsutil/config_test.go b/tlsutil/config_test.go index 91bd99de82fa..917cd2d77926 100644 --- a/tlsutil/config_test.go +++ b/tlsutil/config_test.go @@ -649,7 +649,8 @@ func TestConfigurator_CommonTLSConfigGetClientCertificate(t *testing.T) { cert, err := c.commonTLSConfig(false).GetClientCertificate(nil) require.NoError(t, err) - require.Nil(t, cert) + require.NotNil(t, cert) + require.Empty(t, cert.Certificate) c1, err := loadKeyPair("../test/key/something_expired.cer", "../test/key/something_expired.key") require.NoError(t, err)