diff --git a/CHANGELOG.md b/CHANGELOG.md index 873d49b793..d977798e5c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,8 @@ BREAKING CHANGES: * Control Plane * Update minimum go version for project to 1.17 [[GH-878](https://github.com/hashicorp/consul-k8s/pull/878)] + * Add boolean metric to merged metrics response `consul_merged_service_metrics_success` to indicate if service metrics were + scraped successfully. [[GH-551](https://github.com/hashicorp/consul-k8s/pull/551)] IMPROVEMENTS: * CLI @@ -16,6 +18,8 @@ IMPROVEMENTS: BUG FIXES: * Control Plane: * Add a workaround to check that the ACL token is replicated to other Consul servers. [[GH-862](https://github.com/hashicorp/consul-k8s/issues/862)] + * Return 500 on prometheus response if unable to get metrics from Envoy. [[GH-551](https://github.com/hashicorp/consul-k8s/pull/551)] + * Don't include body of failed service metrics calls in merged metrics response. [[GH-551](https://github.com/hashicorp/consul-k8s/pull/551)] * Helm Chart * Admin Partitions **(Consul Enterprise only)**: Do not mount Consul CA certs to partition-init job if `externalServers.useSystemRoots` is `true`. [[GH-885](https://github.com/hashicorp/consul-k8s/pull/885)] diff --git a/control-plane/subcommand/consul-sidecar/command.go b/control-plane/subcommand/consul-sidecar/command.go index dcb05199ba..b6ff0d9d39 100644 --- a/control-plane/subcommand/consul-sidecar/command.go +++ b/control-plane/subcommand/consul-sidecar/command.go @@ -21,8 +21,13 @@ import ( "github.com/mitchellh/cli" ) -const metricsServerShutdownTimeout = 5 * time.Second -const envoyMetricsAddr = "http://127.0.0.1:19000/stats/prometheus" +const ( + metricsServerShutdownTimeout = 5 * time.Second + envoyMetricsAddr = "http://127.0.0.1:19000/stats/prometheus" + // prometheusServiceMetricsSuccessKey is the key of the prometheus metric used to + // indicate if service metrics were scraped successfully. + prometheusServiceMetricsSuccessKey = "consul_merged_service_metrics_success" +) type Command struct { UI cli.Ui @@ -240,27 +245,36 @@ func (c *Command) createMergedMetricsServer() *http.Server { mergedMetricsServerAddr := fmt.Sprintf("127.0.0.1:%s", c.flagMergedMetricsPort) server := &http.Server{Addr: mergedMetricsServerAddr, Handler: mux} + // http.Client satisfies the metricsGetter interface. // The default http.Client timeout is indefinite, so adding a timeout makes // sure that requests don't hang. client := &http.Client{ Timeout: time.Second * 10, } - // http.Client satisfies the metricsGetter interface. - c.envoyMetricsGetter = client - c.serviceMetricsGetter = client + + // During tests these may already be set to mocks. + if c.envoyMetricsGetter == nil { + c.envoyMetricsGetter = client + } + if c.serviceMetricsGetter == nil { + c.serviceMetricsGetter = client + } return server } // mergedMetricsHandler has the logic to append both Envoy and service metrics // together, logging if it's unsuccessful at either. +// If the Envoy scrape fails, we respond with a 500 code which follows the Prometheus +// exporter guidelines. If the service scrape fails, we respond with a 200 so +// that the Envoy metrics are still scraped. +// We also include a metric line in each response indicating the success or +// failure of the service metric scraping. func (c *Command) mergedMetricsHandler(rw http.ResponseWriter, _ *http.Request) { - envoyMetrics, err := c.envoyMetricsGetter.Get(envoyMetricsAddr) if err != nil { - // If there is an error scraping Envoy, we want the handler to return - // without writing anything to the response, and log the error. - c.logger.Error(fmt.Sprintf("Error scraping Envoy proxy metrics: %s", err.Error())) + c.logger.Error("Error scraping Envoy proxy metrics", "err", err) + http.Error(rw, fmt.Sprintf("Error scraping Envoy proxy metrics: %s", err), http.StatusInternalServerError) return } @@ -273,18 +287,22 @@ func (c *Command) mergedMetricsHandler(rw http.ResponseWriter, _ *http.Request) }() envoyMetricsBody, err := ioutil.ReadAll(envoyMetrics.Body) if err != nil { - c.logger.Error(fmt.Sprintf("Couldn't read Envoy proxy metrics: %s", err.Error())) + c.logger.Error("Could not read Envoy proxy metrics", "err", err) + http.Error(rw, fmt.Sprintf("Could not read Envoy proxy metrics: %s", err), http.StatusInternalServerError) return } - _, err = rw.Write(envoyMetricsBody) - if err != nil { - c.logger.Error(fmt.Sprintf("Error writing envoy metrics body: %s", err.Error())) + if non2xxCode(envoyMetrics.StatusCode) { + c.logger.Error("Received non-2xx status code scraping Envoy proxy metrics", "code", envoyMetrics.StatusCode, "response", string(envoyMetricsBody)) + http.Error(rw, fmt.Sprintf("Received non-2xx status code scraping Envoy proxy metrics: %d: %s", envoyMetrics.StatusCode, string(envoyMetricsBody)), http.StatusInternalServerError) + return } + writeResponse(rw, envoyMetricsBody, "envoy metrics", c.logger) serviceMetricsAddr := fmt.Sprintf("http://127.0.0.1:%s%s", c.flagServiceMetricsPort, c.flagServiceMetricsPath) serviceMetrics, err := c.serviceMetricsGetter.Get(serviceMetricsAddr) if err != nil { - c.logger.Warn(fmt.Sprintf("Error scraping service metrics: %s", err.Error())) + c.logger.Warn("Error scraping service metrics", "err", err) + writeResponse(rw, serviceMetricSuccess(false), "service metrics success", c.logger) // Since we've already written the Envoy metrics to the response, we can // return at this point if we were unable to get service metrics. return @@ -300,12 +318,25 @@ func (c *Command) mergedMetricsHandler(rw http.ResponseWriter, _ *http.Request) }() serviceMetricsBody, err := ioutil.ReadAll(serviceMetrics.Body) if err != nil { - c.logger.Error(fmt.Sprintf("Couldn't read service metrics: %s", err.Error())) + c.logger.Error("Could not read service metrics", "err", err) + writeResponse(rw, serviceMetricSuccess(false), "service metrics success", c.logger) return } - _, err = rw.Write(serviceMetricsBody) + if non2xxCode(serviceMetrics.StatusCode) { + c.logger.Error("Received non-2xx status code scraping service metrics", "code", serviceMetrics.StatusCode, "response", string(serviceMetricsBody)) + writeResponse(rw, serviceMetricSuccess(false), "service metrics success", c.logger) + return + } + writeResponse(rw, serviceMetricsBody, "service metrics", c.logger) + writeResponse(rw, serviceMetricSuccess(true), "service metrics success", c.logger) +} + +// writeResponse is a helper method to write resp to rw and log if there is an error writing. +// respName is the name of this response that will be used in the error log. +func writeResponse(rw http.ResponseWriter, resp []byte, respName string, logger hclog.Logger) { + _, err := rw.Write(resp) if err != nil { - c.logger.Error(fmt.Sprintf("Error writing service metrics body: %s", err.Error())) + logger.Error(fmt.Sprintf("Error writing %s: %s", respName, err.Error())) } } @@ -339,6 +370,21 @@ func (c *Command) validateFlags() error { return nil } +// non2xxCode returns true if code is not in the range of 200-299 inclusive. +func non2xxCode(code int) bool { + return code < 200 || code >= 300 +} + +// serviceMetricSuccess returns a prometheus metric line indicating +// the success of the metrics merging. +func serviceMetricSuccess(success bool) []byte { + boolAsInt := 0 + if success { + boolAsInt = 1 + } + return []byte(fmt.Sprintf("%s %d\n", prometheusServiceMetricsSuccessKey, boolAsInt)) +} + // parseConsulFlags creates Consul client command flags // from command's HTTP flags and returns them as an array of strings. func (c *Command) parseConsulFlags() []string { diff --git a/control-plane/subcommand/consul-sidecar/command_test.go b/control-plane/subcommand/consul-sidecar/command_test.go index 6ee55f8ea0..370b4afd8f 100644 --- a/control-plane/subcommand/consul-sidecar/command_test.go +++ b/control-plane/subcommand/consul-sidecar/command_test.go @@ -214,50 +214,88 @@ func TestRunSignalHandlingAllProcessesEnabled(t *testing.T) { } } -type envoyMetrics struct { +type mockEnvoyMetricsGetter struct { + respStatusCode int } -func (em *envoyMetrics) Get(url string) (resp *http.Response, err error) { +func (em *mockEnvoyMetricsGetter) Get(_ string) (resp *http.Response, err error) { response := &http.Response{} + response.StatusCode = em.respStatusCode response.Body = ioutil.NopCloser(bytes.NewReader([]byte("envoy metrics\n"))) return response, nil } -type serviceMetrics struct { - url string +// mockServiceMetricsGetter +type mockServiceMetricsGetter struct { + // reqURL is the last URL that was passed to Get(url) + reqURL string + + // respStatusCode is the status code to use for the response. + respStatusCode int } -func (sm *serviceMetrics) Get(url string) (resp *http.Response, err error) { +func (sm *mockServiceMetricsGetter) Get(url string) (resp *http.Response, err error) { + // Record the URL that we were called with. + sm.reqURL = url + response := &http.Response{} response.Body = ioutil.NopCloser(bytes.NewReader([]byte("service metrics\n"))) - sm.url = url + response.StatusCode = sm.respStatusCode + return response, nil } func TestMergedMetricsServer(t *testing.T) { cases := []struct { - name string - runEnvoyMetricsServer bool - runServiceMetricsServer bool - expectedOutput string + name string + envoyMetricsGetter *mockEnvoyMetricsGetter + serviceMetricsGetter *mockServiceMetricsGetter + expectedStatusCode int + expectedOutput string }{ { - name: "happy path: envoy and service metrics are merged", - runEnvoyMetricsServer: true, - runServiceMetricsServer: true, - expectedOutput: "envoy metrics\nservice metrics\n", + name: "happy path: envoy and service metrics are merged", + envoyMetricsGetter: &mockEnvoyMetricsGetter{ + respStatusCode: 200, + }, + serviceMetricsGetter: &mockServiceMetricsGetter{ + respStatusCode: 200, + }, + expectedStatusCode: 200, + expectedOutput: "envoy metrics\nservice metrics\nconsul_merged_service_metrics_success 1\n", }, { - name: "no service metrics", - runEnvoyMetricsServer: true, - runServiceMetricsServer: false, - expectedOutput: "envoy metrics\n", + name: "service metrics non-200", + envoyMetricsGetter: &mockEnvoyMetricsGetter{ + respStatusCode: 200, + }, + serviceMetricsGetter: &mockServiceMetricsGetter{ + respStatusCode: 404, + }, + expectedStatusCode: 200, + expectedOutput: "envoy metrics\nconsul_merged_service_metrics_success 0\n", }, { - name: "no envoy metrics", - runEnvoyMetricsServer: false, - runServiceMetricsServer: true, - expectedOutput: "", + name: "envoy metrics non-200", + envoyMetricsGetter: &mockEnvoyMetricsGetter{ + respStatusCode: 404, + }, + serviceMetricsGetter: &mockServiceMetricsGetter{ + respStatusCode: 200, + }, + expectedStatusCode: 500, + expectedOutput: "Received non-2xx status code scraping Envoy proxy metrics: 404: envoy metrics\n\n", + }, + { + name: "envoy and service metrics non-200", + envoyMetricsGetter: &mockEnvoyMetricsGetter{ + respStatusCode: 500, + }, + serviceMetricsGetter: &mockServiceMetricsGetter{ + respStatusCode: 500, + }, + expectedStatusCode: 500, + expectedOutput: "Received non-2xx status code scraping Envoy proxy metrics: 500: envoy metrics\n\n", }, } @@ -272,21 +310,11 @@ func TestMergedMetricsServer(t *testing.T) { flagServiceMetricsPort: fmt.Sprint(randomPorts[1]), flagServiceMetricsPath: "/metrics", logger: hclog.Default(), + envoyMetricsGetter: c.envoyMetricsGetter, + serviceMetricsGetter: c.serviceMetricsGetter, } server := cmd.createMergedMetricsServer() - - // Override the cmd's envoyMetricsGetter and serviceMetricsGetter - // with stubs. - em := &envoyMetrics{} - sm := &serviceMetrics{} - if c.runEnvoyMetricsServer { - cmd.envoyMetricsGetter = em - } - if c.runServiceMetricsServer { - cmd.serviceMetricsGetter = sm - } - go func() { _ = server.ListenAndServe() }() @@ -304,8 +332,8 @@ func TestMergedMetricsServer(t *testing.T) { // Verify the correct service metrics url was used. The service // metrics endpoint is only called if the Envoy metrics endpoint // call succeeds. - if c.runServiceMetricsServer && c.runEnvoyMetricsServer { - require.Equal(r, fmt.Sprintf("http://127.0.0.1:%d%s", randomPorts[1], "/metrics"), sm.url) + if c.envoyMetricsGetter.respStatusCode == 200 { + require.Equal(r, fmt.Sprintf("http://127.0.0.1:%d%s", randomPorts[1], "/metrics"), c.serviceMetricsGetter.reqURL) } }) })