Skip to content

Commit

Permalink
Change error handling when scraping metrics (#551)
Browse files Browse the repository at this point in the history
* If Envoy returns an error then also respond with a 500 in our merged
metrics response so that Prometheus will know that we had an error, not
that there are no metrics.
* If the service metrics return with a non-2xx status code then don't
include the response body in the merged metrics. This will stop issues
where users accidentally turn on metrics merging but they don't have an
exporter and so their metrics endpoint returns 404. I could have
responded with a 500 in this case in order to indicate that there is an
error, however I think it's more likely that users are accidentally
turning on metrics merging and the error indication is accomplished via
a new metric (see below).
* Append a new metric that indicates the success of the service
scraping. This can be used for alerting by users since the response code
of the service metrics response is discarded:
  * success: consul_metrics_merging_service_metrics_success 1
  * fail: consul_metrics_merging_service_metrics_success 0
* modify logging to use key/value pairs
* Fixes #546
  • Loading branch information
lkysow authored Dec 6, 2021
1 parent e0f3b71 commit d08821b
Show file tree
Hide file tree
Showing 3 changed files with 131 additions and 53 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
BREAKING CHANGES:
* Control Plane
* Update minimum go version for project to 1.17 [[GH-878](https://github.com/hashicorp/consul-k8s/pull/878)]
* Add boolean metric to merged metrics response `consul_merged_service_metrics_success` to indicate if service metrics were
scraped successfully. [[GH-551](https://github.com/hashicorp/consul-k8s/pull/551)]

IMPROVEMENTS:
* CLI
Expand All @@ -16,6 +18,8 @@ IMPROVEMENTS:
BUG FIXES:
* Control Plane:
* Add a workaround to check that the ACL token is replicated to other Consul servers. [[GH-862](https://github.com/hashicorp/consul-k8s/issues/862)]
* Return 500 on prometheus response if unable to get metrics from Envoy. [[GH-551](https://github.com/hashicorp/consul-k8s/pull/551)]
* Don't include body of failed service metrics calls in merged metrics response. [[GH-551](https://github.com/hashicorp/consul-k8s/pull/551)]
* Helm Chart
* Admin Partitions **(Consul Enterprise only)**: Do not mount Consul CA certs to partition-init job if `externalServers.useSystemRoots` is `true`. [[GH-885](https://github.com/hashicorp/consul-k8s/pull/885)]

Expand Down
80 changes: 63 additions & 17 deletions control-plane/subcommand/consul-sidecar/command.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,13 @@ import (
"github.com/mitchellh/cli"
)

const metricsServerShutdownTimeout = 5 * time.Second
const envoyMetricsAddr = "http://127.0.0.1:19000/stats/prometheus"
const (
metricsServerShutdownTimeout = 5 * time.Second
envoyMetricsAddr = "http://127.0.0.1:19000/stats/prometheus"
// prometheusServiceMetricsSuccessKey is the key of the prometheus metric used to
// indicate if service metrics were scraped successfully.
prometheusServiceMetricsSuccessKey = "consul_merged_service_metrics_success"
)

type Command struct {
UI cli.Ui
Expand Down Expand Up @@ -240,27 +245,36 @@ func (c *Command) createMergedMetricsServer() *http.Server {
mergedMetricsServerAddr := fmt.Sprintf("127.0.0.1:%s", c.flagMergedMetricsPort)
server := &http.Server{Addr: mergedMetricsServerAddr, Handler: mux}

// http.Client satisfies the metricsGetter interface.
// The default http.Client timeout is indefinite, so adding a timeout makes
// sure that requests don't hang.
client := &http.Client{
Timeout: time.Second * 10,
}
// http.Client satisfies the metricsGetter interface.
c.envoyMetricsGetter = client
c.serviceMetricsGetter = client

// During tests these may already be set to mocks.
if c.envoyMetricsGetter == nil {
c.envoyMetricsGetter = client
}
if c.serviceMetricsGetter == nil {
c.serviceMetricsGetter = client
}

return server
}

// mergedMetricsHandler has the logic to append both Envoy and service metrics
// together, logging if it's unsuccessful at either.
// If the Envoy scrape fails, we respond with a 500 code which follows the Prometheus
// exporter guidelines. If the service scrape fails, we respond with a 200 so
// that the Envoy metrics are still scraped.
// We also include a metric line in each response indicating the success or
// failure of the service metric scraping.
func (c *Command) mergedMetricsHandler(rw http.ResponseWriter, _ *http.Request) {

envoyMetrics, err := c.envoyMetricsGetter.Get(envoyMetricsAddr)
if err != nil {
// If there is an error scraping Envoy, we want the handler to return
// without writing anything to the response, and log the error.
c.logger.Error(fmt.Sprintf("Error scraping Envoy proxy metrics: %s", err.Error()))
c.logger.Error("Error scraping Envoy proxy metrics", "err", err)
http.Error(rw, fmt.Sprintf("Error scraping Envoy proxy metrics: %s", err), http.StatusInternalServerError)
return
}

Expand All @@ -273,18 +287,22 @@ func (c *Command) mergedMetricsHandler(rw http.ResponseWriter, _ *http.Request)
}()
envoyMetricsBody, err := ioutil.ReadAll(envoyMetrics.Body)
if err != nil {
c.logger.Error(fmt.Sprintf("Couldn't read Envoy proxy metrics: %s", err.Error()))
c.logger.Error("Could not read Envoy proxy metrics", "err", err)
http.Error(rw, fmt.Sprintf("Could not read Envoy proxy metrics: %s", err), http.StatusInternalServerError)
return
}
_, err = rw.Write(envoyMetricsBody)
if err != nil {
c.logger.Error(fmt.Sprintf("Error writing envoy metrics body: %s", err.Error()))
if non2xxCode(envoyMetrics.StatusCode) {
c.logger.Error("Received non-2xx status code scraping Envoy proxy metrics", "code", envoyMetrics.StatusCode, "response", string(envoyMetricsBody))
http.Error(rw, fmt.Sprintf("Received non-2xx status code scraping Envoy proxy metrics: %d: %s", envoyMetrics.StatusCode, string(envoyMetricsBody)), http.StatusInternalServerError)
return
}
writeResponse(rw, envoyMetricsBody, "envoy metrics", c.logger)

serviceMetricsAddr := fmt.Sprintf("http://127.0.0.1:%s%s", c.flagServiceMetricsPort, c.flagServiceMetricsPath)
serviceMetrics, err := c.serviceMetricsGetter.Get(serviceMetricsAddr)
if err != nil {
c.logger.Warn(fmt.Sprintf("Error scraping service metrics: %s", err.Error()))
c.logger.Warn("Error scraping service metrics", "err", err)
writeResponse(rw, serviceMetricSuccess(false), "service metrics success", c.logger)
// Since we've already written the Envoy metrics to the response, we can
// return at this point if we were unable to get service metrics.
return
Expand All @@ -300,12 +318,25 @@ func (c *Command) mergedMetricsHandler(rw http.ResponseWriter, _ *http.Request)
}()
serviceMetricsBody, err := ioutil.ReadAll(serviceMetrics.Body)
if err != nil {
c.logger.Error(fmt.Sprintf("Couldn't read service metrics: %s", err.Error()))
c.logger.Error("Could not read service metrics", "err", err)
writeResponse(rw, serviceMetricSuccess(false), "service metrics success", c.logger)
return
}
_, err = rw.Write(serviceMetricsBody)
if non2xxCode(serviceMetrics.StatusCode) {
c.logger.Error("Received non-2xx status code scraping service metrics", "code", serviceMetrics.StatusCode, "response", string(serviceMetricsBody))
writeResponse(rw, serviceMetricSuccess(false), "service metrics success", c.logger)
return
}
writeResponse(rw, serviceMetricsBody, "service metrics", c.logger)
writeResponse(rw, serviceMetricSuccess(true), "service metrics success", c.logger)
}

// writeResponse is a helper method to write resp to rw and log if there is an error writing.
// respName is the name of this response that will be used in the error log.
func writeResponse(rw http.ResponseWriter, resp []byte, respName string, logger hclog.Logger) {
_, err := rw.Write(resp)
if err != nil {
c.logger.Error(fmt.Sprintf("Error writing service metrics body: %s", err.Error()))
logger.Error(fmt.Sprintf("Error writing %s: %s", respName, err.Error()))
}
}

Expand Down Expand Up @@ -339,6 +370,21 @@ func (c *Command) validateFlags() error {
return nil
}

// non2xxCode returns true if code is not in the range of 200-299 inclusive.
func non2xxCode(code int) bool {
return code < 200 || code >= 300
}

// serviceMetricSuccess returns a prometheus metric line indicating
// the success of the metrics merging.
func serviceMetricSuccess(success bool) []byte {
boolAsInt := 0
if success {
boolAsInt = 1
}
return []byte(fmt.Sprintf("%s %d\n", prometheusServiceMetricsSuccessKey, boolAsInt))
}

// parseConsulFlags creates Consul client command flags
// from command's HTTP flags and returns them as an array of strings.
func (c *Command) parseConsulFlags() []string {
Expand Down
100 changes: 64 additions & 36 deletions control-plane/subcommand/consul-sidecar/command_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -214,50 +214,88 @@ func TestRunSignalHandlingAllProcessesEnabled(t *testing.T) {
}
}

type envoyMetrics struct {
type mockEnvoyMetricsGetter struct {
respStatusCode int
}

func (em *envoyMetrics) Get(url string) (resp *http.Response, err error) {
func (em *mockEnvoyMetricsGetter) Get(_ string) (resp *http.Response, err error) {
response := &http.Response{}
response.StatusCode = em.respStatusCode
response.Body = ioutil.NopCloser(bytes.NewReader([]byte("envoy metrics\n")))
return response, nil
}

type serviceMetrics struct {
url string
// mockServiceMetricsGetter
type mockServiceMetricsGetter struct {
// reqURL is the last URL that was passed to Get(url)
reqURL string

// respStatusCode is the status code to use for the response.
respStatusCode int
}

func (sm *serviceMetrics) Get(url string) (resp *http.Response, err error) {
func (sm *mockServiceMetricsGetter) Get(url string) (resp *http.Response, err error) {
// Record the URL that we were called with.
sm.reqURL = url

response := &http.Response{}
response.Body = ioutil.NopCloser(bytes.NewReader([]byte("service metrics\n")))
sm.url = url
response.StatusCode = sm.respStatusCode

return response, nil
}

func TestMergedMetricsServer(t *testing.T) {
cases := []struct {
name string
runEnvoyMetricsServer bool
runServiceMetricsServer bool
expectedOutput string
name string
envoyMetricsGetter *mockEnvoyMetricsGetter
serviceMetricsGetter *mockServiceMetricsGetter
expectedStatusCode int
expectedOutput string
}{
{
name: "happy path: envoy and service metrics are merged",
runEnvoyMetricsServer: true,
runServiceMetricsServer: true,
expectedOutput: "envoy metrics\nservice metrics\n",
name: "happy path: envoy and service metrics are merged",
envoyMetricsGetter: &mockEnvoyMetricsGetter{
respStatusCode: 200,
},
serviceMetricsGetter: &mockServiceMetricsGetter{
respStatusCode: 200,
},
expectedStatusCode: 200,
expectedOutput: "envoy metrics\nservice metrics\nconsul_merged_service_metrics_success 1\n",
},
{
name: "no service metrics",
runEnvoyMetricsServer: true,
runServiceMetricsServer: false,
expectedOutput: "envoy metrics\n",
name: "service metrics non-200",
envoyMetricsGetter: &mockEnvoyMetricsGetter{
respStatusCode: 200,
},
serviceMetricsGetter: &mockServiceMetricsGetter{
respStatusCode: 404,
},
expectedStatusCode: 200,
expectedOutput: "envoy metrics\nconsul_merged_service_metrics_success 0\n",
},
{
name: "no envoy metrics",
runEnvoyMetricsServer: false,
runServiceMetricsServer: true,
expectedOutput: "",
name: "envoy metrics non-200",
envoyMetricsGetter: &mockEnvoyMetricsGetter{
respStatusCode: 404,
},
serviceMetricsGetter: &mockServiceMetricsGetter{
respStatusCode: 200,
},
expectedStatusCode: 500,
expectedOutput: "Received non-2xx status code scraping Envoy proxy metrics: 404: envoy metrics\n\n",
},
{
name: "envoy and service metrics non-200",
envoyMetricsGetter: &mockEnvoyMetricsGetter{
respStatusCode: 500,
},
serviceMetricsGetter: &mockServiceMetricsGetter{
respStatusCode: 500,
},
expectedStatusCode: 500,
expectedOutput: "Received non-2xx status code scraping Envoy proxy metrics: 500: envoy metrics\n\n",
},
}

Expand All @@ -272,21 +310,11 @@ func TestMergedMetricsServer(t *testing.T) {
flagServiceMetricsPort: fmt.Sprint(randomPorts[1]),
flagServiceMetricsPath: "/metrics",
logger: hclog.Default(),
envoyMetricsGetter: c.envoyMetricsGetter,
serviceMetricsGetter: c.serviceMetricsGetter,
}

server := cmd.createMergedMetricsServer()

// Override the cmd's envoyMetricsGetter and serviceMetricsGetter
// with stubs.
em := &envoyMetrics{}
sm := &serviceMetrics{}
if c.runEnvoyMetricsServer {
cmd.envoyMetricsGetter = em
}
if c.runServiceMetricsServer {
cmd.serviceMetricsGetter = sm
}

go func() {
_ = server.ListenAndServe()
}()
Expand All @@ -304,8 +332,8 @@ func TestMergedMetricsServer(t *testing.T) {
// Verify the correct service metrics url was used. The service
// metrics endpoint is only called if the Envoy metrics endpoint
// call succeeds.
if c.runServiceMetricsServer && c.runEnvoyMetricsServer {
require.Equal(r, fmt.Sprintf("http://127.0.0.1:%d%s", randomPorts[1], "/metrics"), sm.url)
if c.envoyMetricsGetter.respStatusCode == 200 {
require.Equal(r, fmt.Sprintf("http://127.0.0.1:%d%s", randomPorts[1], "/metrics"), c.serviceMetricsGetter.reqURL)
}
})
})
Expand Down

0 comments on commit d08821b

Please sign in to comment.