Skip to content

Commit

Permalink
Change error handling when scraping metrics
Browse files Browse the repository at this point in the history
* If Envoy returns an error then also respond with a 500 in our merged
metrics response so that Prometheus will know that we had an error, not
that there are no metrics.
* If the service metrics return with a non-2xx status code then don't
include the response body in the merged metrics. This will stop issues
where users accidentally turn on metrics merging but they don't have an
exporter and so their metrics endpoint returns 404. I could have
responded with a 500 in this case in order to indicate that there is an
error, however I think it's more likely that users are accidentally
turning on metrics merging and the error indication is accomplished via
a new metric (see below).
* Append a new metric that indicates the success of the service
scraping. This can be used for alerting by users since the response code
of the service metrics response is discarded:
  * success: consul_metrics_merging_service_metrics_success 1
  * fail: consul_metrics_merging_service_metrics_success 0
* modify logging to use key/value pairs
* Fixes #546
  • Loading branch information
lkysow committed Jul 5, 2021
1 parent 7d7ce5d commit a109f30
Show file tree
Hide file tree
Showing 2 changed files with 119 additions and 51 deletions.
70 changes: 55 additions & 15 deletions subcommand/consul-sidecar/command.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,13 @@ import (
"github.com/mitchellh/cli"
)

const metricsServerShutdownTimeout = 5 * time.Second
const envoyMetricsAddr = "http://127.0.0.1:19000/stats/prometheus"
const (
metricsServerShutdownTimeout = 5 * time.Second
envoyMetricsAddr = "http://127.0.0.1:19000/stats/prometheus"
// prometheusServiceMetricsSuccessKey is the key of the prometheus metrics used to
// indicate if service metrics were scraped successfully.
prometheusServiceMetricsSuccessKey = "consul_metrics_merging_service_metrics_success"
)

type Command struct {
UI cli.Ui
Expand Down Expand Up @@ -243,57 +248,77 @@ func (c *Command) createMergedMetricsServer() *http.Server {
mergedMetricsServerAddr := fmt.Sprintf("127.0.0.1:%s", c.flagMergedMetricsPort)
server := &http.Server{Addr: mergedMetricsServerAddr, Handler: mux}

// http.Client satisfies the metricsGetter interface.
// The default http.Client timeout is indefinite, so adding a timeout makes
// sure that requests don't hang.
client := &http.Client{
Timeout: time.Second * 10,
}
// http.Client satisfies the metricsGetter interface.
c.envoyMetricsGetter = client
c.serviceMetricsGetter = client

// During tests these may already be set to mocks.
if c.envoyMetricsGetter == nil {
c.envoyMetricsGetter = client
}
if c.serviceMetricsGetter == nil {
c.serviceMetricsGetter = client
}

return server
}

// mergedMetricsHandler has the logic to append both Envoy and service metrics
// together, logging if it's unsuccessful at either.
// If the Envoy scrape fails, we respond with a 500 code which follows the Prometheus
// exporter guidelines. If the service scrape fails, we respond with a 200 so
// that the Envoy metrics are still scraped.
// We also include a metric line in each response indicating the success or
// failure of the service metric scraping.
func (c *Command) mergedMetricsHandler(rw http.ResponseWriter, _ *http.Request) {

envoyMetrics, err := c.envoyMetricsGetter.Get(envoyMetricsAddr)
if err != nil {
// If there is an error scraping Envoy, we want the handler to return
// without writing anything to the response, and log the error.
c.logger.Error(fmt.Sprintf("Error scraping Envoy proxy metrics: %s", err.Error()))
c.logger.Error("Error scraping Envoy proxy metrics", "err", err)
http.Error(rw, fmt.Sprintf("Error scraping Envoy proxy metrics: %s", err), http.StatusInternalServerError)
return
}

// Write Envoy metrics to the response.
defer envoyMetrics.Body.Close()
envoyMetricsBody, err := ioutil.ReadAll(envoyMetrics.Body)
if err != nil {
c.logger.Error(fmt.Sprintf("Couldn't read Envoy proxy metrics: %s", err.Error()))
c.logger.Error("Could not read Envoy proxy metrics", "err", err)
http.Error(rw, fmt.Sprintf("Could not read Envoy proxy metrics: %s", err), http.StatusInternalServerError)
return
}
if non2xxCode(envoyMetrics.StatusCode) {
c.logger.Error("Received non-2xx status code scraping Envoy proxy metrics", "code", envoyMetrics.StatusCode, "response", string(envoyMetricsBody))
http.Error(rw, fmt.Sprintf("Received non-2xx status code scraping Envoy proxy metrics: %d: %s", envoyMetrics.StatusCode, string(envoyMetricsBody)), http.StatusInternalServerError)
return
}
rw.Write(envoyMetricsBody)

serviceMetricsAddr := fmt.Sprintf("http://127.0.0.1:%s%s", c.flagServiceMetricsPort, c.flagServiceMetricsPath)
serviceMetrics, err := c.serviceMetricsGetter.Get(serviceMetricsAddr)
if err != nil {
c.logger.Warn(fmt.Sprintf("Error scraping service metrics: %s", err.Error()))
c.logger.Warn("Error scraping service metrics", "err", err)
rw.Write(serviceMetricSuccess(false))
// Since we've already written the Envoy metrics to the response, we can
// return at this point if we were unable to get service metrics.
return
}

// Since serviceMetrics will be non-nil if there are no errors, write the
// service metrics to the response as well.
defer serviceMetrics.Body.Close()
serviceMetricsBody, err := ioutil.ReadAll(serviceMetrics.Body)
if err != nil {
c.logger.Error(fmt.Sprintf("Couldn't read service metrics: %s", err.Error()))
c.logger.Error("Could not read service metrics", "err", err)
rw.Write(serviceMetricSuccess(false))
return
}
if non2xxCode(serviceMetrics.StatusCode) {
c.logger.Error("Received non-2xx status code scraping service metrics", "code", serviceMetrics.StatusCode, "response", string(serviceMetricsBody))
rw.Write(serviceMetricSuccess(false))
return
}
rw.Write(serviceMetricsBody)
rw.Write(serviceMetricSuccess(true))
}

// validateFlags validates the flags.
Expand Down Expand Up @@ -326,6 +351,21 @@ func (c *Command) validateFlags() error {
return nil
}

// non2xxCode returns true if code is not in the range of 200-299 inclusive.
func non2xxCode(code int) bool {
return code < 200 || code >= 300
}

// serviceMetricSuccess returns a prometheus metric line indicating
// the success of the metrics merging.
func serviceMetricSuccess(success bool) []byte {
boolAsInt := 0
if success {
boolAsInt = 1
}
return []byte(fmt.Sprintf("%s %d\n", prometheusServiceMetricsSuccessKey, boolAsInt))
}

// parseConsulFlags creates Consul client command flags
// from command's HTTP flags and returns them as an array of strings.
func (c *Command) parseConsulFlags() []string {
Expand Down
100 changes: 64 additions & 36 deletions subcommand/consul-sidecar/command_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -214,50 +214,88 @@ func TestRunSignalHandlingAllProcessesEnabled(t *testing.T) {
}
}

type envoyMetrics struct {
type mockEnvoyMetricsGetter struct {
respStatusCode int
}

func (em *envoyMetrics) Get(url string) (resp *http.Response, err error) {
func (em *mockEnvoyMetricsGetter) Get(_ string) (resp *http.Response, err error) {
response := &http.Response{}
response.StatusCode = em.respStatusCode
response.Body = ioutil.NopCloser(bytes.NewReader([]byte("envoy metrics\n")))
return response, nil
}

type serviceMetrics struct {
url string
// mockServiceMetricsGetter
type mockServiceMetricsGetter struct {
// reqURL is the last URL that was passed to Get(url)
reqURL string

// respStatusCode is the status code to use for the response.
respStatusCode int
}

func (sm *serviceMetrics) Get(url string) (resp *http.Response, err error) {
func (sm *mockServiceMetricsGetter) Get(url string) (resp *http.Response, err error) {
// Record the URL that we were called with.
sm.reqURL = url

response := &http.Response{}
response.Body = ioutil.NopCloser(bytes.NewReader([]byte("service metrics\n")))
sm.url = url
response.StatusCode = sm.respStatusCode

return response, nil
}

func TestMergedMetricsServer(t *testing.T) {
cases := []struct {
name string
runEnvoyMetricsServer bool
runServiceMetricsServer bool
expectedOutput string
name string
envoyMetricsGetter *mockEnvoyMetricsGetter
serviceMetricsGetter *mockServiceMetricsGetter
expectedStatusCode int
expectedOutput string
}{
{
name: "happy path: envoy and service metrics are merged",
runEnvoyMetricsServer: true,
runServiceMetricsServer: true,
expectedOutput: "envoy metrics\nservice metrics\n",
name: "happy path: envoy and service metrics are merged",
envoyMetricsGetter: &mockEnvoyMetricsGetter{
respStatusCode: 200,
},
serviceMetricsGetter: &mockServiceMetricsGetter{
respStatusCode: 200,
},
expectedStatusCode: 200,
expectedOutput: "envoy metrics\nservice metrics\nconsul_metrics_merging_service_metrics_success 1\n",
},
{
name: "no service metrics",
runEnvoyMetricsServer: true,
runServiceMetricsServer: false,
expectedOutput: "envoy metrics\n",
name: "service metrics non-200",
envoyMetricsGetter: &mockEnvoyMetricsGetter{
respStatusCode: 200,
},
serviceMetricsGetter: &mockServiceMetricsGetter{
respStatusCode: 404,
},
expectedStatusCode: 200,
expectedOutput: "envoy metrics\nconsul_metrics_merging_service_metrics_success 0\n",
},
{
name: "no envoy metrics",
runEnvoyMetricsServer: false,
runServiceMetricsServer: true,
expectedOutput: "",
name: "envoy metrics non-200",
envoyMetricsGetter: &mockEnvoyMetricsGetter{
respStatusCode: 404,
},
serviceMetricsGetter: &mockServiceMetricsGetter{
respStatusCode: 200,
},
expectedStatusCode: 500,
expectedOutput: "Received non-2xx status code scraping Envoy proxy metrics: 404: envoy metrics\n\n",
},
{
name: "envoy and service metrics non-200",
envoyMetricsGetter: &mockEnvoyMetricsGetter{
respStatusCode: 500,
},
serviceMetricsGetter: &mockServiceMetricsGetter{
respStatusCode: 500,
},
expectedStatusCode: 500,
expectedOutput: "Received non-2xx status code scraping Envoy proxy metrics: 500: envoy metrics\n\n",
},
}

Expand All @@ -272,21 +310,11 @@ func TestMergedMetricsServer(t *testing.T) {
flagServiceMetricsPort: fmt.Sprint(randomPorts[1]),
flagServiceMetricsPath: "/metrics",
logger: hclog.Default(),
envoyMetricsGetter: c.envoyMetricsGetter,
serviceMetricsGetter: c.serviceMetricsGetter,
}

server := cmd.createMergedMetricsServer()

// Override the cmd's envoyMetricsGetter and serviceMetricsGetter
// with stubs.
em := &envoyMetrics{}
sm := &serviceMetrics{}
if c.runEnvoyMetricsServer {
cmd.envoyMetricsGetter = em
}
if c.runServiceMetricsServer {
cmd.serviceMetricsGetter = sm
}

go func() {
_ = server.ListenAndServe()
}()
Expand All @@ -304,8 +332,8 @@ func TestMergedMetricsServer(t *testing.T) {
// Verify the correct service metrics url was used. The service
// metrics endpoint is only called if the Envoy metrics endpoint
// call succeeds.
if c.runServiceMetricsServer && c.runEnvoyMetricsServer {
require.Equal(r, fmt.Sprintf("http://127.0.0.1:%d%s", randomPorts[1], "/metrics"), sm.url)
if c.envoyMetricsGetter.respStatusCode == 200 {
require.Equal(r, fmt.Sprintf("http://127.0.0.1:%d%s", randomPorts[1], "/metrics"), c.serviceMetricsGetter.reqURL)
}
})
})
Expand Down

0 comments on commit a109f30

Please sign in to comment.