Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Change error handling when scraping metrics #551

Merged
merged 1 commit into from
Dec 6, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
BREAKING CHANGES:
* Control Plane
* Update minimum go version for project to 1.17 [[GH-878](https://github.com/hashicorp/consul-k8s/pull/878)]
* Add boolean metric to merged metrics response `consul_merged_service_metrics_success` to indicate if service metrics were
scraped successfully. [[GH-551](https://github.com/hashicorp/consul-k8s/pull/551)]

IMPROVEMENTS:
* CLI
Expand All @@ -16,6 +18,8 @@ IMPROVEMENTS:
BUG FIXES:
* Control Plane:
* Add a workaround to check that the ACL token is replicated to other Consul servers. [[GH-862](https://github.com/hashicorp/consul-k8s/issues/862)]
* Return 500 on prometheus response if unable to get metrics from Envoy. [[GH-551](https://github.com/hashicorp/consul-k8s/pull/551)]
* Don't include body of failed service metrics calls in merged metrics response. [[GH-551](https://github.com/hashicorp/consul-k8s/pull/551)]
* Helm Chart
* Admin Partitions **(Consul Enterprise only)**: Do not mount Consul CA certs to partition-init job if `externalServers.useSystemRoots` is `true`. [[GH-885](https://github.com/hashicorp/consul-k8s/pull/885)]

Expand Down
80 changes: 63 additions & 17 deletions control-plane/subcommand/consul-sidecar/command.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,13 @@ import (
"github.com/mitchellh/cli"
)

const metricsServerShutdownTimeout = 5 * time.Second
const envoyMetricsAddr = "http://127.0.0.1:19000/stats/prometheus"
const (
metricsServerShutdownTimeout = 5 * time.Second
envoyMetricsAddr = "http://127.0.0.1:19000/stats/prometheus"
// prometheusServiceMetricsSuccessKey is the key of the prometheus metric used to
// indicate if service metrics were scraped successfully.
prometheusServiceMetricsSuccessKey = "consul_merged_service_metrics_success"
)

type Command struct {
UI cli.Ui
Expand Down Expand Up @@ -240,27 +245,36 @@ func (c *Command) createMergedMetricsServer() *http.Server {
mergedMetricsServerAddr := fmt.Sprintf("127.0.0.1:%s", c.flagMergedMetricsPort)
server := &http.Server{Addr: mergedMetricsServerAddr, Handler: mux}

// http.Client satisfies the metricsGetter interface.
// The default http.Client timeout is indefinite, so adding a timeout makes
// sure that requests don't hang.
client := &http.Client{
Timeout: time.Second * 10,
}
// http.Client satisfies the metricsGetter interface.
c.envoyMetricsGetter = client
c.serviceMetricsGetter = client

// During tests these may already be set to mocks.
if c.envoyMetricsGetter == nil {
c.envoyMetricsGetter = client
}
if c.serviceMetricsGetter == nil {
c.serviceMetricsGetter = client
}

return server
}

// mergedMetricsHandler has the logic to append both Envoy and service metrics
// together, logging if it's unsuccessful at either.
// If the Envoy scrape fails, we respond with a 500 code which follows the Prometheus
// exporter guidelines. If the service scrape fails, we respond with a 200 so
// that the Envoy metrics are still scraped.
// We also include a metric line in each response indicating the success or
// failure of the service metric scraping.
func (c *Command) mergedMetricsHandler(rw http.ResponseWriter, _ *http.Request) {

envoyMetrics, err := c.envoyMetricsGetter.Get(envoyMetricsAddr)
if err != nil {
// If there is an error scraping Envoy, we want the handler to return
// without writing anything to the response, and log the error.
c.logger.Error(fmt.Sprintf("Error scraping Envoy proxy metrics: %s", err.Error()))
c.logger.Error("Error scraping Envoy proxy metrics", "err", err)
http.Error(rw, fmt.Sprintf("Error scraping Envoy proxy metrics: %s", err), http.StatusInternalServerError)
return
}

Expand All @@ -273,18 +287,22 @@ func (c *Command) mergedMetricsHandler(rw http.ResponseWriter, _ *http.Request)
}()
envoyMetricsBody, err := ioutil.ReadAll(envoyMetrics.Body)
if err != nil {
c.logger.Error(fmt.Sprintf("Couldn't read Envoy proxy metrics: %s", err.Error()))
c.logger.Error("Could not read Envoy proxy metrics", "err", err)
http.Error(rw, fmt.Sprintf("Could not read Envoy proxy metrics: %s", err), http.StatusInternalServerError)
return
}
_, err = rw.Write(envoyMetricsBody)
if err != nil {
c.logger.Error(fmt.Sprintf("Error writing envoy metrics body: %s", err.Error()))
if non2xxCode(envoyMetrics.StatusCode) {
c.logger.Error("Received non-2xx status code scraping Envoy proxy metrics", "code", envoyMetrics.StatusCode, "response", string(envoyMetricsBody))
http.Error(rw, fmt.Sprintf("Received non-2xx status code scraping Envoy proxy metrics: %d: %s", envoyMetrics.StatusCode, string(envoyMetricsBody)), http.StatusInternalServerError)
return
}
writeResponse(rw, envoyMetricsBody, "envoy metrics", c.logger)

serviceMetricsAddr := fmt.Sprintf("http://127.0.0.1:%s%s", c.flagServiceMetricsPort, c.flagServiceMetricsPath)
serviceMetrics, err := c.serviceMetricsGetter.Get(serviceMetricsAddr)
if err != nil {
c.logger.Warn(fmt.Sprintf("Error scraping service metrics: %s", err.Error()))
c.logger.Warn("Error scraping service metrics", "err", err)
writeResponse(rw, serviceMetricSuccess(false), "service metrics success", c.logger)
// Since we've already written the Envoy metrics to the response, we can
// return at this point if we were unable to get service metrics.
return
Expand All @@ -300,12 +318,25 @@ func (c *Command) mergedMetricsHandler(rw http.ResponseWriter, _ *http.Request)
}()
serviceMetricsBody, err := ioutil.ReadAll(serviceMetrics.Body)
if err != nil {
c.logger.Error(fmt.Sprintf("Couldn't read service metrics: %s", err.Error()))
c.logger.Error("Could not read service metrics", "err", err)
writeResponse(rw, serviceMetricSuccess(false), "service metrics success", c.logger)
return
}
_, err = rw.Write(serviceMetricsBody)
if non2xxCode(serviceMetrics.StatusCode) {
c.logger.Error("Received non-2xx status code scraping service metrics", "code", serviceMetrics.StatusCode, "response", string(serviceMetricsBody))
writeResponse(rw, serviceMetricSuccess(false), "service metrics success", c.logger)
return
}
writeResponse(rw, serviceMetricsBody, "service metrics", c.logger)
writeResponse(rw, serviceMetricSuccess(true), "service metrics success", c.logger)
}

// writeResponse is a helper method to write resp to rw and log if there is an error writing.
// respName is the name of this response that will be used in the error log.
func writeResponse(rw http.ResponseWriter, resp []byte, respName string, logger hclog.Logger) {
_, err := rw.Write(resp)
if err != nil {
c.logger.Error(fmt.Sprintf("Error writing service metrics body: %s", err.Error()))
logger.Error(fmt.Sprintf("Error writing %s: %s", respName, err.Error()))
}
}

Expand Down Expand Up @@ -339,6 +370,21 @@ func (c *Command) validateFlags() error {
return nil
}

// non2xxCode returns true if code is not in the range of 200-299 inclusive.
func non2xxCode(code int) bool {
return code < 200 || code >= 300
}

// serviceMetricSuccess returns a prometheus metric line indicating
// the success of the metrics merging.
func serviceMetricSuccess(success bool) []byte {
boolAsInt := 0
if success {
boolAsInt = 1
}
return []byte(fmt.Sprintf("%s %d\n", prometheusServiceMetricsSuccessKey, boolAsInt))
}

// parseConsulFlags creates Consul client command flags
// from command's HTTP flags and returns them as an array of strings.
func (c *Command) parseConsulFlags() []string {
Expand Down
100 changes: 64 additions & 36 deletions control-plane/subcommand/consul-sidecar/command_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -214,50 +214,88 @@ func TestRunSignalHandlingAllProcessesEnabled(t *testing.T) {
}
}

type envoyMetrics struct {
type mockEnvoyMetricsGetter struct {
respStatusCode int
}

func (em *envoyMetrics) Get(url string) (resp *http.Response, err error) {
func (em *mockEnvoyMetricsGetter) Get(_ string) (resp *http.Response, err error) {
response := &http.Response{}
response.StatusCode = em.respStatusCode
response.Body = ioutil.NopCloser(bytes.NewReader([]byte("envoy metrics\n")))
return response, nil
}

type serviceMetrics struct {
url string
// mockServiceMetricsGetter
type mockServiceMetricsGetter struct {
// reqURL is the last URL that was passed to Get(url)
reqURL string

// respStatusCode is the status code to use for the response.
respStatusCode int
}

func (sm *serviceMetrics) Get(url string) (resp *http.Response, err error) {
func (sm *mockServiceMetricsGetter) Get(url string) (resp *http.Response, err error) {
// Record the URL that we were called with.
sm.reqURL = url

response := &http.Response{}
response.Body = ioutil.NopCloser(bytes.NewReader([]byte("service metrics\n")))
sm.url = url
response.StatusCode = sm.respStatusCode

return response, nil
}

func TestMergedMetricsServer(t *testing.T) {
cases := []struct {
name string
runEnvoyMetricsServer bool
runServiceMetricsServer bool
expectedOutput string
name string
envoyMetricsGetter *mockEnvoyMetricsGetter
serviceMetricsGetter *mockServiceMetricsGetter
expectedStatusCode int
expectedOutput string
}{
{
name: "happy path: envoy and service metrics are merged",
runEnvoyMetricsServer: true,
runServiceMetricsServer: true,
expectedOutput: "envoy metrics\nservice metrics\n",
name: "happy path: envoy and service metrics are merged",
envoyMetricsGetter: &mockEnvoyMetricsGetter{
respStatusCode: 200,
},
serviceMetricsGetter: &mockServiceMetricsGetter{
respStatusCode: 200,
},
expectedStatusCode: 200,
expectedOutput: "envoy metrics\nservice metrics\nconsul_merged_service_metrics_success 1\n",
},
{
name: "no service metrics",
runEnvoyMetricsServer: true,
runServiceMetricsServer: false,
expectedOutput: "envoy metrics\n",
name: "service metrics non-200",
envoyMetricsGetter: &mockEnvoyMetricsGetter{
respStatusCode: 200,
},
serviceMetricsGetter: &mockServiceMetricsGetter{
respStatusCode: 404,
},
expectedStatusCode: 200,
expectedOutput: "envoy metrics\nconsul_merged_service_metrics_success 0\n",
},
{
name: "no envoy metrics",
runEnvoyMetricsServer: false,
runServiceMetricsServer: true,
expectedOutput: "",
name: "envoy metrics non-200",
envoyMetricsGetter: &mockEnvoyMetricsGetter{
respStatusCode: 404,
},
serviceMetricsGetter: &mockServiceMetricsGetter{
respStatusCode: 200,
},
expectedStatusCode: 500,
expectedOutput: "Received non-2xx status code scraping Envoy proxy metrics: 404: envoy metrics\n\n",
},
{
name: "envoy and service metrics non-200",
envoyMetricsGetter: &mockEnvoyMetricsGetter{
respStatusCode: 500,
},
serviceMetricsGetter: &mockServiceMetricsGetter{
respStatusCode: 500,
},
expectedStatusCode: 500,
expectedOutput: "Received non-2xx status code scraping Envoy proxy metrics: 500: envoy metrics\n\n",
},
}

Expand All @@ -272,21 +310,11 @@ func TestMergedMetricsServer(t *testing.T) {
flagServiceMetricsPort: fmt.Sprint(randomPorts[1]),
flagServiceMetricsPath: "/metrics",
logger: hclog.Default(),
envoyMetricsGetter: c.envoyMetricsGetter,
serviceMetricsGetter: c.serviceMetricsGetter,
}

server := cmd.createMergedMetricsServer()

// Override the cmd's envoyMetricsGetter and serviceMetricsGetter
// with stubs.
em := &envoyMetrics{}
sm := &serviceMetrics{}
if c.runEnvoyMetricsServer {
cmd.envoyMetricsGetter = em
}
if c.runServiceMetricsServer {
cmd.serviceMetricsGetter = sm
}

go func() {
_ = server.ListenAndServe()
}()
Expand All @@ -304,8 +332,8 @@ func TestMergedMetricsServer(t *testing.T) {
// Verify the correct service metrics url was used. The service
// metrics endpoint is only called if the Envoy metrics endpoint
// call succeeds.
if c.runServiceMetricsServer && c.runEnvoyMetricsServer {
require.Equal(r, fmt.Sprintf("http://127.0.0.1:%d%s", randomPorts[1], "/metrics"), sm.url)
if c.envoyMetricsGetter.respStatusCode == 200 {
require.Equal(r, fmt.Sprintf("http://127.0.0.1:%d%s", randomPorts[1], "/metrics"), c.serviceMetricsGetter.reqURL)
}
})
})
Expand Down