diff --git a/agent/xds/clusters.go b/agent/xds/clusters.go index 5cb2bff43ed3..f74b6e5dd09d 100644 --- a/agent/xds/clusters.go +++ b/agent/xds/clusters.go @@ -360,8 +360,7 @@ func (s *Server) makeUpstreamClusterForPreparedQuery(upstream structs.Upstream, CircuitBreakers: &envoycluster.CircuitBreakers{ Thresholds: makeThresholdsIfNeeded(cfg.Limits), }, - // Having an empty config enables outlier detection with default config. - OutlierDetection: &envoycluster.OutlierDetection{}, + OutlierDetection: cfg.PassiveHealthCheck.AsOutlierDetection(), } if cfg.Protocol == "http2" || cfg.Protocol == "grpc" { c.Http2ProtocolOptions = &envoycore.Http2ProtocolOptions{} @@ -462,8 +461,7 @@ func (s *Server) makeUpstreamClustersForDiscoveryChain( CircuitBreakers: &envoycluster.CircuitBreakers{ Thresholds: makeThresholdsIfNeeded(cfg.Limits), }, - // Having an empty config enables outlier detection with default config. - OutlierDetection: &envoycluster.OutlierDetection{}, + OutlierDetection: cfg.PassiveHealthCheck.AsOutlierDetection(), } proto := cfg.Protocol diff --git a/agent/xds/config.go b/agent/xds/config.go index a97161e4db58..756ca51f44f3 100644 --- a/agent/xds/config.go +++ b/agent/xds/config.go @@ -1,10 +1,13 @@ package xds import ( - "github.com/hashicorp/consul/lib" "strings" + "time" + envoycluster "github.com/envoyproxy/go-control-plane/envoy/api/v2/cluster" + "github.com/gogo/protobuf/types" "github.com/hashicorp/consul/agent/structs" + "github.com/hashicorp/consul/lib" "github.com/mitchellh/mapstructure" ) @@ -160,11 +163,48 @@ type UpstreamConfig struct { // Limits are the set of limits that are applied to the proxy for a specific upstream of a // service instance. Limits UpstreamLimits `mapstructure:"limits"` + + // PassiveHealthCheck configuration + PassiveHealthCheck PassiveHealthCheck `mapstructure:"passive_health_check"` +} + +type PassiveHealthCheck struct { + // Interval between health check analysis sweeps. Each sweep may remove + // hosts or return hosts to the pool. + Interval time.Duration + // MaxFailures is the count of consecutive failures that results in a host + // being removed from the pool. + MaxFailures uint32 `mapstructure:"max_failures"` +} + +// Return an envoy.OutlierDetection populated by the values from this struct. +// If all values are zero a default empty OutlierDetection will be returned to +// enable outlier detection with default values. +func (p PassiveHealthCheck) AsOutlierDetection() *envoycluster.OutlierDetection { + od := &envoycluster.OutlierDetection{} + if p.Interval != 0 { + od.Interval = types.DurationProto(p.Interval) + } + if p.MaxFailures != 0 { + od.Consecutive_5Xx = &types.UInt32Value{Value: p.MaxFailures} + } + return od } func ParseUpstreamConfigNoDefaults(m map[string]interface{}) (UpstreamConfig, error) { var cfg UpstreamConfig - err := mapstructure.WeakDecode(m, &cfg) + config := &mapstructure.DecoderConfig{ + DecodeHook: mapstructure.StringToTimeDurationHookFunc(), + Result: &cfg, + WeaklyTypedInput: true, + } + + decoder, err := mapstructure.NewDecoder(config) + if err != nil { + return cfg, err + } + + err = decoder.Decode(m) return cfg, err } diff --git a/agent/xds/config_test.go b/agent/xds/config_test.go index 0957e6d6ed5f..23629089e8be 100644 --- a/agent/xds/config_test.go +++ b/agent/xds/config_test.go @@ -1,9 +1,10 @@ package xds import ( - "github.com/hashicorp/consul/agent/structs" "testing" + "time" + "github.com/hashicorp/consul/agent/structs" "github.com/stretchr/testify/require" ) @@ -244,6 +245,23 @@ func TestParseUpstreamConfig(t *testing.T) { }, }, }, + { + name: "passive health check map", + input: map[string]interface{}{ + "passive_health_check": map[string]interface{}{ + "interval": "22s", + "max_failures": 7, + }, + }, + want: UpstreamConfig{ + ConnectTimeoutMs: 5000, + Protocol: "tcp", + PassiveHealthCheck: PassiveHealthCheck{ + Interval: 22 * time.Second, + MaxFailures: 7, + }, + }, + }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { diff --git a/test/integration/connect/envoy/case-centralconf/s1.hcl b/test/integration/connect/envoy/case-centralconf/s1.hcl index 7f48b1c515a1..0d8957c000b5 100644 --- a/test/integration/connect/envoy/case-centralconf/s1.hcl +++ b/test/integration/connect/envoy/case-centralconf/s1.hcl @@ -13,4 +13,4 @@ services { } } } -} \ No newline at end of file +} diff --git a/test/integration/connect/envoy/case-upstream-connection-limits/s1.hcl b/test/integration/connect/envoy/case-upstream-config/s1.hcl similarity index 78% rename from test/integration/connect/envoy/case-upstream-connection-limits/s1.hcl rename to test/integration/connect/envoy/case-upstream-config/s1.hcl index 3bdf91c05878..b3d4db73b8fd 100644 --- a/test/integration/connect/envoy/case-upstream-connection-limits/s1.hcl +++ b/test/integration/connect/envoy/case-upstream-config/s1.hcl @@ -14,6 +14,10 @@ services { max_pending_requests = 4 max_concurrent_requests = 5 } + passive_health_check { + interval = "22s" + max_failures = 4 + } } } ] diff --git a/test/integration/connect/envoy/case-upstream-connection-limits/s2.hcl b/test/integration/connect/envoy/case-upstream-config/s2.hcl similarity index 100% rename from test/integration/connect/envoy/case-upstream-connection-limits/s2.hcl rename to test/integration/connect/envoy/case-upstream-config/s2.hcl diff --git a/test/integration/connect/envoy/case-upstream-connection-limits/setup.sh b/test/integration/connect/envoy/case-upstream-config/setup.sh similarity index 100% rename from test/integration/connect/envoy/case-upstream-connection-limits/setup.sh rename to test/integration/connect/envoy/case-upstream-config/setup.sh diff --git a/test/integration/connect/envoy/case-upstream-connection-limits/verify.bats b/test/integration/connect/envoy/case-upstream-config/verify.bats similarity index 72% rename from test/integration/connect/envoy/case-upstream-connection-limits/verify.bats rename to test/integration/connect/envoy/case-upstream-config/verify.bats index 116f78cc176a..a6c025ac89a6 100644 --- a/test/integration/connect/envoy/case-upstream-connection-limits/verify.bats +++ b/test/integration/connect/envoy/case-upstream-config/verify.bats @@ -27,7 +27,7 @@ load helpers } @test "s1 proxy should have been configured with max_connections on the cluster" { - CLUSTER_THRESHOLD=$(get_envoy_cluster_threshold localhost:19000 s2.default.primary) + CLUSTER_THRESHOLD=$(get_envoy_cluster_config localhost:19000 s2.default.primary | jq '.circuit_breakers.thresholds[0]') echo $CLUSTER_THRESHOLD MAX_CONNS=$(echo $CLUSTER_THRESHOLD | jq --raw-output '.max_connections') @@ -42,3 +42,11 @@ load helpers [ "$MAX_PENDING_REQS" = "4" ] [ "$MAX_REQS" = "5" ] } + +@test "s1 proxy should have been configured with passive_health_check" { + CLUSTER_CONFIG=$(get_envoy_cluster_config localhost:19000 s2.default.primary) + echo $CLUSTER_CONFIG + + [ "$(echo $CLUSTER_CONFIG | jq --raw-output '.outlier_detection.consecutive_5xx')" = "4" ] + [ "$(echo $CLUSTER_CONFIG | jq --raw-output '.outlier_detection.interval')" = "22s" ] +} diff --git a/test/integration/connect/envoy/helpers.bash b/test/integration/connect/envoy/helpers.bash index 94f4ca59f1bc..3f8cbd5cc9c5 100755 --- a/test/integration/connect/envoy/helpers.bash +++ b/test/integration/connect/envoy/helpers.bash @@ -156,7 +156,7 @@ function get_envoy_listener_filters { echo "$output" | jq --raw-output "$QUERY" } -function get_envoy_cluster_threshold { +function get_envoy_cluster_config { local HOSTPORT=$1 local CLUSTER_NAME=$2 run retry_default curl -s -f $HOSTPORT/config_dump @@ -164,7 +164,7 @@ function get_envoy_cluster_threshold { echo "$output" | jq --raw-output " .configs[1].dynamic_active_clusters[] | select(.cluster.name|startswith(\"${CLUSTER_NAME}\")) - | .cluster.circuit_breakers.thresholds[0] + | .cluster " } diff --git a/website/pages/docs/connect/proxies/envoy.mdx b/website/pages/docs/connect/proxies/envoy.mdx index a028fb3ba7cb..8209ea7ea7e0 100644 --- a/website/pages/docs/connect/proxies/envoy.mdx +++ b/website/pages/docs/connect/proxies/envoy.mdx @@ -277,6 +277,17 @@ definition](/docs/connect/registration/service-registration) or since HTTP/2 has many requests per connection. For this configuration to be respected, a L7 protocol must be defined in the `protocol` field. +- `passive_health_check` - Passive health checks are used to remove hosts from + the upstream cluster which are unreachable or are returning errors. + + - `interval` - The time between checks. Each check will cause hosts which + have exceeded `max_failures` to be removed from the load balancer, and + any hosts which have passed their ejection time to be returned to the + load balancer. + - `max_failures` - The number of consecutive failures which cause a host to be + removed from the load balancer. + + ### Gateway Options These fields may also be overridden explicitly in the [proxy service