Skip to content

Commit

Permalink
Add proxy startup and liveness probe config. (#3450)
Browse files Browse the repository at this point in the history
This PR adds two new
`consul.hashicorp.com/sidecar-proxy-startup-failure-seconds` and
`consul.hashicorp.com/sidecar-proxy-liveness-failure-seconds` annotations that
are disabled by default.

When set to a value greater than zero, these configurations will enable their
corresponding startup / liveness probes for the Envoy proxy. This helps to
prevent scenarios where the Envoy proxy would hang and never recover.
  • Loading branch information
hashi-derek committed Jan 11, 2024
1 parent b5e4a1f commit a5e759e
Show file tree
Hide file tree
Showing 8 changed files with 286 additions and 78 deletions.
3 changes: 3 additions & 0 deletions .changelog/3450.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
```release-note:improvement
control-plane: Add new `consul.hashicorp.com/sidecar-proxy-startup-failure-seconds` and `consul.hashicorp.com/sidecar-proxy-liveness-failure-seconds` annotations that allow users to manually configure startup and liveness probes for Envoy sidecar proxies.
```
3 changes: 2 additions & 1 deletion charts/consul/templates/connect-inject-deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -254,7 +254,8 @@ spec:
-default-sidecar-proxy-lifecycle-shutdown-grace-period-seconds={{ .Values.connectInject.sidecarProxy.lifecycle.defaultShutdownGracePeriodSeconds }} \
-default-sidecar-proxy-lifecycle-graceful-port={{ .Values.connectInject.sidecarProxy.lifecycle.defaultGracefulPort }} \
-default-sidecar-proxy-lifecycle-graceful-shutdown-path="{{ .Values.connectInject.sidecarProxy.lifecycle.defaultGracefulShutdownPath }}" \
-default-sidecar-proxy-startup-failure-seconds={{ .Values.connectInject.sidecarProxy.defaultStartupFailureSeconds }} \
-default-sidecar-proxy-liveness-failure-seconds={{ .Values.connectInject.sidecarProxy.defaultLivenessFailureSeconds }} \
{{- if .Values.connectInject.initContainer }}
{{- $initResources := .Values.connectInject.initContainer.resources }}
{{- if not (kindIs "invalid" $initResources.limits.memory) }}
Expand Down
7 changes: 7 additions & 0 deletions charts/consul/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2552,6 +2552,13 @@ connectInject:
# @type: string
defaultGracefulShutdownPath: "/graceful_shutdown"

# Configures how long the k8s startup probe will wait before the proxy is considered to be unhealthy and the container is restarted.
# A value of zero disables the probe.
defaultStartupFailureSeconds: 0
# Configures how long the k8s liveness probe will wait before the proxy is considered to be unhealthy and the container is restarted.
# A value of zero disables the probe.
defaultLivenessFailureSeconds: 0

# The resource settings for the Connect injected init container. If null, the resources
# won't be set for the initContainer. The defaults are optimized for developer instances of
# Kubernetes, however they should be tweaked with the recommended defaults as shown below to speed up service registration times.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,14 @@ const (
// Enable this only if the application does not support health checks.
AnnotationUseProxyHealthCheck = "consul.hashicorp.com/use-proxy-health-check"

// AnnotationSidecarProxyStartupFailureSeconds configures how long the k8s startup probe will wait for
// success before the proxy is considered to be unhealthy and the container is restarted.
AnnotationSidecarProxyStartupFailureSeconds = "consul.hashicorp.com/sidecar-proxy-startup-failure-seconds"

// AnnotationSidecarProxyLivenessFailureSeconds configures how long the k8s liveness probe will wait
// before the proxy is considered to be unhealthy and the container is restarted.
AnnotationSidecarProxyLivenessFailureSeconds = "consul.hashicorp.com/sidecar-proxy-liveness-failure-seconds"

// annotations for sidecar proxy resource limits.
AnnotationSidecarProxyCPULimit = "consul.hashicorp.com/sidecar-proxy-cpu-limit"
AnnotationSidecarProxyCPURequest = "consul.hashicorp.com/sidecar-proxy-cpu-request"
Expand Down
91 changes: 87 additions & 4 deletions control-plane/connect-inject/webhook/consul_dataplane_sidecar.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,11 +47,11 @@ func (w *MeshWebhook) consulDataplaneSidecar(namespace corev1.Namespace, pod cor
containerName = fmt.Sprintf("%s-%s", sidecarContainer, mpi.serviceName)
}

var probe *corev1.Probe
var readinessProbe *corev1.Probe
if useProxyHealthCheck(pod) {
// If using the proxy health check for a service, configure an HTTP handler
// that queries the '/ready' endpoint of the proxy.
probe = &corev1.Probe{
readinessProbe = &corev1.Probe{
ProbeHandler: corev1.ProbeHandler{
HTTPGet: &corev1.HTTPGetAction{
Port: intstr.FromInt(constants.ProxyDefaultHealthPort + mpi.serviceIndex),
Expand All @@ -61,7 +61,7 @@ func (w *MeshWebhook) consulDataplaneSidecar(namespace corev1.Namespace, pod cor
InitialDelaySeconds: 1,
}
} else {
probe = &corev1.Probe{
readinessProbe = &corev1.Probe{
ProbeHandler: corev1.ProbeHandler{
TCPSocket: &corev1.TCPSocketAction{
Port: intstr.FromInt(constants.ProxyDefaultInboundPort + mpi.serviceIndex),
Expand All @@ -71,6 +71,27 @@ func (w *MeshWebhook) consulDataplaneSidecar(namespace corev1.Namespace, pod cor
}
}

// Configure optional probes on the proxy to force restart it in failure scenarios.
var startupProbe, livenessProbe *corev1.Probe
startupSeconds := w.getStartupFailureSeconds(pod)
livenessSeconds := w.getLivenessFailureSeconds(pod)
if startupSeconds > 0 {
startupProbe = &corev1.Probe{
// Use the same handler as the readiness probe.
ProbeHandler: readinessProbe.ProbeHandler,
PeriodSeconds: 1,
FailureThreshold: startupSeconds,
}
}
if livenessSeconds > 0 {
livenessProbe = &corev1.Probe{
// Use the same handler as the readiness probe.
ProbeHandler: readinessProbe.ProbeHandler,
PeriodSeconds: 1,
FailureThreshold: livenessSeconds,
}
}

container := corev1.Container{
Name: containerName,
Image: w.ImageConsulDataplane,
Expand Down Expand Up @@ -136,7 +157,9 @@ func (w *MeshWebhook) consulDataplaneSidecar(namespace corev1.Namespace, pod cor
},
},
Args: args,
ReadinessProbe: probe,
ReadinessProbe: readinessProbe,
StartupProbe: startupProbe,
LivenessProbe: livenessProbe,
}

if w.AuthMethod != "" {
Expand Down Expand Up @@ -506,3 +529,63 @@ func useProxyHealthCheck(pod corev1.Pod) bool {
}
return false
}

// getStartupFailureSeconds returns the number of seconds the k8s startup probe should keep waiting for the
// sidecar proxy to initialize before the container is considered unhealthy.
//
// The value is taken from the 'consul.hashicorp.com/sidecar-proxy-startup-failure-seconds' annotation when
// the pod sets it, and from w.DefaultSidecarProxyStartupFailureSeconds otherwise.
//
// A result of 0 means the probe is disabled; this is returned for zero, negative and unparseable values.
// Values larger than the int32 maximum are clamped to that maximum instead of being truncated, which could
// otherwise produce a negative or arbitrarily small threshold.
func (w *MeshWebhook) getStartupFailureSeconds(pod corev1.Pod) int32 {
	// Largest value representable by the int32 result.
	const maxSeconds = 1<<31 - 1

	seconds := int64(w.DefaultSidecarProxyStartupFailureSeconds)
	if v, ok := pod.Annotations[constants.AnnotationSidecarProxyStartupFailureSeconds]; ok {
		// The error is deliberately ignored: ParseInt yields 0 for malformed input, which disables the
		// probe, and saturates at the int32 bounds for out-of-range input. Both outcomes are handled below.
		seconds, _ = strconv.ParseInt(v, 10, 32)
	}

	switch {
	case seconds <= 0:
		return 0
	case seconds > maxSeconds:
		// Only reachable through an oversized default; annotation values are already saturated.
		return maxSeconds
	default:
		return int32(seconds)
	}
}

// getLivenessFailureSeconds returns the number of seconds the k8s liveness probe may keep failing before
// the sidecar proxy container is considered unhealthy.
//
// The value is taken from the 'consul.hashicorp.com/sidecar-proxy-liveness-failure-seconds' annotation when
// the pod sets it, and from w.DefaultSidecarProxyLivenessFailureSeconds otherwise.
//
// A result of 0 means the probe is disabled; this is returned for zero, negative and unparseable values.
// Values larger than the int32 maximum are clamped to that maximum instead of being truncated, which could
// otherwise produce a negative or arbitrarily small threshold.
func (w *MeshWebhook) getLivenessFailureSeconds(pod corev1.Pod) int32 {
	// Largest value representable by the int32 result.
	const maxSeconds = 1<<31 - 1

	seconds := int64(w.DefaultSidecarProxyLivenessFailureSeconds)
	if v, ok := pod.Annotations[constants.AnnotationSidecarProxyLivenessFailureSeconds]; ok {
		// The error is deliberately ignored: ParseInt yields 0 for malformed input, which disables the
		// probe, and saturates at the int32 bounds for out-of-range input. Both outcomes are handled below.
		seconds, _ = strconv.ParseInt(v, 10, 32)
	}

	switch {
	case seconds <= 0:
		return 0
	case seconds > maxSeconds:
		// Only reachable through an oversized default; annotation values are already saturated.
		return maxSeconds
	default:
		return int32(seconds)
	}
}

// getMetricsPorts builds the container port list used to expose metrics for the pod.
// The single entry is a TCP port named "prometheus"; the port needs a name so that it can be
// referenced by the Prometheus operator.
// https://github.com/hashicorp/consul-k8s/pull/1440
//
// It returns a nil slice and a nil error when metrics are not enabled for the pod or when no
// scrape port is configured.
func (w *MeshWebhook) getMetricsPorts(pod corev1.Pod) ([]corev1.ContainerPort, error) {
	metricsOn, err := w.MetricsConfig.EnableMetrics(pod)
	switch {
	case err != nil:
		return nil, fmt.Errorf("error determining if metrics are enabled: %w", err)
	case !metricsOn:
		return nil, nil
	}

	scrapePort, err := w.MetricsConfig.PrometheusScrapePort(pod)
	switch {
	case err != nil:
		return nil, fmt.Errorf("error parsing prometheus port from pod: %w", err)
	case scrapePort == "":
		return nil, nil
	}

	// The scrape port is provided as a string and must be numeric to be used as a container port.
	portNumber, err := strconv.Atoi(scrapePort)
	if err != nil {
		return nil, fmt.Errorf("error parsing prometheus port from pod: %w", err)
	}

	prometheusPort := corev1.ContainerPort{
		Name:          "prometheus",
		ContainerPort: int32(portNumber),
		Protocol:      corev1.ProtocolTCP,
	}
	return []corev1.ContainerPort{prometheusPort}, nil
}
163 changes: 129 additions & 34 deletions control-plane/connect-inject/webhook/consul_dataplane_sidecar_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -414,48 +414,143 @@ func TestHandlerConsulDataplaneSidecar_DNSProxy(t *testing.T) {
}

func TestHandlerConsulDataplaneSidecar_ProxyHealthCheck(t *testing.T) {
h := MeshWebhook{
ConsulConfig: &consul.Config{HTTPPort: 8500, GRPCPort: 8502},
ConsulAddress: "1.1.1.1",
LogLevel: "info",
}
pod := corev1.Pod{
ObjectMeta: metav1.ObjectMeta{
Annotations: map[string]string{
constants.AnnotationUseProxyHealthCheck: "true",
tests := map[string]struct {
changeHook func(*MeshWebhook)
changePod func(*corev1.Pod)
expectedReadiness *corev1.Probe
expectedStartup *corev1.Probe
expectedLiveness *corev1.Probe
}{
"readiness-only": {
changeHook: func(h *MeshWebhook) {},
changePod: func(p *corev1.Pod) {},
expectedReadiness: &corev1.Probe{
ProbeHandler: corev1.ProbeHandler{
HTTPGet: &corev1.HTTPGetAction{
Port: intstr.FromInt(21000),
Path: "/ready",
},
},
InitialDelaySeconds: 1,
},
},
Spec: corev1.PodSpec{
Containers: []corev1.Container{
{
Name: "web",
"default-values": {
changeHook: func(h *MeshWebhook) {
h.DefaultSidecarProxyStartupFailureSeconds = 11
h.DefaultSidecarProxyLivenessFailureSeconds = 22
},
changePod: func(p *corev1.Pod) {},
expectedReadiness: &corev1.Probe{
ProbeHandler: corev1.ProbeHandler{
HTTPGet: &corev1.HTTPGetAction{
Port: intstr.FromInt(21000),
Path: "/ready",
},
},
InitialDelaySeconds: 1,
},
expectedStartup: &corev1.Probe{
ProbeHandler: corev1.ProbeHandler{
HTTPGet: &corev1.HTTPGetAction{
Port: intstr.FromInt(21000),
Path: "/ready",
},
},
PeriodSeconds: 1,
FailureThreshold: 11,
},
expectedLiveness: &corev1.Probe{
ProbeHandler: corev1.ProbeHandler{
HTTPGet: &corev1.HTTPGetAction{
Port: intstr.FromInt(21000),
Path: "/ready",
},
},
PeriodSeconds: 1,
FailureThreshold: 22,
},
},
}
container, err := h.consulDataplaneSidecar(testNS, pod, multiPortInfo{})
expectedProbe := &corev1.Probe{
ProbeHandler: corev1.ProbeHandler{
HTTPGet: &corev1.HTTPGetAction{
Port: intstr.FromInt(21000),
Path: "/ready",
"override-default": {
changeHook: func(h *MeshWebhook) {
h.DefaultSidecarProxyStartupFailureSeconds = 11
h.DefaultSidecarProxyLivenessFailureSeconds = 22
},
changePod: func(p *corev1.Pod) {
p.ObjectMeta.Annotations[constants.AnnotationSidecarProxyStartupFailureSeconds] = "111"
p.ObjectMeta.Annotations[constants.AnnotationSidecarProxyLivenessFailureSeconds] = "222"
},
expectedReadiness: &corev1.Probe{
ProbeHandler: corev1.ProbeHandler{
HTTPGet: &corev1.HTTPGetAction{
Port: intstr.FromInt(21000),
Path: "/ready",
},
},
InitialDelaySeconds: 1,
},
expectedStartup: &corev1.Probe{
ProbeHandler: corev1.ProbeHandler{
HTTPGet: &corev1.HTTPGetAction{
Port: intstr.FromInt(21000),
Path: "/ready",
},
},
PeriodSeconds: 1,
FailureThreshold: 111,
},
expectedLiveness: &corev1.Probe{
ProbeHandler: corev1.ProbeHandler{
HTTPGet: &corev1.HTTPGetAction{
Port: intstr.FromInt(21000),
Path: "/ready",
},
},
PeriodSeconds: 1,
FailureThreshold: 222,
},
},
InitialDelaySeconds: 1,
}
require.NoError(t, err)
require.Contains(t, container.Args, "-envoy-ready-bind-port=21000")
require.Equal(t, expectedProbe, container.ReadinessProbe)
require.Contains(t, container.Env, corev1.EnvVar{
Name: "DP_ENVOY_READY_BIND_ADDRESS",
ValueFrom: &corev1.EnvVarSource{
FieldRef: &corev1.ObjectFieldSelector{FieldPath: "status.podIP"},
},
})
require.Contains(t, container.Ports, corev1.ContainerPort{
Name: "proxy-health-0",
ContainerPort: 21000,
})
for tn, tc := range tests {
t.Run(tn, func(t *testing.T) {
hook := MeshWebhook{
ConsulConfig: &consul.Config{HTTPPort: 8500, GRPCPort: 8502},
ConsulAddress: "1.1.1.1",
LogLevel: "info",
}
pod := corev1.Pod{
ObjectMeta: metav1.ObjectMeta{
Annotations: map[string]string{
constants.AnnotationUseProxyHealthCheck: "true",
},
},
Spec: corev1.PodSpec{
Containers: []corev1.Container{
{
Name: "web",
},
},
},
}
tc.changeHook(&hook)
tc.changePod(&pod)
container, err := hook.consulDataplaneSidecar(testNS, pod, multiPortInfo{})
require.NoError(t, err)
require.Contains(t, container.Args, "-envoy-ready-bind-port=21000")
require.Equal(t, tc.expectedReadiness, container.ReadinessProbe)
require.Equal(t, tc.expectedStartup, container.StartupProbe)
require.Equal(t, tc.expectedLiveness, container.LivenessProbe)
require.Contains(t, container.Env, corev1.EnvVar{
Name: "DP_ENVOY_READY_BIND_ADDRESS",
ValueFrom: &corev1.EnvVarSource{
FieldRef: &corev1.ObjectFieldSelector{FieldPath: "status.podIP"},
},
})
require.Contains(t, container.Ports, corev1.ContainerPort{
Name: "proxy-health-0",
ContainerPort: 21000,
})
})
}
}

func TestHandlerConsulDataplaneSidecar_ProxyHealthCheck_Multiport(t *testing.T) {
Expand Down
3 changes: 3 additions & 0 deletions control-plane/connect-inject/webhook/mesh_webhook.go
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,9 @@ type MeshWebhook struct {
DefaultProxyMemoryRequest resource.Quantity
DefaultProxyMemoryLimit resource.Quantity

DefaultSidecarProxyStartupFailureSeconds int
DefaultSidecarProxyLivenessFailureSeconds int

// LifecycleConfig contains proxy lifecycle management configuration from the inject-connect command and has methods to determine whether
// configuration should come from the default flags or annotations. The meshWebhook uses this to configure container sidecar proxy args.
LifecycleConfig lifecycle.Config
Expand Down
Loading

0 comments on commit a5e759e

Please sign in to comment.