Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add proxy startup and liveness probe config. #3450

Merged
merged 1 commit into from
Jan 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .changelog/3450.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
```release-note:improvement
control-plane: Add new `consul.hashicorp.com/sidecar-proxy-startup-failure-seconds` and `consul.hashicorp.com/sidecar-proxy-liveness-failure-seconds` annotations that allow users to manually configure startup and liveness probes for Envoy sidecar proxies.
```
3 changes: 2 additions & 1 deletion charts/consul/templates/connect-inject-deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -259,7 +259,8 @@ spec:
-default-sidecar-proxy-lifecycle-shutdown-grace-period-seconds={{ .Values.connectInject.sidecarProxy.lifecycle.defaultShutdownGracePeriodSeconds }} \
-default-sidecar-proxy-lifecycle-graceful-port={{ .Values.connectInject.sidecarProxy.lifecycle.defaultGracefulPort }} \
-default-sidecar-proxy-lifecycle-graceful-shutdown-path="{{ .Values.connectInject.sidecarProxy.lifecycle.defaultGracefulShutdownPath }}" \

-default-sidecar-proxy-startup-failure-seconds={{ .Values.connectInject.sidecarProxy.defaultStartupFailureSeconds }} \
-default-sidecar-proxy-liveness-failure-seconds={{ .Values.connectInject.sidecarProxy.defaultLivenessFailureSeconds }} \
{{- if .Values.connectInject.initContainer }}
{{- $initResources := .Values.connectInject.initContainer.resources }}
{{- if not (kindIs "invalid" $initResources.limits.memory) }}
Expand Down
7 changes: 7 additions & 0 deletions charts/consul/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2709,6 +2709,13 @@ connectInject:
# @type: string
defaultGracefulShutdownPath: "/graceful_shutdown"

# Configures how long the k8s startup probe will wait before the proxy is considered to be unhealthy and the container is restarted.
# A value of zero disables the probe.
defaultStartupFailureSeconds: 0
# Configures how long the k8s liveness probe will wait before the proxy is considered to be unhealthy and the container is restarted.
# A value of zero disables the probe.
defaultLivenessFailureSeconds: 0

# The resource settings for the Connect injected init container. If null, the resources
# won't be set for the initContainer. The defaults are optimized for developer instances of
# Kubernetes, however they should be tweaked with the recommended defaults as shown below to speed up service registration times.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,14 @@ const (
// Enable this only if the application does not support health checks.
AnnotationUseProxyHealthCheck = "consul.hashicorp.com/use-proxy-health-check"

// AnnotationSidecarProxyStartupFailureSeconds configures how long the k8s startup probe will wait for
// success before the proxy is considered to be unhealthy and the container is restarted.
AnnotationSidecarProxyStartupFailureSeconds = "consul.hashicorp.com/sidecar-proxy-startup-failure-seconds"

// AnnotationSidecarProxyLivenessFailureSeconds configures how long the k8s liveness probe will wait
// before the proxy is considered to be unhealthy and the container is restarted.
AnnotationSidecarProxyLivenessFailureSeconds = "consul.hashicorp.com/sidecar-proxy-liveness-failure-seconds"

// annotations for sidecar proxy resource limits.
AnnotationSidecarProxyCPULimit = "consul.hashicorp.com/sidecar-proxy-cpu-limit"
AnnotationSidecarProxyCPURequest = "consul.hashicorp.com/sidecar-proxy-cpu-request"
Expand Down
57 changes: 53 additions & 4 deletions control-plane/connect-inject/webhook/consul_dataplane_sidecar.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,11 +51,11 @@ func (w *MeshWebhook) consulDataplaneSidecar(namespace corev1.Namespace, pod cor
containerName = fmt.Sprintf("%s-%s", sidecarContainer, mpi.serviceName)
}

var probe *corev1.Probe
var readinessProbe *corev1.Probe
if useProxyHealthCheck(pod) {
// If using the proxy health check for a service, configure an HTTP handler
// that queries the '/ready' endpoint of the proxy.
probe = &corev1.Probe{
readinessProbe = &corev1.Probe{
ProbeHandler: corev1.ProbeHandler{
HTTPGet: &corev1.HTTPGetAction{
Port: intstr.FromInt(constants.ProxyDefaultHealthPort + mpi.serviceIndex),
Expand All @@ -65,7 +65,7 @@ func (w *MeshWebhook) consulDataplaneSidecar(namespace corev1.Namespace, pod cor
InitialDelaySeconds: 1,
}
} else {
probe = &corev1.Probe{
readinessProbe = &corev1.Probe{
ProbeHandler: corev1.ProbeHandler{
TCPSocket: &corev1.TCPSocketAction{
Port: intstr.FromInt(constants.ProxyDefaultInboundPort + mpi.serviceIndex),
Expand All @@ -75,6 +75,27 @@ func (w *MeshWebhook) consulDataplaneSidecar(namespace corev1.Namespace, pod cor
}
}

// Configure optional probes on the proxy to force restart it in failure scenarios.
var startupProbe, livenessProbe *corev1.Probe
startupSeconds := w.getStartupFailureSeconds(pod)
livenessSeconds := w.getLivenessFailureSeconds(pod)
if startupSeconds > 0 {
startupProbe = &corev1.Probe{
// Use the same handler as the readiness probe.
ProbeHandler: readinessProbe.ProbeHandler,
PeriodSeconds: 1,
FailureThreshold: startupSeconds,
}
}
if livenessSeconds > 0 {
livenessProbe = &corev1.Probe{
// Use the same handler as the readiness probe.
ProbeHandler: readinessProbe.ProbeHandler,
PeriodSeconds: 1,
FailureThreshold: livenessSeconds,
}
}

container := corev1.Container{
Name: containerName,
Image: w.ImageConsulDataplane,
Expand Down Expand Up @@ -140,7 +161,9 @@ func (w *MeshWebhook) consulDataplaneSidecar(namespace corev1.Namespace, pod cor
},
},
Args: args,
ReadinessProbe: probe,
ReadinessProbe: readinessProbe,
StartupProbe: startupProbe,
LivenessProbe: livenessProbe,
}

if w.AuthMethod != "" {
Expand Down Expand Up @@ -520,6 +543,32 @@ func useProxyHealthCheck(pod corev1.Pod) bool {
return false
}

// getStartupFailureSeconds returns the failure threshold, in seconds, for the sidecar proxy's k8s startup probe.
// The value is taken from the 'consul.hashicorp.com/sidecar-proxy-startup-failure-seconds' pod annotation when it
// holds a valid integer, and from w.DefaultSidecarProxyStartupFailureSeconds otherwise.
// A return value of zero means the startup probe should not be configured.
func (w *MeshWebhook) getStartupFailureSeconds(pod corev1.Pod) int32 {
	return probeFailureSeconds(
		pod.Annotations,
		constants.AnnotationSidecarProxyStartupFailureSeconds,
		w.DefaultSidecarProxyStartupFailureSeconds,
	)
}

// getLivenessFailureSeconds returns the failure threshold, in seconds, for the sidecar proxy's k8s liveness probe.
// The value is taken from the 'consul.hashicorp.com/sidecar-proxy-liveness-failure-seconds' pod annotation when it
// holds a valid integer, and from w.DefaultSidecarProxyLivenessFailureSeconds otherwise.
// A return value of zero means the liveness probe should not be configured.
func (w *MeshWebhook) getLivenessFailureSeconds(pod corev1.Pod) int32 {
	return probeFailureSeconds(
		pod.Annotations,
		constants.AnnotationSidecarProxyLivenessFailureSeconds,
		w.DefaultSidecarProxyLivenessFailureSeconds,
	)
}

// probeFailureSeconds resolves a probe failure window in seconds. It prefers the annotation stored under key and
// falls back to the supplied default when that annotation is absent, is not a base-10 integer, or does not fit in
// an int32. Previously a malformed annotation silently disabled the probe and an oversized one wrapped around
// during the int32 conversion. Non-positive results are normalized to zero, and an oversized default is clamped.
func probeFailureSeconds(annotations map[string]string, key string, fallback int) int32 {
	const maxInt32 = 1<<31 - 1

	seconds := int64(fallback)
	if v, ok := annotations[key]; ok {
		// bitSize 32 makes ParseInt reject anything that would not survive the int32 conversion below,
		// so an out-of-range annotation is treated like any other invalid value.
		if parsed, err := strconv.ParseInt(v, 10, 32); err == nil {
			seconds = parsed
		}
	}

	switch {
	case seconds <= 0:
		return 0
	case seconds > maxInt32:
		// Only reachable through the webhook default, which is a plain int.
		return maxInt32
	default:
		return int32(seconds)
	}
}

// getMetricsPorts creates container ports for exposing services such as prometheus.
// Prometheus in particular needs a named port for use with the operator.
// https://github.com/hashicorp/consul-k8s/pull/1440
Expand Down
163 changes: 129 additions & 34 deletions control-plane/connect-inject/webhook/consul_dataplane_sidecar_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -419,48 +419,143 @@ func TestHandlerConsulDataplaneSidecar_DNSProxy(t *testing.T) {
}

func TestHandlerConsulDataplaneSidecar_ProxyHealthCheck(t *testing.T) {
h := MeshWebhook{
ConsulConfig: &consul.Config{HTTPPort: 8500, GRPCPort: 8502},
ConsulAddress: "1.1.1.1",
LogLevel: "info",
}
pod := corev1.Pod{
ObjectMeta: metav1.ObjectMeta{
Annotations: map[string]string{
constants.AnnotationUseProxyHealthCheck: "true",
tests := map[string]struct {
changeHook func(*MeshWebhook)
changePod func(*corev1.Pod)
expectedReadiness *corev1.Probe
expectedStartup *corev1.Probe
expectedLiveness *corev1.Probe
}{
"readiness-only": {
changeHook: func(h *MeshWebhook) {},
changePod: func(p *corev1.Pod) {},
expectedReadiness: &corev1.Probe{
ProbeHandler: corev1.ProbeHandler{
HTTPGet: &corev1.HTTPGetAction{
Port: intstr.FromInt(21000),
Path: "/ready",
},
},
InitialDelaySeconds: 1,
},
},
Spec: corev1.PodSpec{
Containers: []corev1.Container{
{
Name: "web",
"default-values": {
changeHook: func(h *MeshWebhook) {
h.DefaultSidecarProxyStartupFailureSeconds = 11
h.DefaultSidecarProxyLivenessFailureSeconds = 22
},
changePod: func(p *corev1.Pod) {},
expectedReadiness: &corev1.Probe{
ProbeHandler: corev1.ProbeHandler{
HTTPGet: &corev1.HTTPGetAction{
Port: intstr.FromInt(21000),
Path: "/ready",
},
},
InitialDelaySeconds: 1,
},
expectedStartup: &corev1.Probe{
ProbeHandler: corev1.ProbeHandler{
HTTPGet: &corev1.HTTPGetAction{
Port: intstr.FromInt(21000),
Path: "/ready",
},
},
PeriodSeconds: 1,
FailureThreshold: 11,
},
expectedLiveness: &corev1.Probe{
ProbeHandler: corev1.ProbeHandler{
HTTPGet: &corev1.HTTPGetAction{
Port: intstr.FromInt(21000),
Path: "/ready",
},
},
PeriodSeconds: 1,
FailureThreshold: 22,
},
},
}
container, err := h.consulDataplaneSidecar(testNS, pod, multiPortInfo{})
expectedProbe := &corev1.Probe{
ProbeHandler: corev1.ProbeHandler{
HTTPGet: &corev1.HTTPGetAction{
Port: intstr.FromInt(21000),
Path: "/ready",
"override-default": {
changeHook: func(h *MeshWebhook) {
h.DefaultSidecarProxyStartupFailureSeconds = 11
h.DefaultSidecarProxyLivenessFailureSeconds = 22
},
changePod: func(p *corev1.Pod) {
p.ObjectMeta.Annotations[constants.AnnotationSidecarProxyStartupFailureSeconds] = "111"
p.ObjectMeta.Annotations[constants.AnnotationSidecarProxyLivenessFailureSeconds] = "222"
},
expectedReadiness: &corev1.Probe{
ProbeHandler: corev1.ProbeHandler{
HTTPGet: &corev1.HTTPGetAction{
Port: intstr.FromInt(21000),
Path: "/ready",
},
},
InitialDelaySeconds: 1,
},
expectedStartup: &corev1.Probe{
ProbeHandler: corev1.ProbeHandler{
HTTPGet: &corev1.HTTPGetAction{
Port: intstr.FromInt(21000),
Path: "/ready",
},
},
PeriodSeconds: 1,
FailureThreshold: 111,
},
expectedLiveness: &corev1.Probe{
ProbeHandler: corev1.ProbeHandler{
HTTPGet: &corev1.HTTPGetAction{
Port: intstr.FromInt(21000),
Path: "/ready",
},
},
PeriodSeconds: 1,
FailureThreshold: 222,
},
},
InitialDelaySeconds: 1,
}
require.NoError(t, err)
require.Contains(t, container.Args, "-envoy-ready-bind-port=21000")
require.Equal(t, expectedProbe, container.ReadinessProbe)
require.Contains(t, container.Env, corev1.EnvVar{
Name: "DP_ENVOY_READY_BIND_ADDRESS",
ValueFrom: &corev1.EnvVarSource{
FieldRef: &corev1.ObjectFieldSelector{FieldPath: "status.podIP"},
},
})
require.Contains(t, container.Ports, corev1.ContainerPort{
Name: "proxy-health-0",
ContainerPort: 21000,
})
for tn, tc := range tests {
t.Run(tn, func(t *testing.T) {
hook := MeshWebhook{
ConsulConfig: &consul.Config{HTTPPort: 8500, GRPCPort: 8502},
ConsulAddress: "1.1.1.1",
LogLevel: "info",
}
pod := corev1.Pod{
ObjectMeta: metav1.ObjectMeta{
Annotations: map[string]string{
constants.AnnotationUseProxyHealthCheck: "true",
},
},
Spec: corev1.PodSpec{
Containers: []corev1.Container{
{
Name: "web",
},
},
},
}
tc.changeHook(&hook)
tc.changePod(&pod)
container, err := hook.consulDataplaneSidecar(testNS, pod, multiPortInfo{})
require.NoError(t, err)
require.Contains(t, container.Args, "-envoy-ready-bind-port=21000")
require.Equal(t, tc.expectedReadiness, container.ReadinessProbe)
require.Equal(t, tc.expectedStartup, container.StartupProbe)
require.Equal(t, tc.expectedLiveness, container.LivenessProbe)
require.Contains(t, container.Env, corev1.EnvVar{
Name: "DP_ENVOY_READY_BIND_ADDRESS",
ValueFrom: &corev1.EnvVarSource{
FieldRef: &corev1.ObjectFieldSelector{FieldPath: "status.podIP"},
},
})
require.Contains(t, container.Ports, corev1.ContainerPort{
Name: "proxy-health-0",
ContainerPort: 21000,
})
})
}
}

func TestHandlerConsulDataplaneSidecar_ProxyHealthCheck_Multiport(t *testing.T) {
Expand Down
3 changes: 3 additions & 0 deletions control-plane/connect-inject/webhook/mesh_webhook.go
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,9 @@ type MeshWebhook struct {
DefaultProxyMemoryRequest resource.Quantity
DefaultProxyMemoryLimit resource.Quantity

DefaultSidecarProxyStartupFailureSeconds int
DefaultSidecarProxyLivenessFailureSeconds int

// LifecycleConfig contains proxy lifecycle management configuration from the inject-connect command and has methods to determine whether
// configuration should come from the default flags or annotations. The meshWebhook uses this to configure container sidecar proxy args.
LifecycleConfig lifecycle.Config
Expand Down
6 changes: 6 additions & 0 deletions control-plane/subcommand/inject-connect/command.go
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,9 @@ type Command struct {
flagDefaultSidecarProxyLifecycleGracefulPort string
flagDefaultSidecarProxyLifecycleGracefulShutdownPath string

flagDefaultSidecarProxyStartupFailureSeconds int
flagDefaultSidecarProxyLivenessFailureSeconds int

// Metrics settings.
flagDefaultEnableMetrics bool
flagEnableGatewayMetrics bool
Expand Down Expand Up @@ -249,6 +252,9 @@ func (c *Command) init() {
c.flagSet.StringVar(&c.flagDefaultSidecarProxyLifecycleGracefulPort, "default-sidecar-proxy-lifecycle-graceful-port", strconv.Itoa(constants.DefaultGracefulPort), "Default port for sidecar proxy lifecycle management HTTP endpoints.")
c.flagSet.StringVar(&c.flagDefaultSidecarProxyLifecycleGracefulShutdownPath, "default-sidecar-proxy-lifecycle-graceful-shutdown-path", "/graceful_shutdown", "Default sidecar proxy lifecycle management graceful shutdown path.")

c.flagSet.IntVar(&c.flagDefaultSidecarProxyStartupFailureSeconds, "default-sidecar-proxy-startup-failure-seconds", 0, "Default number of seconds for the k8s startup probe to fail before the proxy container is restarted. Zero disables the probe.")
c.flagSet.IntVar(&c.flagDefaultSidecarProxyLivenessFailureSeconds, "default-sidecar-proxy-liveness-failure-seconds", 0, "Default number of seconds for the k8s liveness probe to fail before the proxy container is restarted. Zero disables the probe.")

// Metrics setting flags.
c.flagSet.BoolVar(&c.flagDefaultEnableMetrics, "default-enable-metrics", false, "Default for enabling connect service metrics.")
c.flagSet.BoolVar(&c.flagEnableGatewayMetrics, "enable-gateway-metrics", false, "Allows enabling Consul gateway metrics.")
Expand Down
Loading
Loading