From 33268b187608fc81c91b669134da36bbe3d34224 Mon Sep 17 00:00:00 2001 From: Matt Moore Date: Mon, 1 Jun 2020 10:16:06 -0700 Subject: [PATCH] Bump the default periodSeconds to 10s. Fixes: https://github.com/knative/serving/issues/8147 --- cmd/queue/main.go | 1 + .../revision/resources/deploy_test.go | 2 +- pkg/reconciler/revision/resources/queue.go | 18 +++++++++++++----- .../revision/resources/queue_test.go | 2 +- 4 files changed, 16 insertions(+), 7 deletions(-) diff --git a/cmd/queue/main.go b/cmd/queue/main.go index 50cd04083452..52abdb56a06a 100644 --- a/cmd/queue/main.go +++ b/cmd/queue/main.go @@ -277,6 +277,7 @@ func probeQueueHealthPath(port, timeoutSeconds int, env config) error { } defer res.Body.Close() success := health.IsHTTPProbeReady(res) + // The check for preferForScaledown() fails readiness faster // in the presence of the label if preferScaleDown, err := preferPodForScaledown(env.DownwardAPILabelsPath); err != nil { diff --git a/pkg/reconciler/revision/resources/deploy_test.go b/pkg/reconciler/revision/resources/deploy_test.go index 933ed2cab81a..401442bfc8a8 100644 --- a/pkg/reconciler/revision/resources/deploy_test.go +++ b/pkg/reconciler/revision/resources/deploy_test.go @@ -87,7 +87,7 @@ var ( Command: []string{"/ko-app/queue", "-probe-period", "0"}, }, }, - PeriodSeconds: 1, + PeriodSeconds: 10, TimeoutSeconds: 10, }, SecurityContext: queueSecurityContext, diff --git a/pkg/reconciler/revision/resources/queue.go b/pkg/reconciler/revision/resources/queue.go index 40ed65db976f..18094b400fbb 100644 --- a/pkg/reconciler/revision/resources/queue.go +++ b/pkg/reconciler/revision/resources/queue.go @@ -148,11 +148,19 @@ func makeQueueProbe(in *corev1.Probe) *corev1.Probe { Command: []string{"/ko-app/queue", "-probe-period", "0"}, }, }, - // We want to mark the service as not ready as soon as the - // PreStop handler is called, so we need to check a little - // bit more often than the default. It is a small - // sacrifice for a low rate of 503s. - PeriodSeconds: 1, + // The exec probe enables us to retry failed probes quickly to get sub-second + // resolution and achieve faster cold-starts. However, for draining pods as + // part of the K8s lifecycle this period will bound the tail of how quickly + // we can remove a Pod's endpoint from the K8s service. + // + // The trade-off here is that exec probes cost CPU to run, and for idle pods + // (e.g. due to minScale) we see ~50m/{period} of idle CPU usage in the + // queue-proxy. So while setting this to 1s results in slightly faster drains + // it also means that in the steady state the queue-proxy is consuming 10x + // more CPU due to probes than with a period of 10s. + // + // See also: https://github.com/knative/serving/issues/8147 + PeriodSeconds: 10, // We keep the connection open for a while because we're // actively probing the user-container on that endpoint and // thus don't want to be limited by K8s granularity here. diff --git a/pkg/reconciler/revision/resources/queue_test.go b/pkg/reconciler/revision/resources/queue_test.go index dfa51a277d7d..e83c257dbad3 100644 --- a/pkg/reconciler/revision/resources/queue_test.go +++ b/pkg/reconciler/revision/resources/queue_test.go @@ -621,7 +621,7 @@ func TestTCPProbeGeneration(t *testing.T) { Command: []string{"/ko-app/queue", "-probe-period", "0"}, }, }, - PeriodSeconds: 1, + PeriodSeconds: 10, TimeoutSeconds: 10, } c.Env = env(map[string]string{"USER_PORT": strconv.Itoa(userPort)})