From 0137747921197983f018b607647c856c83b8fec6 Mon Sep 17 00:00:00 2001 From: Zach Loafman Date: Wed, 17 May 2023 14:04:34 -0700 Subject: [PATCH] Move back to FailureThreshold failures of /gshealthz (#3160) This again reverts part of #3072. In looking at recent failures, in cases where the sidecar is slow to come up due to networking issues, /gshealthz fails, resulting often in the unnecessary restart of the game server. This returns to the more generous failure threshold from prior to to just make this ~infinite, i.e. neuter kubelet liveness, since health is really owned by the controller and sidecar. --- pkg/gameservers/controller.go | 14 +++++++++----- pkg/gameservers/controller_test.go | 4 ++-- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/pkg/gameservers/controller.go b/pkg/gameservers/controller.go index e6faaf191c..0ae33ffd8e 100644 --- a/pkg/gameservers/controller.go +++ b/pkg/gameservers/controller.go @@ -706,15 +706,19 @@ func (c *Controller) addGameServerHealthCheck(gs *agonesv1.GameServer, pod *core }, }, // The sidecar relies on kubelet to delay by InitialDelaySeconds after the - // container is started (after image pull, etc), and relies on the kubelet - // for PeriodSeconds heartbeats to evaluate health. + // container is started (after image pull, etc). InitialDelaySeconds: gs.Spec.Health.InitialDelaySeconds, PeriodSeconds: gs.Spec.Health.PeriodSeconds, // By the time /gshealthz returns unhealthy, the sidecar has already evaluated - // FailureThreshold in a row failed health checks, so on the kubelet side, one - // failure is sufficient to know the game server is unhealthy. - FailureThreshold: 1, + // {FailureThreshold in a row} failed health checks, so in theory on the kubelet + // side, one failure is sufficient to know the game server is unhealthy. However, + // with only one failure, if the sidecar doesn't come up at all, we unnecessarily + // restart the game server. So use FailureThreshold as startup wiggle-room as well. + // + // Note that in general, FailureThreshold could also be infinite - the controller + // and sidecar are responsible for health management. + FailureThreshold: gs.Spec.Health.FailureThreshold, } } diff --git a/pkg/gameservers/controller_test.go b/pkg/gameservers/controller_test.go index f4be9922e8..385542bebd 100644 --- a/pkg/gameservers/controller_test.go +++ b/pkg/gameservers/controller_test.go @@ -1056,7 +1056,7 @@ func TestControllerCreateGameServerPod(t *testing.T) { assert.Equal(t, intstr.FromInt(8080), gsContainer.LivenessProbe.HTTPGet.Port) assert.Equal(t, fixture.Spec.Health.InitialDelaySeconds, gsContainer.LivenessProbe.InitialDelaySeconds) assert.Equal(t, fixture.Spec.Health.PeriodSeconds, gsContainer.LivenessProbe.PeriodSeconds) - assert.Equal(t, int32(1), gsContainer.LivenessProbe.FailureThreshold) + assert.Equal(t, fixture.Spec.Health.FailureThreshold, gsContainer.LivenessProbe.FailureThreshold) assert.Len(t, gsContainer.VolumeMounts, 1) assert.Equal(t, "/var/run/secrets/kubernetes.io/serviceaccount", gsContainer.VolumeMounts[0].MountPath) @@ -1758,7 +1758,7 @@ func TestControllerAddGameServerHealthCheck(t *testing.T) { require.NotNil(t, probe) assert.Equal(t, "/gshealthz", probe.HTTPGet.Path) assert.Equal(t, intstr.IntOrString{IntVal: 8080}, probe.HTTPGet.Port) - assert.Equal(t, int32(1), probe.FailureThreshold) + assert.Equal(t, fixture.Spec.Health.FailureThreshold, probe.FailureThreshold) assert.Equal(t, fixture.Spec.Health.InitialDelaySeconds, probe.InitialDelaySeconds) assert.Equal(t, fixture.Spec.Health.PeriodSeconds, probe.PeriodSeconds) }