From 9897003f19bb4a41fd57b2e73e036b0e81775fad Mon Sep 17 00:00:00 2001 From: Zach Loafman <zml@google.com> Date: Tue, 4 Apr 2023 19:57:27 +0000 Subject: [PATCH] Revert "Rework game server health initial delay handling (#3046)" Seeing a high incidence of SDK sidecar failures on Autopilot e2es, looking at the logs. Reverting for now as it seems related to this PR. This reverts commit 60ed8cdac3def2b41f77adc2962c7ac65279bff9. --- cloudbuild.yaml | 4 +++ pkg/gameservers/controller.go | 9 +------ pkg/gameservers/controller_test.go | 4 +-- pkg/sdkserver/sdkserver.go | 38 ++++++++++++++------------- pkg/sdkserver/sdkserver_test.go | 32 ++++++++++------------- test/e2e/gameserver_test.go | 41 ------------------------------ 6 files changed, 41 insertions(+), 87 deletions(-) diff --git a/cloudbuild.yaml b/cloudbuild.yaml index 548fa2d541..f06d8c5b56 100644 --- a/cloudbuild.yaml +++ b/cloudbuild.yaml @@ -307,6 +307,10 @@ steps: for version in "${!versionsAndRegions[@]}" do region=${versionsAndRegions[$version]} + if [ $cloudProduct = gke-autopilot ] && [ $version = 1.26 ] + then + continue + fi if [ $cloudProduct = generic ] then featureWithGate="CustomFasSyncInterval=false&SafeToEvict=false&SDKGracefulTermination=false&StateAllocationFilter=false&PlayerAllocationFilter=true&PlayerTracking=true&ResetMetricsOnDelete=true&PodHostname=true&SplitControllerAndExtensions=true&FleetAllocationOverflow=true&Example=true" diff --git a/pkg/gameservers/controller.go b/pkg/gameservers/controller.go index fde5019556..73184c5d7c 100644 --- a/pkg/gameservers/controller.go +++ b/pkg/gameservers/controller.go @@ -705,16 +705,9 @@ func (c *Controller) addGameServerHealthCheck(gs *agonesv1.GameServer, pod *core Port: intstr.FromInt(8080), }, }, - // The sidecar relies on kubelet to delay by InitialDelaySeconds after the - // container is started (after image pull, etc), and relies on the kubelet - // for PeriodSeconds heartbeats to evaluate health. InitialDelaySeconds: gs.Spec.Health.InitialDelaySeconds, PeriodSeconds: gs.Spec.Health.PeriodSeconds, - - // By the time /gshealthz returns unhealthy, the sidecar has already evaluated - // FailureThreshold in a row failed health checks, so on the kubelet side, one - // failure is sufficient to know the game server is unhealthy. - FailureThreshold: 1, + FailureThreshold: gs.Spec.Health.FailureThreshold, } } diff --git a/pkg/gameservers/controller_test.go b/pkg/gameservers/controller_test.go index 9b3ffe8d32..e4c8dde313 100644 --- a/pkg/gameservers/controller_test.go +++ b/pkg/gameservers/controller_test.go @@ -1061,7 +1061,7 @@ func TestControllerCreateGameServerPod(t *testing.T) { assert.Equal(t, intstr.FromInt(8080), gsContainer.LivenessProbe.HTTPGet.Port) assert.Equal(t, fixture.Spec.Health.InitialDelaySeconds, gsContainer.LivenessProbe.InitialDelaySeconds) assert.Equal(t, fixture.Spec.Health.PeriodSeconds, gsContainer.LivenessProbe.PeriodSeconds) - assert.Equal(t, int32(1), gsContainer.LivenessProbe.FailureThreshold) + assert.Equal(t, fixture.Spec.Health.FailureThreshold, gsContainer.LivenessProbe.FailureThreshold) assert.Len(t, gsContainer.VolumeMounts, 1) assert.Equal(t, "/var/run/secrets/kubernetes.io/serviceaccount", gsContainer.VolumeMounts[0].MountPath) @@ -1730,7 +1730,7 @@ func TestControllerAddGameServerHealthCheck(t *testing.T) { require.NotNil(t, probe) assert.Equal(t, "/gshealthz", probe.HTTPGet.Path) assert.Equal(t, intstr.IntOrString{IntVal: 8080}, probe.HTTPGet.Port) - assert.Equal(t, int32(1), probe.FailureThreshold) + assert.Equal(t, fixture.Spec.Health.FailureThreshold, probe.FailureThreshold) assert.Equal(t, fixture.Spec.Health.InitialDelaySeconds, probe.InitialDelaySeconds) assert.Equal(t, fixture.Spec.Health.PeriodSeconds, probe.PeriodSeconds) } diff --git a/pkg/sdkserver/sdkserver.go b/pkg/sdkserver/sdkserver.go index b751b6033b..80f66f3b04 100644 --- a/pkg/sdkserver/sdkserver.go +++ b/pkg/sdkserver/sdkserver.go @@ -42,6 +42,7 @@ import ( apiequality "k8s.io/apimachinery/pkg/api/equality" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/fields" + "k8s.io/apimachinery/pkg/util/wait" "k8s.io/client-go/kubernetes" "k8s.io/client-go/kubernetes/scheme" k8sv1 "k8s.io/client-go/kubernetes/typed/core/v1" @@ -163,7 +164,7 @@ func NewSDKServer(gameServerName, namespace string, kubeClient kubernetes.Interf } }) mux.HandleFunc("/gshealthz", func(w http.ResponseWriter, r *http.Request) { - if s.runHealth() { + if s.healthy() { _, err := w.Write([]byte("ok")) if err != nil { s.logger.WithError(err).Error("could not send ok response on gshealthz") @@ -187,6 +188,12 @@ func NewSDKServer(gameServerName, namespace string, kubeClient kubernetes.Interf return s, nil } +// initHealthLastUpdated adds the initial delay to now, then it will always be after `now` +// until the delay passes +func (s *SDKServer) initHealthLastUpdated(healthInitialDelay time.Duration) { + s.healthLastUpdated = s.clock.Now().UTC().Add(healthInitialDelay) +} + // Run processes the rate limited queue. // Will block until stop is closed func (s *SDKServer) Run(ctx context.Context) error { @@ -222,7 +229,7 @@ func (s *SDKServer) Run(ctx context.Context) error { s.health = gs.Spec.Health s.logger.WithField("health", s.health).Debug("Setting health configuration") s.healthTimeout = time.Duration(gs.Spec.Health.PeriodSeconds) * time.Second - s.touchHealthLastUpdated() + s.initHealthLastUpdated(time.Duration(gs.Spec.Health.InitialDelaySeconds) * time.Second) if gs.Status.State == agonesv1.GameServerStateReserved && gs.Status.ReservedUntil != nil { s.gsUpdateMutex.Lock() @@ -233,6 +240,7 @@ func (s *SDKServer) Run(ctx context.Context) error { // start health checking running if !s.health.Disabled { s.logger.Debug("Starting GameServer health checking") + go wait.Until(s.runHealth, s.healthTimeout, ctx.Done()) } // populate player tracking values @@ -423,10 +431,7 @@ func (s *SDKServer) updateAnnotations(ctx context.Context) error { // workerqueue func (s *SDKServer) enqueueState(state agonesv1.GameServerState) { s.gsUpdateMutex.Lock() - // Update cached state, but prevent transitions out of `Unhealthy` by the SDK. - if s.gsState != agonesv1.GameServerStateUnhealthy { - s.gsState = state - } + s.gsState = state s.gsUpdateMutex.Unlock() s.workerqueue.Enqueue(cache.ExplicitKey(string(updateState))) } @@ -809,17 +814,15 @@ func (s *SDKServer) sendGameServerUpdate(gs *agonesv1.GameServer) { } } -// runHealth checks the health as part of the /gshealthz hook, and if not -// healthy will push the Unhealthy state into the queue so it can be updated. -// Returns current health. -func (s *SDKServer) runHealth() bool { +// runHealth actively checks the health, and if not +// healthy will push the Unhealthy state into the queue so +// it can be updated +func (s *SDKServer) runHealth() { s.checkHealth() - if s.healthy() { - return true + if !s.healthy() { + s.logger.WithField("gameServerName", s.gameServerName).Warn("GameServer has failed health check") + s.enqueueState(agonesv1.GameServerStateUnhealthy) } - s.logger.WithField("gameServerName", s.gameServerName).Warn("GameServer has failed health check") - s.enqueueState(agonesv1.GameServerStateUnhealthy) - return false } // touchHealthLastUpdated sets the healthLastUpdated @@ -835,11 +838,10 @@ func (s *SDKServer) touchHealthLastUpdated() { // and if it is outside the timeout value, logger and // count a failure func (s *SDKServer) checkHealth() { - s.healthMutex.Lock() - defer s.healthMutex.Unlock() - timeout := s.healthLastUpdated.Add(s.healthTimeout) if timeout.Before(s.clock.Now().UTC()) { + s.healthMutex.Lock() + defer s.healthMutex.Unlock() s.healthFailureCount++ s.logger.WithField("failureCount", s.healthFailureCount).Warn("GameServer Health Check failed") } diff --git a/pkg/sdkserver/sdkserver_test.go b/pkg/sdkserver/sdkserver_test.go index d3b04cc602..907209271b 100644 --- a/pkg/sdkserver/sdkserver_test.go +++ b/pkg/sdkserver/sdkserver_test.go @@ -80,10 +80,8 @@ func TestSidecarRun(t *testing.T) { }, "unhealthy": { f: func(sc *SDKServer, ctx context.Context) { - time.Sleep(1 * time.Second) - sc.runHealth() // normally invoked from /gshealthz handler - time.Sleep(2 * time.Second) // exceed 1s timeout - sc.runHealth() // normally invoked from /gshealthz handler + // we have a 1 second timeout + time.Sleep(2 * time.Second) }, expected: expected{ state: agonesv1.GameServerStateUnhealthy, @@ -477,12 +475,6 @@ func TestSidecarUnhealthyMessage(t *testing.T) { // manually push through an unhealthy state change sc.enqueueState(agonesv1.GameServerStateUnhealthy) agtesting.AssertEventContains(t, m.FakeRecorder.Events, "Health check failure") - - // try to push back to Ready, enqueueState should block it. - sc.enqueueState(agonesv1.GameServerStateRequestReady) - sc.gsUpdateMutex.Lock() - assert.Equal(t, agonesv1.GameServerStateUnhealthy, sc.gsState) - sc.gsUpdateMutex.Unlock() } func TestSidecarHealthy(t *testing.T) { @@ -495,7 +487,7 @@ func TestSidecarHealthy(t *testing.T) { // manually set the values sc.health = agonesv1.Health{FailureThreshold: 1} sc.healthTimeout = 5 * time.Second - sc.touchHealthLastUpdated() + sc.initHealthLastUpdated(0 * time.Second) now := time.Now().UTC() fc := testclocks.NewFakeClock(now) @@ -540,12 +532,15 @@ func TestSidecarHealthy(t *testing.T) { t.Run("initial delay", func(t *testing.T) { sc.health.Disabled = false fc.SetTime(time.Now().UTC()) - sc.touchHealthLastUpdated() + sc.initHealthLastUpdated(0) + sc.healthFailureCount = 0 + sc.checkHealth() + assert.True(t, sc.healthy()) - // initial delay is handled by kubelet, runHealth() isn't - // called until container starts. - fc.Step(10 * time.Second) - sc.touchHealthLastUpdated() + sc.initHealthLastUpdated(10 * time.Second) + sc.checkHealth() + assert.True(t, sc.healthy()) + fc.Step(9 * time.Second) sc.checkHealth() assert.True(t, sc.healthy()) @@ -558,7 +553,8 @@ func TestSidecarHealthy(t *testing.T) { sc.health.Disabled = false sc.health.FailureThreshold = 3 fc.SetTime(time.Now().UTC()) - sc.touchHealthLastUpdated() + sc.initHealthLastUpdated(0) + sc.healthFailureCount = 0 sc.checkHealth() assert.True(t, sc.healthy()) @@ -626,8 +622,8 @@ func TestSidecarHTTPHealthCheck(t *testing.T) { fc.Step(step) time.Sleep(step) - testHTTPHealth(t, "http://localhost:8080/gshealthz", "", http.StatusInternalServerError) // force runHealth to run assert.False(t, sc.healthy()) + testHTTPHealth(t, "http://localhost:8080/gshealthz", "", http.StatusInternalServerError) cancel() wg.Wait() // wait for go routine test results. } diff --git a/test/e2e/gameserver_test.go b/test/e2e/gameserver_test.go index f9ea9d92eb..56a9644240 100644 --- a/test/e2e/gameserver_test.go +++ b/test/e2e/gameserver_test.go @@ -1253,44 +1253,3 @@ func TestGracefulShutdown(t *testing.T) { log.WithField("diff", diff).Info("Time difference") require.Less(t, diff, 40) } - -func TestGameServerSlowStart(t *testing.T) { - t.Parallel() - - // Inject an additional game server sidecar that forces a delayed start - // to the main game server container following the pattern at - // https://medium.com/@marko.luksa/delaying-application-start-until-sidecar-is-ready-2ec2d21a7b74 - gs := framework.DefaultGameServer(framework.Namespace) - gs.Spec.Template.Spec.Containers = append( - []corev1.Container{{ - Name: "delay-game-server-start", - Image: "alpine:latest", - ImagePullPolicy: corev1.PullIfNotPresent, - Command: []string{"sleep", "3600"}, - Lifecycle: &corev1.Lifecycle{ - PostStart: &corev1.LifecycleHandler{ - Exec: &corev1.ExecAction{ - Command: []string{"sleep", "60"}, - }, - }, - }, - Resources: corev1.ResourceRequirements{ - Requests: corev1.ResourceList{ - corev1.ResourceCPU: resource.MustParse("30m"), - corev1.ResourceMemory: resource.MustParse("64Mi"), - }, - Limits: corev1.ResourceList{ - corev1.ResourceCPU: resource.MustParse("30m"), - corev1.ResourceMemory: resource.MustParse("64Mi"), - }, - }, - }}, - gs.Spec.Template.Spec.Containers...) - - // Validate that a game server whose primary container starts slowly (a full minute - // after the SDK starts) is capable of reaching Ready. Here we force the condition - // with a lifecycle hook, but it imitates a slow image pull, or other container - // start delays. - _, err := framework.CreateGameServerAndWaitUntilReady(t, framework.Namespace, gs) - assert.NoError(t, err) -}