From 9897003f19bb4a41fd57b2e73e036b0e81775fad Mon Sep 17 00:00:00 2001
From: Zach Loafman <zml@google.com>
Date: Tue, 4 Apr 2023 19:57:27 +0000
Subject: [PATCH] Revert "Rework game server health initial delay handling
 (#3046)"

Seeing a high incidence of SDK sidecar failures on Autopilot e2es,
looking at the logs. Reverting for now as it seems related to this PR.

This reverts commit 60ed8cdac3def2b41f77adc2962c7ac65279bff9.
---
 cloudbuild.yaml                    |  4 +++
 pkg/gameservers/controller.go      |  9 +------
 pkg/gameservers/controller_test.go |  4 +--
 pkg/sdkserver/sdkserver.go         | 38 ++++++++++++++-------------
 pkg/sdkserver/sdkserver_test.go    | 32 ++++++++++-------------
 test/e2e/gameserver_test.go        | 41 ------------------------------
 6 files changed, 41 insertions(+), 87 deletions(-)

diff --git a/cloudbuild.yaml b/cloudbuild.yaml
index 548fa2d541..f06d8c5b56 100644
--- a/cloudbuild.yaml
+++ b/cloudbuild.yaml
@@ -307,6 +307,10 @@ steps:
         for version in "${!versionsAndRegions[@]}"
         do
           region=${versionsAndRegions[$version]}
+          if [ $cloudProduct = gke-autopilot ] && [ $version = 1.26 ]
+          then
+            continue
+          fi
           if [ $cloudProduct = generic ]
           then
             featureWithGate="CustomFasSyncInterval=false&SafeToEvict=false&SDKGracefulTermination=false&StateAllocationFilter=false&PlayerAllocationFilter=true&PlayerTracking=true&ResetMetricsOnDelete=true&PodHostname=true&SplitControllerAndExtensions=true&FleetAllocationOverflow=true&Example=true"
diff --git a/pkg/gameservers/controller.go b/pkg/gameservers/controller.go
index fde5019556..73184c5d7c 100644
--- a/pkg/gameservers/controller.go
+++ b/pkg/gameservers/controller.go
@@ -705,16 +705,9 @@ func (c *Controller) addGameServerHealthCheck(gs *agonesv1.GameServer, pod *core
 						Port: intstr.FromInt(8080),
 					},
 				},
-				// The sidecar relies on kubelet to delay by InitialDelaySeconds after the
-				// container is started (after image pull, etc), and relies on the kubelet
-				// for PeriodSeconds heartbeats to evaluate health.
 				InitialDelaySeconds: gs.Spec.Health.InitialDelaySeconds,
 				PeriodSeconds:       gs.Spec.Health.PeriodSeconds,
-
-				// By the time /gshealthz returns unhealthy, the sidecar has already evaluated
-				// FailureThreshold in a row failed health checks, so on the kubelet side, one
-				// failure is sufficient to know the game server is unhealthy.
-				FailureThreshold: 1,
+				FailureThreshold:    gs.Spec.Health.FailureThreshold,
 			}
 		}
 
diff --git a/pkg/gameservers/controller_test.go b/pkg/gameservers/controller_test.go
index 9b3ffe8d32..e4c8dde313 100644
--- a/pkg/gameservers/controller_test.go
+++ b/pkg/gameservers/controller_test.go
@@ -1061,7 +1061,7 @@ func TestControllerCreateGameServerPod(t *testing.T) {
 			assert.Equal(t, intstr.FromInt(8080), gsContainer.LivenessProbe.HTTPGet.Port)
 			assert.Equal(t, fixture.Spec.Health.InitialDelaySeconds, gsContainer.LivenessProbe.InitialDelaySeconds)
 			assert.Equal(t, fixture.Spec.Health.PeriodSeconds, gsContainer.LivenessProbe.PeriodSeconds)
-			assert.Equal(t, int32(1), gsContainer.LivenessProbe.FailureThreshold)
+			assert.Equal(t, fixture.Spec.Health.FailureThreshold, gsContainer.LivenessProbe.FailureThreshold)
 			assert.Len(t, gsContainer.VolumeMounts, 1)
 			assert.Equal(t, "/var/run/secrets/kubernetes.io/serviceaccount", gsContainer.VolumeMounts[0].MountPath)
 
@@ -1730,7 +1730,7 @@ func TestControllerAddGameServerHealthCheck(t *testing.T) {
 	require.NotNil(t, probe)
 	assert.Equal(t, "/gshealthz", probe.HTTPGet.Path)
 	assert.Equal(t, intstr.IntOrString{IntVal: 8080}, probe.HTTPGet.Port)
-	assert.Equal(t, int32(1), probe.FailureThreshold)
+	assert.Equal(t, fixture.Spec.Health.FailureThreshold, probe.FailureThreshold)
 	assert.Equal(t, fixture.Spec.Health.InitialDelaySeconds, probe.InitialDelaySeconds)
 	assert.Equal(t, fixture.Spec.Health.PeriodSeconds, probe.PeriodSeconds)
 }
diff --git a/pkg/sdkserver/sdkserver.go b/pkg/sdkserver/sdkserver.go
index b751b6033b..80f66f3b04 100644
--- a/pkg/sdkserver/sdkserver.go
+++ b/pkg/sdkserver/sdkserver.go
@@ -42,6 +42,7 @@ import (
 	apiequality "k8s.io/apimachinery/pkg/api/equality"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/fields"
+	"k8s.io/apimachinery/pkg/util/wait"
 	"k8s.io/client-go/kubernetes"
 	"k8s.io/client-go/kubernetes/scheme"
 	k8sv1 "k8s.io/client-go/kubernetes/typed/core/v1"
@@ -163,7 +164,7 @@ func NewSDKServer(gameServerName, namespace string, kubeClient kubernetes.Interf
 		}
 	})
 	mux.HandleFunc("/gshealthz", func(w http.ResponseWriter, r *http.Request) {
-		if s.runHealth() {
+		if s.healthy() {
 			_, err := w.Write([]byte("ok"))
 			if err != nil {
 				s.logger.WithError(err).Error("could not send ok response on gshealthz")
@@ -187,6 +188,12 @@ func NewSDKServer(gameServerName, namespace string, kubeClient kubernetes.Interf
 	return s, nil
 }
 
+// initHealthLastUpdated adds the initial delay to now, then it will always be after `now`
+// until the delay passes
+func (s *SDKServer) initHealthLastUpdated(healthInitialDelay time.Duration) {
+	s.healthLastUpdated = s.clock.Now().UTC().Add(healthInitialDelay)
+}
+
 // Run processes the rate limited queue.
 // Will block until stop is closed
 func (s *SDKServer) Run(ctx context.Context) error {
@@ -222,7 +229,7 @@ func (s *SDKServer) Run(ctx context.Context) error {
 	s.health = gs.Spec.Health
 	s.logger.WithField("health", s.health).Debug("Setting health configuration")
 	s.healthTimeout = time.Duration(gs.Spec.Health.PeriodSeconds) * time.Second
-	s.touchHealthLastUpdated()
+	s.initHealthLastUpdated(time.Duration(gs.Spec.Health.InitialDelaySeconds) * time.Second)
 
 	if gs.Status.State == agonesv1.GameServerStateReserved && gs.Status.ReservedUntil != nil {
 		s.gsUpdateMutex.Lock()
@@ -233,6 +240,7 @@ func (s *SDKServer) Run(ctx context.Context) error {
 	// start health checking running
 	if !s.health.Disabled {
 		s.logger.Debug("Starting GameServer health checking")
+		go wait.Until(s.runHealth, s.healthTimeout, ctx.Done())
 	}
 
 	// populate player tracking values
@@ -423,10 +431,7 @@ func (s *SDKServer) updateAnnotations(ctx context.Context) error {
 // workerqueue
 func (s *SDKServer) enqueueState(state agonesv1.GameServerState) {
 	s.gsUpdateMutex.Lock()
-	// Update cached state, but prevent transitions out of `Unhealthy` by the SDK.
-	if s.gsState != agonesv1.GameServerStateUnhealthy {
-		s.gsState = state
-	}
+	s.gsState = state
 	s.gsUpdateMutex.Unlock()
 	s.workerqueue.Enqueue(cache.ExplicitKey(string(updateState)))
 }
@@ -809,17 +814,15 @@ func (s *SDKServer) sendGameServerUpdate(gs *agonesv1.GameServer) {
 	}
 }
 
-// runHealth checks the health as part of the /gshealthz hook, and if not
-// healthy will push the Unhealthy state into the queue so it can be updated.
-// Returns current health.
-func (s *SDKServer) runHealth() bool {
+// runHealth actively checks the health, and if not
+// healthy will push the Unhealthy state into the queue so
+// it can be updated
+func (s *SDKServer) runHealth() {
 	s.checkHealth()
-	if s.healthy() {
-		return true
+	if !s.healthy() {
+		s.logger.WithField("gameServerName", s.gameServerName).Warn("GameServer has failed health check")
+		s.enqueueState(agonesv1.GameServerStateUnhealthy)
 	}
-	s.logger.WithField("gameServerName", s.gameServerName).Warn("GameServer has failed health check")
-	s.enqueueState(agonesv1.GameServerStateUnhealthy)
-	return false
 }
 
 // touchHealthLastUpdated sets the healthLastUpdated
@@ -835,11 +838,10 @@ func (s *SDKServer) touchHealthLastUpdated() {
 // and if it is outside the timeout value, logger and
 // count a failure
 func (s *SDKServer) checkHealth() {
-	s.healthMutex.Lock()
-	defer s.healthMutex.Unlock()
-
 	timeout := s.healthLastUpdated.Add(s.healthTimeout)
 	if timeout.Before(s.clock.Now().UTC()) {
+		s.healthMutex.Lock()
+		defer s.healthMutex.Unlock()
 		s.healthFailureCount++
 		s.logger.WithField("failureCount", s.healthFailureCount).Warn("GameServer Health Check failed")
 	}
diff --git a/pkg/sdkserver/sdkserver_test.go b/pkg/sdkserver/sdkserver_test.go
index d3b04cc602..907209271b 100644
--- a/pkg/sdkserver/sdkserver_test.go
+++ b/pkg/sdkserver/sdkserver_test.go
@@ -80,10 +80,8 @@ func TestSidecarRun(t *testing.T) {
 		},
 		"unhealthy": {
 			f: func(sc *SDKServer, ctx context.Context) {
-				time.Sleep(1 * time.Second)
-				sc.runHealth()              // normally invoked from /gshealthz handler
-				time.Sleep(2 * time.Second) // exceed 1s timeout
-				sc.runHealth()              // normally invoked from /gshealthz handler
+				// we have a 1 second timeout
+				time.Sleep(2 * time.Second)
 			},
 			expected: expected{
 				state:      agonesv1.GameServerStateUnhealthy,
@@ -477,12 +475,6 @@ func TestSidecarUnhealthyMessage(t *testing.T) {
 	// manually push through an unhealthy state change
 	sc.enqueueState(agonesv1.GameServerStateUnhealthy)
 	agtesting.AssertEventContains(t, m.FakeRecorder.Events, "Health check failure")
-
-	// try to push back to Ready, enqueueState should block it.
-	sc.enqueueState(agonesv1.GameServerStateRequestReady)
-	sc.gsUpdateMutex.Lock()
-	assert.Equal(t, agonesv1.GameServerStateUnhealthy, sc.gsState)
-	sc.gsUpdateMutex.Unlock()
 }
 
 func TestSidecarHealthy(t *testing.T) {
@@ -495,7 +487,7 @@ func TestSidecarHealthy(t *testing.T) {
 	// manually set the values
 	sc.health = agonesv1.Health{FailureThreshold: 1}
 	sc.healthTimeout = 5 * time.Second
-	sc.touchHealthLastUpdated()
+	sc.initHealthLastUpdated(0 * time.Second)
 
 	now := time.Now().UTC()
 	fc := testclocks.NewFakeClock(now)
@@ -540,12 +532,15 @@ func TestSidecarHealthy(t *testing.T) {
 	t.Run("initial delay", func(t *testing.T) {
 		sc.health.Disabled = false
 		fc.SetTime(time.Now().UTC())
-		sc.touchHealthLastUpdated()
+		sc.initHealthLastUpdated(0)
+		sc.healthFailureCount = 0
+		sc.checkHealth()
+		assert.True(t, sc.healthy())
 
-		// initial delay is handled by kubelet, runHealth() isn't
-		// called until container starts.
-		fc.Step(10 * time.Second)
-		sc.touchHealthLastUpdated()
+		sc.initHealthLastUpdated(10 * time.Second)
+		sc.checkHealth()
+		assert.True(t, sc.healthy())
+		fc.Step(9 * time.Second)
 		sc.checkHealth()
 		assert.True(t, sc.healthy())
 
@@ -558,7 +553,8 @@ func TestSidecarHealthy(t *testing.T) {
 		sc.health.Disabled = false
 		sc.health.FailureThreshold = 3
 		fc.SetTime(time.Now().UTC())
-		sc.touchHealthLastUpdated()
+		sc.initHealthLastUpdated(0)
+		sc.healthFailureCount = 0
 
 		sc.checkHealth()
 		assert.True(t, sc.healthy())
@@ -626,8 +622,8 @@ func TestSidecarHTTPHealthCheck(t *testing.T) {
 
 	fc.Step(step)
 	time.Sleep(step)
-	testHTTPHealth(t, "http://localhost:8080/gshealthz", "", http.StatusInternalServerError) // force runHealth to run
 	assert.False(t, sc.healthy())
+	testHTTPHealth(t, "http://localhost:8080/gshealthz", "", http.StatusInternalServerError)
 	cancel()
 	wg.Wait() // wait for go routine test results.
 }
diff --git a/test/e2e/gameserver_test.go b/test/e2e/gameserver_test.go
index f9ea9d92eb..56a9644240 100644
--- a/test/e2e/gameserver_test.go
+++ b/test/e2e/gameserver_test.go
@@ -1253,44 +1253,3 @@ func TestGracefulShutdown(t *testing.T) {
 	log.WithField("diff", diff).Info("Time difference")
 	require.Less(t, diff, 40)
 }
-
-func TestGameServerSlowStart(t *testing.T) {
-	t.Parallel()
-
-	// Inject an additional game server sidecar that forces a delayed start
-	// to the main game server container following the pattern at
-	// https://medium.com/@marko.luksa/delaying-application-start-until-sidecar-is-ready-2ec2d21a7b74
-	gs := framework.DefaultGameServer(framework.Namespace)
-	gs.Spec.Template.Spec.Containers = append(
-		[]corev1.Container{{
-			Name:            "delay-game-server-start",
-			Image:           "alpine:latest",
-			ImagePullPolicy: corev1.PullIfNotPresent,
-			Command:         []string{"sleep", "3600"},
-			Lifecycle: &corev1.Lifecycle{
-				PostStart: &corev1.LifecycleHandler{
-					Exec: &corev1.ExecAction{
-						Command: []string{"sleep", "60"},
-					},
-				},
-			},
-			Resources: corev1.ResourceRequirements{
-				Requests: corev1.ResourceList{
-					corev1.ResourceCPU:    resource.MustParse("30m"),
-					corev1.ResourceMemory: resource.MustParse("64Mi"),
-				},
-				Limits: corev1.ResourceList{
-					corev1.ResourceCPU:    resource.MustParse("30m"),
-					corev1.ResourceMemory: resource.MustParse("64Mi"),
-				},
-			},
-		}},
-		gs.Spec.Template.Spec.Containers...)
-
-	// Validate that a game server whose primary container starts slowly (a full minute
-	// after the SDK starts) is capable of reaching Ready. Here we force the condition
-	// with a lifecycle hook, but it imitates a slow image pull, or other container
-	// start delays.
-	_, err := framework.CreateGameServerAndWaitUntilReady(t, framework.Namespace, gs)
-	assert.NoError(t, err)
-}