Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Revert "Rework game server health initial delay handling (#3046)" #3068

Merged
merged 1 commit into from
Apr 4, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions cloudbuild.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -307,6 +307,10 @@ steps:
for version in "${!versionsAndRegions[@]}"
do
region=${versionsAndRegions[$version]}
if [ $cloudProduct = gke-autopilot ] && [ $version = 1.26 ]
then
continue
fi
if [ $cloudProduct = generic ]
then
featureWithGate="CustomFasSyncInterval=false&SafeToEvict=false&SDKGracefulTermination=false&StateAllocationFilter=false&PlayerAllocationFilter=true&PlayerTracking=true&ResetMetricsOnDelete=true&PodHostname=true&SplitControllerAndExtensions=true&FleetAllocationOverflow=true&Example=true"
Expand Down
9 changes: 1 addition & 8 deletions pkg/gameservers/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -705,16 +705,9 @@ func (c *Controller) addGameServerHealthCheck(gs *agonesv1.GameServer, pod *core
Port: intstr.FromInt(8080),
},
},
// The sidecar relies on kubelet to delay by InitialDelaySeconds after the
// container is started (after image pull, etc), and relies on the kubelet
// for PeriodSeconds heartbeats to evaluate health.
InitialDelaySeconds: gs.Spec.Health.InitialDelaySeconds,
PeriodSeconds: gs.Spec.Health.PeriodSeconds,

// By the time /gshealthz returns unhealthy, the sidecar has already evaluated
// FailureThreshold in a row failed health checks, so on the kubelet side, one
// failure is sufficient to know the game server is unhealthy.
FailureThreshold: 1,
FailureThreshold: gs.Spec.Health.FailureThreshold,
}
}

Expand Down
4 changes: 2 additions & 2 deletions pkg/gameservers/controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1061,7 +1061,7 @@ func TestControllerCreateGameServerPod(t *testing.T) {
assert.Equal(t, intstr.FromInt(8080), gsContainer.LivenessProbe.HTTPGet.Port)
assert.Equal(t, fixture.Spec.Health.InitialDelaySeconds, gsContainer.LivenessProbe.InitialDelaySeconds)
assert.Equal(t, fixture.Spec.Health.PeriodSeconds, gsContainer.LivenessProbe.PeriodSeconds)
assert.Equal(t, int32(1), gsContainer.LivenessProbe.FailureThreshold)
assert.Equal(t, fixture.Spec.Health.FailureThreshold, gsContainer.LivenessProbe.FailureThreshold)
assert.Len(t, gsContainer.VolumeMounts, 1)
assert.Equal(t, "/var/run/secrets/kubernetes.io/serviceaccount", gsContainer.VolumeMounts[0].MountPath)

Expand Down Expand Up @@ -1730,7 +1730,7 @@ func TestControllerAddGameServerHealthCheck(t *testing.T) {
require.NotNil(t, probe)
assert.Equal(t, "/gshealthz", probe.HTTPGet.Path)
assert.Equal(t, intstr.IntOrString{IntVal: 8080}, probe.HTTPGet.Port)
assert.Equal(t, int32(1), probe.FailureThreshold)
assert.Equal(t, fixture.Spec.Health.FailureThreshold, probe.FailureThreshold)
assert.Equal(t, fixture.Spec.Health.InitialDelaySeconds, probe.InitialDelaySeconds)
assert.Equal(t, fixture.Spec.Health.PeriodSeconds, probe.PeriodSeconds)
}
Expand Down
38 changes: 20 additions & 18 deletions pkg/sdkserver/sdkserver.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ import (
apiequality "k8s.io/apimachinery/pkg/api/equality"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/fields"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/client-go/kubernetes"
"k8s.io/client-go/kubernetes/scheme"
k8sv1 "k8s.io/client-go/kubernetes/typed/core/v1"
Expand Down Expand Up @@ -163,7 +164,7 @@ func NewSDKServer(gameServerName, namespace string, kubeClient kubernetes.Interf
}
})
mux.HandleFunc("/gshealthz", func(w http.ResponseWriter, r *http.Request) {
if s.runHealth() {
if s.healthy() {
_, err := w.Write([]byte("ok"))
if err != nil {
s.logger.WithError(err).Error("could not send ok response on gshealthz")
Expand All @@ -187,6 +188,12 @@ func NewSDKServer(gameServerName, namespace string, kubeClient kubernetes.Interf
return s, nil
}

// initHealthLastUpdated adds the initial delay to now, then it will always be after `now`
// until the delay passes
func (s *SDKServer) initHealthLastUpdated(healthInitialDelay time.Duration) {
s.healthLastUpdated = s.clock.Now().UTC().Add(healthInitialDelay)
}

// Run processes the rate limited queue.
// Will block until stop is closed
func (s *SDKServer) Run(ctx context.Context) error {
Expand Down Expand Up @@ -222,7 +229,7 @@ func (s *SDKServer) Run(ctx context.Context) error {
s.health = gs.Spec.Health
s.logger.WithField("health", s.health).Debug("Setting health configuration")
s.healthTimeout = time.Duration(gs.Spec.Health.PeriodSeconds) * time.Second
s.touchHealthLastUpdated()
s.initHealthLastUpdated(time.Duration(gs.Spec.Health.InitialDelaySeconds) * time.Second)

if gs.Status.State == agonesv1.GameServerStateReserved && gs.Status.ReservedUntil != nil {
s.gsUpdateMutex.Lock()
Expand All @@ -233,6 +240,7 @@ func (s *SDKServer) Run(ctx context.Context) error {
// start health checking running
if !s.health.Disabled {
s.logger.Debug("Starting GameServer health checking")
go wait.Until(s.runHealth, s.healthTimeout, ctx.Done())
}

// populate player tracking values
Expand Down Expand Up @@ -423,10 +431,7 @@ func (s *SDKServer) updateAnnotations(ctx context.Context) error {
// workerqueue
func (s *SDKServer) enqueueState(state agonesv1.GameServerState) {
s.gsUpdateMutex.Lock()
// Update cached state, but prevent transitions out of `Unhealthy` by the SDK.
if s.gsState != agonesv1.GameServerStateUnhealthy {
s.gsState = state
}
s.gsState = state
s.gsUpdateMutex.Unlock()
s.workerqueue.Enqueue(cache.ExplicitKey(string(updateState)))
}
Expand Down Expand Up @@ -809,17 +814,15 @@ func (s *SDKServer) sendGameServerUpdate(gs *agonesv1.GameServer) {
}
}

// runHealth checks the health as part of the /gshealthz hook, and if not
// healthy will push the Unhealthy state into the queue so it can be updated.
// Returns current health.
func (s *SDKServer) runHealth() bool {
// runHealth actively checks the health, and if not
// healthy will push the Unhealthy state into the queue so
// it can be updated
func (s *SDKServer) runHealth() {
s.checkHealth()
if s.healthy() {
return true
if !s.healthy() {
s.logger.WithField("gameServerName", s.gameServerName).Warn("GameServer has failed health check")
s.enqueueState(agonesv1.GameServerStateUnhealthy)
}
s.logger.WithField("gameServerName", s.gameServerName).Warn("GameServer has failed health check")
s.enqueueState(agonesv1.GameServerStateUnhealthy)
return false
}

// touchHealthLastUpdated sets the healthLastUpdated
Expand All @@ -835,11 +838,10 @@ func (s *SDKServer) touchHealthLastUpdated() {
// and if it is outside the timeout value, logger and
// count a failure
func (s *SDKServer) checkHealth() {
s.healthMutex.Lock()
defer s.healthMutex.Unlock()

timeout := s.healthLastUpdated.Add(s.healthTimeout)
if timeout.Before(s.clock.Now().UTC()) {
s.healthMutex.Lock()
defer s.healthMutex.Unlock()
s.healthFailureCount++
s.logger.WithField("failureCount", s.healthFailureCount).Warn("GameServer Health Check failed")
}
Expand Down
32 changes: 14 additions & 18 deletions pkg/sdkserver/sdkserver_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -80,10 +80,8 @@ func TestSidecarRun(t *testing.T) {
},
"unhealthy": {
f: func(sc *SDKServer, ctx context.Context) {
time.Sleep(1 * time.Second)
sc.runHealth() // normally invoked from /gshealthz handler
time.Sleep(2 * time.Second) // exceed 1s timeout
sc.runHealth() // normally invoked from /gshealthz handler
// we have a 1 second timeout
time.Sleep(2 * time.Second)
},
expected: expected{
state: agonesv1.GameServerStateUnhealthy,
Expand Down Expand Up @@ -477,12 +475,6 @@ func TestSidecarUnhealthyMessage(t *testing.T) {
// manually push through an unhealthy state change
sc.enqueueState(agonesv1.GameServerStateUnhealthy)
agtesting.AssertEventContains(t, m.FakeRecorder.Events, "Health check failure")

// try to push back to Ready, enqueueState should block it.
sc.enqueueState(agonesv1.GameServerStateRequestReady)
sc.gsUpdateMutex.Lock()
assert.Equal(t, agonesv1.GameServerStateUnhealthy, sc.gsState)
sc.gsUpdateMutex.Unlock()
}

func TestSidecarHealthy(t *testing.T) {
Expand All @@ -495,7 +487,7 @@ func TestSidecarHealthy(t *testing.T) {
// manually set the values
sc.health = agonesv1.Health{FailureThreshold: 1}
sc.healthTimeout = 5 * time.Second
sc.touchHealthLastUpdated()
sc.initHealthLastUpdated(0 * time.Second)

now := time.Now().UTC()
fc := testclocks.NewFakeClock(now)
Expand Down Expand Up @@ -540,12 +532,15 @@ func TestSidecarHealthy(t *testing.T) {
t.Run("initial delay", func(t *testing.T) {
sc.health.Disabled = false
fc.SetTime(time.Now().UTC())
sc.touchHealthLastUpdated()
sc.initHealthLastUpdated(0)
sc.healthFailureCount = 0
sc.checkHealth()
assert.True(t, sc.healthy())

// initial delay is handled by kubelet, runHealth() isn't
// called until container starts.
fc.Step(10 * time.Second)
sc.touchHealthLastUpdated()
sc.initHealthLastUpdated(10 * time.Second)
sc.checkHealth()
assert.True(t, sc.healthy())
fc.Step(9 * time.Second)
sc.checkHealth()
assert.True(t, sc.healthy())

Expand All @@ -558,7 +553,8 @@ func TestSidecarHealthy(t *testing.T) {
sc.health.Disabled = false
sc.health.FailureThreshold = 3
fc.SetTime(time.Now().UTC())
sc.touchHealthLastUpdated()
sc.initHealthLastUpdated(0)
sc.healthFailureCount = 0

sc.checkHealth()
assert.True(t, sc.healthy())
Expand Down Expand Up @@ -626,8 +622,8 @@ func TestSidecarHTTPHealthCheck(t *testing.T) {

fc.Step(step)
time.Sleep(step)
testHTTPHealth(t, "http://localhost:8080/gshealthz", "", http.StatusInternalServerError) // force runHealth to run
assert.False(t, sc.healthy())
testHTTPHealth(t, "http://localhost:8080/gshealthz", "", http.StatusInternalServerError)
cancel()
wg.Wait() // wait for go routine test results.
}
Expand Down
41 changes: 0 additions & 41 deletions test/e2e/gameserver_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1253,44 +1253,3 @@ func TestGracefulShutdown(t *testing.T) {
log.WithField("diff", diff).Info("Time difference")
require.Less(t, diff, 40)
}

func TestGameServerSlowStart(t *testing.T) {
t.Parallel()

// Inject an additional game server sidecar that forces a delayed start
// to the main game server container following the pattern at
// https://medium.com/@marko.luksa/delaying-application-start-until-sidecar-is-ready-2ec2d21a7b74
gs := framework.DefaultGameServer(framework.Namespace)
gs.Spec.Template.Spec.Containers = append(
[]corev1.Container{{
Name: "delay-game-server-start",
Image: "alpine:latest",
ImagePullPolicy: corev1.PullIfNotPresent,
Command: []string{"sleep", "3600"},
Lifecycle: &corev1.Lifecycle{
PostStart: &corev1.LifecycleHandler{
Exec: &corev1.ExecAction{
Command: []string{"sleep", "60"},
},
},
},
Resources: corev1.ResourceRequirements{
Requests: corev1.ResourceList{
corev1.ResourceCPU: resource.MustParse("30m"),
corev1.ResourceMemory: resource.MustParse("64Mi"),
},
Limits: corev1.ResourceList{
corev1.ResourceCPU: resource.MustParse("30m"),
corev1.ResourceMemory: resource.MustParse("64Mi"),
},
},
}},
gs.Spec.Template.Spec.Containers...)

// Validate that a game server whose primary container starts slowly (a full minute
// after the SDK starts) is capable of reaching Ready. Here we force the condition
// with a lifecycle hook, but it imitates a slow image pull, or other container
// start delays.
_, err := framework.CreateGameServerAndWaitUntilReady(t, framework.Namespace, gs)
assert.NoError(t, err)
}