Skip to content

Commit

Permalink
Close consistency race in syncGameServerRequestReadyState
Browse files Browse the repository at this point in the history
If the SDK and controller win the race to update the Pod with the
GameServerReadyContainerIDAnnotation before kubelet even gets a chance
to add the running containers to the Pod, the controller may update
the pod with an empty annotation, which then confuses further runs.

Fixes TestPlayerConnectWithCapacityZero flakes

May fully fix googleforgames#2445 as well
  • Loading branch information
zmerlynn committed Apr 5, 2023
1 parent cf12a85 commit 5ac3dc6
Show file tree
Hide file tree
Showing 2 changed files with 37 additions and 0 deletions.
4 changes: 4 additions & 0 deletions pkg/gameservers/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -867,6 +867,10 @@ func (c *Controller) syncGameServerRequestReadyState(ctx context.Context, gs *ag
break
}
}
// Verify that we found the game server container - we may have a stale cache where pod is missing ContainerStatuses.
if _, ok := gsCopy.ObjectMeta.Annotations[agonesv1.GameServerReadyContainerIDAnnotation]; !ok {
return nil, workerqueue.NewDebugError(fmt.Errorf("game server container for GameServer %s in namespace %s not present in pod status, try again", gsCopy.ObjectMeta.Name, gsCopy.ObjectMeta.Namespace))
}

// Also update the pod with the same annotation, so we can check if the Pod data is up-to-date, now and also in the HealthController.
// But if it is already set, then ignore it, since we only need to do this one time.
Expand Down
33 changes: 33 additions & 0 deletions pkg/gameservers/controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1495,6 +1495,39 @@ func TestControllerSyncGameServerRequestReadyState(t *testing.T) {
assert.False(t, podUpdated, "Pod was updated")
})

t.Run("GameServer whose pod is missing ContainerStatuses, so should retry and not update", func(t *testing.T) {
c, m := newFakeController()

gsFixture := &agonesv1.GameServer{ObjectMeta: metav1.ObjectMeta{Name: "test", Namespace: "default"},
Spec: newSingleContainerSpec(), Status: agonesv1.GameServerStatus{State: agonesv1.GameServerStateRequestReady}}
gsFixture.ApplyDefaults()
gsFixture.Status.NodeName = nodeName
pod, err := gsFixture.Pod(agtesting.FakeAPIHooks{})
assert.Nil(t, err)
gsUpdated := false
podUpdated := false

m.KubeClient.AddReactor("list", "pods", func(action k8stesting.Action) (bool, runtime.Object, error) {
return true, &corev1.PodList{Items: []corev1.Pod{*pod}}, nil
})
m.AgonesClient.AddReactor("update", "gameservers", func(action k8stesting.Action) (bool, runtime.Object, error) {
gsUpdated = true
return true, nil, nil
})
m.KubeClient.AddReactor("update", "pods", func(action k8stesting.Action) (bool, runtime.Object, error) {
podUpdated = true
return true, nil, nil
})

ctx, cancel := agtesting.StartInformers(m, c.podSynced)
defer cancel()

_, err = c.syncGameServerRequestReadyState(ctx, gsFixture)
assert.EqualError(t, err, "game server container for GameServer test in namespace default not present in pod status, try again")
assert.False(t, gsUpdated, "GameServer was updated")
assert.False(t, podUpdated, "Pod was updated")
})

t.Run("GameServer with non zero deletion datetime", func(t *testing.T) {
testWithNonZeroDeletionTimestamp(t, func(c *Controller, fixture *agonesv1.GameServer) (*agonesv1.GameServer, error) {
return c.syncGameServerRequestReadyState(context.Background(), fixture)
Expand Down

0 comments on commit 5ac3dc6

Please sign in to comment.