test/kubernetes/testcluster: Fix WaitForDaemonset timeout case.
Prior to this change, `WaitForDaemonset` would erroneously return a
`nil` error when the context expired.

Also make error messages in `WaitForDaemonset` more helpful.
This includes bailing out of the watch-based loop slightly before the
context expires and falling back to the poll-based loop, which produces
a more descriptive error message.

PiperOrigin-RevId: 700805708
EtiennePerot authored and gvisor-bot committed Nov 27, 2024
1 parent 00dcbab commit e27d27a
Showing 3 changed files with 29 additions and 14 deletions.
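
The heart of the change is a context whose deadline sits slightly before the caller's, so the watch-based wait gives up while there is still time for a final poll. Below is a minimal standalone sketch of that pattern; `deriveEarlyCtx` is a hypothetical helper name, not the gVisor code itself, which inlines this logic in `WaitForDaemonset` as shown in the diff further down.

package waitutil // hypothetical package, for illustration only

import (
	"context"
	"time"
)

// deriveEarlyCtx returns a child context that expires `margin` before
// ctx's own deadline, provided ctx has a deadline and the margin fits;
// otherwise it returns a plain cancelable child of ctx.
func deriveEarlyCtx(ctx context.Context, margin time.Duration) (context.Context, context.CancelFunc) {
	if deadline, ok := ctx.Deadline(); ok && time.Until(deadline) > margin {
		return context.WithDeadline(ctx, deadline.Add(-margin))
	}
	return context.WithCancel(ctx)
}

A watch loop would then select on the derived context and break out early, while the poll loop that follows keeps the original context and reports its error once that one finally expires.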
1 change: 0 additions & 1 deletion test/kubernetes/k8sctx/BUILD
@@ -18,7 +18,6 @@ go_library(
],
deps = [
"//runsc/flag",
"//test/kubernetes:test_range_config_go_proto",
"//test/kubernetes/testcluster",
"//tools/gvisor_k8s_tool/provider/kubectl",
"@org_golang_google_protobuf//encoding/prototext:go_default_library",
3 changes: 1 addition & 2 deletions test/kubernetes/k8sctx/k8sctx_impl.go
@@ -27,7 +27,6 @@ import (
"google.golang.org/protobuf/encoding/prototext"
"google.golang.org/protobuf/types/known/anypb"
"gvisor.dev/gvisor/runsc/flag"
testpb "gvisor.dev/gvisor/test/kubernetes/test_range_config_go_proto"
"gvisor.dev/gvisor/test/kubernetes/testcluster"
"gvisor.dev/gvisor/tools/gvisor_k8s_tool/provider/kubectl"
)
@@ -63,7 +62,7 @@ func newKubectlContext(ctx context.Context) (KubernetesContext, error) {
if err = prototext.Unmarshal(clusterBytes, &clusterPB); err != nil {
return nil, fmt.Errorf("cannot unmarshal cluster textproto file %q: %w", *clusterProtoPath, err)
}
testCluster := testcluster.NewTestClusterWithClient(&testpb.Cluster{Cluster: &clusterPB}, cluster.Client())
testCluster := testcluster.NewTestClusterFromClient(*kubectlContextName, cluster.Client())
if *testNodepoolRuntime != "" {
testCluster.OverrideTestNodepoolRuntime(testcluster.RuntimeType(*testNodepoolRuntime))
}
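
The hunk above swaps the proto-based constructor for `NewTestClusterFromClient`, which takes the kubectl context name and an already-built client. For orientation only, the plain client-go way to obtain such a client for a named kubeconfig context looks roughly like the sketch below; the real code relies on the kubectl provider's `cluster.Client()` shown above, not on this helper.

import (
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/tools/clientcmd"
)

// clientForContext builds a typed clientset for the named kubeconfig
// context using the default loading rules (KUBECONFIG, ~/.kube/config).
func clientForContext(contextName string) (*kubernetes.Clientset, error) {
	loader := clientcmd.NewDefaultClientConfigLoadingRules()
	overrides := &clientcmd.ConfigOverrides{CurrentContext: contextName}
	restCfg, err := clientcmd.NewNonInteractiveDeferredLoadingClientConfig(loader, overrides).ClientConfig()
	if err != nil {
		return nil, err
	}
	return kubernetes.NewForConfig(restCfg)
}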
39 changes: 28 additions & 11 deletions test/kubernetes/testcluster/testcluster.go
@@ -741,27 +741,44 @@ func (t *TestCluster) WaitForDaemonset(ctx context.Context, ds *appsv1.DaemonSet
defer w.Stop()
var lastDS *appsv1.DaemonSet

for daemonSetReady := false; !daemonSetReady; {
// Wait up to 15 seconds before the context deadline.
// We don't wait all the way up to the context deadline, because we
// want to have enough time to do a manual poll after the watch loop
// in order to get the current state of the DaemonSet and provide a
// better error message in case the API server is down or slow.
const watchTimeoutEarly = 15 * time.Second
readyCtx := ctx
ctxDeadline, hasDeadline := readyCtx.Deadline()
if hasDeadline && ctxDeadline.Sub(time.Now()) > watchTimeoutEarly {
var cancel context.CancelFunc
readyCtx, cancel = context.WithDeadline(ctx, ctxDeadline.Add(-watchTimeoutEarly))
defer cancel()
}

watchLoop:
for {
select {
case <-ctx.Done():
if lastDS != nil {
return fmt.Errorf("context canceled before healthy; last DaemonSet status: %#v", lastDS.Status)
}
return fmt.Errorf("context canceled before healthy")
case <-readyCtx.Done():
// Give up the watch loop. Most likely the API server is down or the
// DaemonSet can't schedule. Either way, the poll-based loop below
// will provide a better error message.
log.Infof("Watching DaemonSet %q: watch loop timed out (%v); falling back to poll-based loop.", ds.GetName(), readyCtx.Err())
break watchLoop
case e, ok := <-w.ResultChan():
d, ok := e.Object.(*appsv1.DaemonSet)
if !ok {
return fmt.Errorf("invalid object type: %T", d)
}
lastDS = d
if d.Status.NumberReady == d.Status.DesiredNumberScheduled && d.Status.DesiredNumberScheduled > 0 && d.Status.NumberUnavailable == 0 {
daemonSetReady = true
break watchLoop
}
}
}

// Now wait for the pods to be running.
for ctx.Err() == nil {
// Poll-based loop to wait for the DaemonSet to be ready.
var lastBadPod v13.Pod
for {
pods, err := t.GetPodsInDaemonSet(ctx, ds)
if err != nil {
return fmt.Errorf("failed to get pods in daemonset: %v", err)
@@ -775,6 +792,7 @@ func (t *TestCluster) WaitForDaemonset(ctx context.Context, ds *appsv1.DaemonSet
case v13.PodRunning, v13.PodSucceeded:
// OK, do nothing.
default:
lastBadPod = pod
allOK = false
}
}
@@ -783,11 +801,10 @@ func (t *TestCluster) WaitForDaemonset(ctx context.Context, ds *appsv1.DaemonSet
}
select {
case <-ctx.Done():
return ctx.Err()
return fmt.Errorf("context expired waiting for daemonset %q: %w; last bad pod: %v", ds.GetName(), ctx.Err(), lastBadPod)
case <-time.After(100 * time.Millisecond):
}
}
return nil
}

// StreamDaemonSetLogs streams the contents of a container from the given
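
For completeness, a hedged sketch of how a test might drive the updated `WaitForDaemonset`; the `ctx`, `cluster`, and `ds` values, the 5-minute budget, and the `testing.T` plumbing are assumptions for illustration, not taken from this commit.

// Give the wait a bounded budget. The watch phase above stops roughly
// 15 seconds before this deadline, so the poll phase still has time to
// report which pod was unhealthy when the context finally expires.
waitCtx, cancel := context.WithTimeout(ctx, 5*time.Minute)
defer cancel()
if err := cluster.WaitForDaemonset(waitCtx, ds); err != nil {
	t.Fatalf("DaemonSet %q did not become ready: %v", ds.GetName(), err)
}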
