From 6d641e190193212aa954e9d1c7f074f059d332ff Mon Sep 17 00:00:00 2001 From: Josh Wolf Date: Mon, 17 Jul 2023 09:02:19 -0400 Subject: [PATCH] add retries to a subset of k8s runner exec failures Signed-off-by: Josh Wolf --- pkg/container/kubernetes_runner.go | 44 +++++++++++++++++++++++++++--- 1 file changed, 40 insertions(+), 4 deletions(-) diff --git a/pkg/container/kubernetes_runner.go b/pkg/container/kubernetes_runner.go index d84cc7129..a552ef762 100644 --- a/pkg/container/kubernetes_runner.go +++ b/pkg/container/kubernetes_runner.go @@ -38,6 +38,7 @@ import ( "k8s.io/client-go/rest" "k8s.io/client-go/tools/clientcmd" "k8s.io/client-go/tools/remotecommand" + "k8s.io/client-go/util/exec" "knative.dev/pkg/ptr" "sigs.k8s.io/yaml" @@ -253,6 +254,18 @@ func (k *k8s) OCIImageLoader() Loader { // Exec runs a command on the pod func (k *k8s) Exec(ctx context.Context, podName string, cmd []string, streamOpts remotecommand.StreamOptions) error { + // The k8s runner has no concept of a "WorkingDir", so we prepend the standard + // command to root us in WorkingDir + if len(cmd) != 3 { + k.logger.Warnf("unknown command format, expected 3 elements but got %d, this might not work...", len(cmd)) + } else if cmd[0] != "/bin/sh" || cmd[1] != "-c" { + k.logger.Warnf("unknown command format, expected '/bin/sh -c' but got [%s %s], this might not work...", cmd[0], cmd[1]) + } else { + cmd[2] = fmt.Sprintf(`[ -d '%s' ] || mkdir -p '%s' +cd '%s' +%s`, runnerWorkdir, runnerWorkdir, runnerWorkdir, cmd[2]) + } + req := k.clientset. CoreV1(). RESTClient(). @@ -268,14 +281,37 @@ func (k *k8s) Exec(ctx context.Context, podName string, cmd []string, streamOpts Stderr: true, }, scheme.ParameterCodec) - k.logger.Infof("executing command %v", cmd) - exec, err := remotecommand.NewSPDYExecutor(k.restConfig, "POST", req.URL()) + executor, err := remotecommand.NewSPDYExecutor(k.restConfig, "POST", req.URL()) if err != nil { return fmt.Errorf("failed to create remote command executor: %v", err) } - if err := exec.StreamWithContext(ctx, streamOpts); err != nil { - return fmt.Errorf("failed to stream remote command: %v", err) + // Backoff up to 4 times with a 1 second initial delay, tripling each time + backoff := wait.Backoff{ + Steps: 4, + Duration: 1 * time.Second, + Factor: 3, + Jitter: 0.1, + } + + k.logger.Infof("remote executing command %v", cmd) + if err := wait.ExponentialBackoffWithContext(ctx, backoff, func(ctx context.Context) (bool, error) { + err := executor.StreamWithContext(ctx, streamOpts) + switch e := err.(type) { + case *exec.CodeExitError, exec.ExitError: + // Non recoverable error + k.logger.Warnf("non-recoverable error (%T) executing remote command: %v", e, err) + return false, err + case nil: + // Succeeded without error + return true, nil + } + + // Everything else is retryable without altering the existing build step + k.logger.Warnf("attempting to recover (%T) after failing to execute remote command: %v", err, err) + return false, nil + }); err != nil { + return fmt.Errorf("failed executing remote command: %v", err) } return nil