From 156d873a21fe6a16ca2106d8a0bb980ee5a787c2 Mon Sep 17 00:00:00 2001 From: EKS Distro PR Bot <75336432+eks-distro-pr-bot@users.noreply.github.com> Date: Fri, 14 Apr 2023 16:00:08 -0500 Subject: [PATCH] Add timeout to the e2e command (#5649) Co-authored-by: Abhinav Pandey --- internal/pkg/ssm/command.go | 36 ++++++++++++++++++----------- internal/test/e2e/artifacts.go | 6 ++--- internal/test/e2e/fluxGit.go | 6 ++--- internal/test/e2e/oidc.go | 2 +- internal/test/e2e/registryMirror.go | 4 ++-- internal/test/e2e/run.go | 11 ++++++++- internal/test/e2e/setup.go | 8 +++---- internal/test/e2e/tinkerbell.go | 2 +- 8 files changed, 46 insertions(+), 29 deletions(-) diff --git a/internal/pkg/ssm/command.go b/internal/pkg/ssm/command.go index 0db4af989a4e..dc1c130f4759 100644 --- a/internal/pkg/ssm/command.go +++ b/internal/pkg/ssm/command.go @@ -2,6 +2,8 @@ package ssm import ( "fmt" + "math" + "strconv" "time" "github.com/aws/aws-sdk-go/aws" @@ -17,9 +19,10 @@ const ssmLogGroup = "/eks-anywhere/test/e2e" var initE2EDirCommand = "mkdir -p /home/e2e/bin && cd /home/e2e" -func WaitForSSMReady(session *session.Session, instanceId string) error { +// WaitForSSMReady waits for the SSM command to be ready. +func WaitForSSMReady(session *session.Session, instanceID string, timeout time.Duration) error { err := retrier.Retry(10, 20*time.Second, func() error { - return Run(session, logr.Discard(), instanceId, "ls") + return Run(session, logr.Discard(), instanceID, "ls", timeout) }) if err != nil { return fmt.Errorf("waiting for ssm to be ready: %v", err) @@ -53,8 +56,9 @@ var nonFinalStatuses = map[string]struct{}{ ssm.CommandInvocationStatusInProgress: {}, ssm.CommandInvocationStatusDelayed: {}, ssm.CommandInvocationStatusPending: {}, } -func Run(session *session.Session, logger logr.Logger, instanceId, command string, opts ...CommandOpt) error { - o, err := RunCommand(session, logger, instanceId, command, opts...) +// Run runs the command using SSM on the instance corresponding to the instanceID. +func Run(session *session.Session, logger logr.Logger, instanceID, command string, timeout time.Duration, opts ...CommandOpt) error { + o, err := RunCommand(session, logger, instanceID, command, timeout, opts...) if err != nil { return err } @@ -65,21 +69,22 @@ func Run(session *session.Session, logger logr.Logger, instanceId, command strin return nil } -func RunCommand(session *session.Session, logger logr.Logger, instanceId, command string, opts ...CommandOpt) (*RunOutput, error) { +// RunCommand runs the command using SSM on the instance corresponding to the instanceID. +func RunCommand(session *session.Session, logger logr.Logger, instanceID, command string, timeout time.Duration, opts ...CommandOpt) (*RunOutput, error) { service := ssm.New(session) - result, err := sendCommand(service, logger, instanceId, command, opts...) + result, err := sendCommand(service, logger, instanceID, command, timeout, opts...) if err != nil { return nil, err } commandIn := &ssm.GetCommandInvocationInput{ CommandId: result.Command.CommandId, - InstanceId: aws.String(instanceId), + InstanceId: aws.String(instanceID), } // Make sure ssm send command is registered - logger.Info("Waiting for ssm command to be registered", "commandId", commandIn.CommandId, "instanceId", commandIn.InstanceId) + logger.Info("Waiting for ssm command to be registered", "commandId", commandIn.CommandId, "instanceID", commandIn.InstanceId) err = retrier.Retry(10, 5*time.Second, func() error { _, err := service.GetCommandInvocation(commandIn) if err != nil { @@ -94,7 +99,10 @@ func RunCommand(session *session.Session, logger logr.Logger, instanceId, comman logger.Info("Waiting for ssm command to finish") var commandOut *ssm.GetCommandInvocationOutput - r := retrier.New(300*time.Minute, retrier.WithMaxRetries(2160, 60*time.Second)) + + // Making the retrier wait for longer than the provided SSM timeout to make sure + // we always get the output results. + r := retrier.New(timeout+5*time.Minute, retrier.WithMaxRetries(math.MaxInt, 60*time.Second)) err = r.Retry(func() error { var err error commandOut, err = service.GetCommandInvocation(commandIn) @@ -117,11 +125,11 @@ func RunCommand(session *session.Session, logger logr.Logger, instanceId, comman return buildRunOutput(commandOut), nil } -func sendCommand(service *ssm.SSM, logger logr.Logger, instanceId, command string, opts ...CommandOpt) (*ssm.SendCommandOutput, error) { +func sendCommand(service *ssm.SSM, logger logr.Logger, instanceID, command string, timeout time.Duration, opts ...CommandOpt) (*ssm.SendCommandOutput, error) { in := &ssm.SendCommandInput{ DocumentName: aws.String("AWS-RunShellScript"), - InstanceIds: []*string{aws.String(instanceId)}, - Parameters: map[string][]*string{"commands": {aws.String(initE2EDirCommand), aws.String(command)}, "executionTimeout": {aws.String("18000")}}, + InstanceIds: []*string{aws.String(instanceID)}, + Parameters: map[string][]*string{"commands": {aws.String(initE2EDirCommand), aws.String(command)}, "executionTimeout": {aws.String(strconv.FormatFloat(timeout.Seconds(), 'f', 0, 64))}}, } for _, opt := range opts { @@ -129,7 +137,7 @@ func sendCommand(service *ssm.SSM, logger logr.Logger, instanceId, command strin } var result *ssm.SendCommandOutput - r := retrier.New(300*time.Minute, retrier.WithRetryPolicy(func(totalRetries int, err error) (retry bool, wait time.Duration) { + r := retrier.New(timeout, retrier.WithRetryPolicy(func(totalRetries int, err error) (retry bool, wait time.Duration) { if request.IsErrorThrottle(err) && totalRetries < 60 { return true, 60 * time.Second } @@ -152,7 +160,7 @@ func sendCommand(service *ssm.SSM, logger logr.Logger, instanceId, command strin if in.OutputS3BucketName != nil { logger.V(4).Info( "SSM command output to S3", "url", - fmt.Sprintf("s3://%s/%s/%s/%s/awsrunShellScript/0.awsrunShellScript/stderr", *in.OutputS3BucketName, *in.OutputS3KeyPrefix, *result.Command.CommandId, instanceId), + fmt.Sprintf("s3://%s/%s/%s/%s/awsrunShellScript/0.awsrunShellScript/stderr", *in.OutputS3BucketName, *in.OutputS3KeyPrefix, *result.Command.CommandId, instanceID), ) } diff --git a/internal/test/e2e/artifacts.go b/internal/test/e2e/artifacts.go index aeac5cf900d3..44accbe034d5 100644 --- a/internal/test/e2e/artifacts.go +++ b/internal/test/e2e/artifacts.go @@ -20,7 +20,7 @@ func (e *E2ESession) uploadGeneratedFilesFromInstance(testName string) { e.generatedArtifactsBucketPath(), testName, ).recursive().String() - if err := ssm.Run(e.session, logr.Discard(), e.instanceId, command); err != nil { + if err := ssm.Run(e.session, logr.Discard(), e.instanceId, command, ssmTimeout); err != nil { e.logger.Error(err, "error uploading log files from instance") } else { e.logger.V(1).Info("Successfully uploaded log files to S3") @@ -34,7 +34,7 @@ func (e *E2ESession) uploadDiagnosticArchiveFromInstance(testName string) { e.generatedArtifactsBucketPath(), testName, ).recursive().exclude("*").include(bundleNameFormat).String() - if err := ssm.Run(e.session, logr.Discard(), e.instanceId, command); err != nil { + if err := ssm.Run(e.session, logr.Discard(), e.instanceId, command, ssmTimeout); err != nil { e.logger.Error(err, "error uploading diagnostic bundle from instance") } else { e.logger.V(1).Info("Successfully uploaded diagnostic bundle files to S3") @@ -48,7 +48,7 @@ func (e *E2ESession) uploadJUnitReportFromInstance(testName string) { e.generatedArtifactsBucketPath(), testName, ).recursive().exclude("*").include(junitFile).String() - if err := ssm.Run(e.session, logr.Discard(), e.instanceId, command); err != nil { + if err := ssm.Run(e.session, logr.Discard(), e.instanceId, command, ssmTimeout); err != nil { e.logger.Error(err, "error uploading JUnit report from instance") } else { e.logger.V(1).Info("Successfully uploaded JUnit report files to S3") diff --git a/internal/test/e2e/fluxGit.go b/internal/test/e2e/fluxGit.go index b62e1b5ca013..e650f88e8b76 100644 --- a/internal/test/e2e/fluxGit.go +++ b/internal/test/e2e/fluxGit.go @@ -82,7 +82,7 @@ func (e *E2ESession) writeFileToInstance(file fileFromBytes) error { e.logger.V(1).Info("Writing bytes to file in instance", "file", file.dstPath) command := fmt.Sprintf("echo $'%s' >> %s && chmod %d %[2]s", file.contentString(), file.dstPath, file.permission) - if err := ssm.Run(e.session, logr.Discard(), e.instanceId, command); err != nil { + if err := ssm.Run(e.session, logr.Discard(), e.instanceId, command, ssmTimeout); err != nil { return fmt.Errorf("writing file in instance: %v", err) } e.logger.V(1).Info("Successfully wrote file", "file", file.dstPath) @@ -94,7 +94,7 @@ func (e *E2ESession) downloadFileInInstance(file s3Files) error { e.logger.V(1).Info("Downloading from s3 in instance", "file", file.key) command := fmt.Sprintf("aws s3 cp s3://%s/%s %s && chmod %d %[3]s", e.storageBucket, file.key, file.dstPath, file.permission) - if err := ssm.Run(e.session, logr.Discard(), e.instanceId, command); err != nil { + if err := ssm.Run(e.session, logr.Discard(), e.instanceId, command, ssmTimeout); err != nil { return fmt.Errorf("downloading file in instance: %v", err) } e.logger.V(1).Info("Successfully downloaded file", "file", file.key) @@ -105,7 +105,7 @@ func (e *E2ESession) downloadFileInInstance(file s3Files) error { func (e *E2ESession) setUpSshAgent(privateKeyFile string) error { command := fmt.Sprintf("eval $(ssh-agent -s) ssh-add %s", privateKeyFile) - if err := ssm.Run(e.session, logr.Discard(), e.instanceId, command); err != nil { + if err := ssm.Run(e.session, logr.Discard(), e.instanceId, command, ssmTimeout); err != nil { return fmt.Errorf("starting SSH agent on instance: %v", err) } e.logger.V(1).Info("Successfully started SSH agent on instance") diff --git a/internal/test/e2e/oidc.go b/internal/test/e2e/oidc.go index d1553dff916f..be7b219cf7e4 100644 --- a/internal/test/e2e/oidc.go +++ b/internal/test/e2e/oidc.go @@ -117,7 +117,7 @@ func (e *E2ESession) downloadSignerKeyInInstance(folder string) (pathInInstance e.logger.V(1).Info("Downloading from s3 in instance", "key", saSignerKey) command := fmt.Sprintf("aws s3 cp s3://%s/%s ./%s", e.storageBucket, saSignerKey, saSignerPath) - if err = ssm.Run(e.session, logr.Discard(), e.instanceId, command); err != nil { + if err = ssm.Run(e.session, logr.Discard(), e.instanceId, command, ssmTimeout); err != nil { return "", fmt.Errorf("downloading signer key in instance: %v", err) } e.logger.V(1).Info("Successfully downloaded signer key") diff --git a/internal/test/e2e/registryMirror.go b/internal/test/e2e/registryMirror.go index b35d77ab7817..a63c2c0ea7d2 100644 --- a/internal/test/e2e/registryMirror.go +++ b/internal/test/e2e/registryMirror.go @@ -77,7 +77,7 @@ func (e *E2ESession) setupRegistryMirrorEnv(testRegex string) error { func (e *E2ESession) mountRegistryCert(cert string, endpoint string) error { command := fmt.Sprintf("sudo mkdir -p /etc/docker/certs.d/%s", endpoint) - if err := ssm.Run(e.session, logr.Discard(), e.instanceId, command); err != nil { + if err := ssm.Run(e.session, logr.Discard(), e.instanceId, command, ssmTimeout); err != nil { return fmt.Errorf("creating directory in instance: %v", err) } decodedCert, err := base64.StdEncoding.DecodeString(cert) @@ -86,7 +86,7 @@ func (e *E2ESession) mountRegistryCert(cert string, endpoint string) error { } command = fmt.Sprintf("sudo cat <> /etc/docker/certs.d/%s/ca.crt\n%s\nEOF", endpoint, string(decodedCert)) - if err := ssm.Run(e.session, logr.Discard(), e.instanceId, command); err != nil { + if err := ssm.Run(e.session, logr.Discard(), e.instanceId, command, ssmTimeout); err != nil { return fmt.Errorf("mounting certificate in instance: %v", err) } diff --git a/internal/test/e2e/run.go b/internal/test/e2e/run.go index 38a3c05fe857..66c15d0fc897 100644 --- a/internal/test/e2e/run.go +++ b/internal/test/e2e/run.go @@ -7,6 +7,7 @@ import ( "strconv" "strings" "sync" + "time" "github.com/aws/aws-sdk-go/aws/session" "github.com/go-logr/logr" @@ -24,6 +25,13 @@ const ( maxIPPoolSize = 10 minIPPoolSize = 1 + + // Default timeout for E2E test instance. + e2eTimeout = 300 * time.Minute + e2eSSMTimeoutPadding = 10 * time.Minute + + // Default timeout used for all SSM commands besides running the actual E2E test. + ssmTimeout = 10 * time.Minute ) type ParallelRunConf struct { @@ -205,7 +213,7 @@ func (e *E2ESession) runTests(regex string) (testCommandResult *testCommandResul command := "GOVERSION=go1.16.6 gotestsum --junitfile=junit-testing.xml --raw-command --format=standard-verbose --hide-summary=all --ignore-non-json-output-lines -- test2json -t -p e2e ./bin/e2e.test -test.v" if regex != "" { - command = fmt.Sprintf("%s -test.run \"%s\"", command, regex) + command = fmt.Sprintf("%s -test.run \"%s\" -test.timeout %s", command, regex, e2eTimeout) } command = e.commandWithEnvVars(command) @@ -217,6 +225,7 @@ func (e *E2ESession) runTests(regex string) (testCommandResult *testCommandResul e.logger.V(4), e.instanceId, command, + e2eTimeout+e2eSSMTimeoutPadding, opt, ) if err != nil { diff --git a/internal/test/e2e/setup.go b/internal/test/e2e/setup.go index d734368d3deb..ebf2bf8adbe5 100644 --- a/internal/test/e2e/setup.go +++ b/internal/test/e2e/setup.go @@ -74,7 +74,7 @@ func (e *E2ESession) setup(regex string) error { } e.logger.V(1).Info("Waiting until SSM is ready") - err = ssm.WaitForSSMReady(e.session, e.instanceId) + err = ssm.WaitForSSMReady(e.session, e.instanceId, ssmTimeout) if err != nil { return fmt.Errorf("waiting for ssm in new instance: %v", err) } @@ -184,7 +184,7 @@ func (e *E2ESession) setup(regex string) error { func (e *E2ESession) updateFSInotifyResources() error { command := fmt.Sprintf("sudo sysctl fs.inotify.max_user_watches=%v && sudo sysctl fs.inotify.max_user_instances=%v", maxUserWatches, maxUserInstances) - if err := ssm.Run(e.session, logr.Discard(), e.instanceId, command); err != nil { + if err := ssm.Run(e.session, logr.Discard(), e.instanceId, command, ssmTimeout); err != nil { return fmt.Errorf("updating fs inotify resources: %v", err) } e.logger.V(1).Info("Successfully updated the fs inotify user watches and instances") @@ -230,7 +230,7 @@ func (e *E2ESession) downloadRequiredFileInInstance(file string) error { e.logger.V(1).Info("Downloading from s3 in instance", "file", file) command := fmt.Sprintf("aws s3 cp s3://%s/%s/%[3]s ./bin/ && chmod 645 ./bin/%[3]s", e.storageBucket, e.jobId, file) - if err := ssm.Run(e.session, logr.Discard(), e.instanceId, command); err != nil { + if err := ssm.Run(e.session, logr.Discard(), e.instanceId, command, ssmTimeout); err != nil { return fmt.Errorf("downloading file in instance: %v", err) } e.logger.V(1).Info("Successfully downloaded file") @@ -251,7 +251,7 @@ func (e *E2ESession) downloadRequiredFilesInInstance() error { func (e *E2ESession) createTestNameFile(testName string) error { command := fmt.Sprintf("echo \"%s\" > %s", testName, testNameFile) - if err := ssm.Run(e.session, logr.Discard(), e.instanceId, command); err != nil { + if err := ssm.Run(e.session, logr.Discard(), e.instanceId, command, ssmTimeout); err != nil { return fmt.Errorf("creating test name file in instance: %v", err) } e.logger.V(1).Info("Successfully created test name file") diff --git a/internal/test/e2e/tinkerbell.go b/internal/test/e2e/tinkerbell.go index aba9e38cd754..c9b499b1d510 100644 --- a/internal/test/e2e/tinkerbell.go +++ b/internal/test/e2e/tinkerbell.go @@ -83,7 +83,7 @@ func (e *E2ESession) setTinkerbellBootstrapIPInInstance(tinkInterface string) er e.logger.V(1).Info("Setting Tinkerbell Bootstrap IP in instance") command := fmt.Sprintf("export T_TINKERBELL_BOOTSTRAP_IP=$(/sbin/ip -o -4 addr list %s | awk '{print $4}' | cut -d/ -f1) && echo T_TINKERBELL_BOOTSTRAP_IP=\"$T_TINKERBELL_BOOTSTRAP_IP\" | tee -a /etc/environment", tinkInterface) - if err := ssm.Run(e.session, logr.Discard(), e.instanceId, command); err != nil { + if err := ssm.Run(e.session, logr.Discard(), e.instanceId, command, ssmTimeout); err != nil { return fmt.Errorf("setting tinkerbell boostrap ip: %v", err) }