Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add timeout to the e2e command #5642

Merged
merged 1 commit into from
Apr 14, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 22 additions & 14 deletions internal/pkg/ssm/command.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@ package ssm

import (
"fmt"
"math"
"strconv"
"time"

"github.com/aws/aws-sdk-go/aws"
Expand All @@ -17,9 +19,10 @@ const ssmLogGroup = "/eks-anywhere/test/e2e"

var initE2EDirCommand = "mkdir -p /home/e2e/bin && cd /home/e2e"

func WaitForSSMReady(session *session.Session, instanceId string) error {
// WaitForSSMReady waits for the SSM command to be ready.
func WaitForSSMReady(session *session.Session, instanceID string, timeout time.Duration) error {
err := retrier.Retry(10, 20*time.Second, func() error {
return Run(session, logr.Discard(), instanceId, "ls")
return Run(session, logr.Discard(), instanceID, "ls", timeout)
})
if err != nil {
return fmt.Errorf("waiting for ssm to be ready: %v", err)
Expand Down Expand Up @@ -53,8 +56,9 @@ var nonFinalStatuses = map[string]struct{}{
ssm.CommandInvocationStatusInProgress: {}, ssm.CommandInvocationStatusDelayed: {}, ssm.CommandInvocationStatusPending: {},
}

func Run(session *session.Session, logger logr.Logger, instanceId, command string, opts ...CommandOpt) error {
o, err := RunCommand(session, logger, instanceId, command, opts...)
// Run runs the command using SSM on the instance corresponding to the instanceID.
func Run(session *session.Session, logger logr.Logger, instanceID, command string, timeout time.Duration, opts ...CommandOpt) error {
o, err := RunCommand(session, logger, instanceID, command, timeout, opts...)
if err != nil {
return err
}
Expand All @@ -65,21 +69,22 @@ func Run(session *session.Session, logger logr.Logger, instanceId, command strin
return nil
}

func RunCommand(session *session.Session, logger logr.Logger, instanceId, command string, opts ...CommandOpt) (*RunOutput, error) {
// RunCommand runs the command using SSM on the instance corresponding to the instanceID.
func RunCommand(session *session.Session, logger logr.Logger, instanceID, command string, timeout time.Duration, opts ...CommandOpt) (*RunOutput, error) {
service := ssm.New(session)

result, err := sendCommand(service, logger, instanceId, command, opts...)
result, err := sendCommand(service, logger, instanceID, command, timeout, opts...)
if err != nil {
return nil, err
}

commandIn := &ssm.GetCommandInvocationInput{
CommandId: result.Command.CommandId,
InstanceId: aws.String(instanceId),
InstanceId: aws.String(instanceID),
}

// Make sure ssm send command is registered
logger.Info("Waiting for ssm command to be registered", "commandId", commandIn.CommandId, "instanceId", commandIn.InstanceId)
logger.Info("Waiting for ssm command to be registered", "commandId", commandIn.CommandId, "instanceID", commandIn.InstanceId)
err = retrier.Retry(10, 5*time.Second, func() error {
_, err := service.GetCommandInvocation(commandIn)
if err != nil {
Expand All @@ -94,7 +99,10 @@ func RunCommand(session *session.Session, logger logr.Logger, instanceId, comman

logger.Info("Waiting for ssm command to finish")
var commandOut *ssm.GetCommandInvocationOutput
r := retrier.New(300*time.Minute, retrier.WithMaxRetries(2160, 60*time.Second))

// Making the retrier wait for longer than the provided SSM timeout to make sure
// we always get the output results.
r := retrier.New(timeout+5*time.Minute, retrier.WithMaxRetries(math.MaxInt, 60*time.Second))
err = r.Retry(func() error {
var err error
commandOut, err = service.GetCommandInvocation(commandIn)
Expand All @@ -117,19 +125,19 @@ func RunCommand(session *session.Session, logger logr.Logger, instanceId, comman
return buildRunOutput(commandOut), nil
}

func sendCommand(service *ssm.SSM, logger logr.Logger, instanceId, command string, opts ...CommandOpt) (*ssm.SendCommandOutput, error) {
func sendCommand(service *ssm.SSM, logger logr.Logger, instanceID, command string, timeout time.Duration, opts ...CommandOpt) (*ssm.SendCommandOutput, error) {
in := &ssm.SendCommandInput{
DocumentName: aws.String("AWS-RunShellScript"),
InstanceIds: []*string{aws.String(instanceId)},
Parameters: map[string][]*string{"commands": {aws.String(initE2EDirCommand), aws.String(command)}, "executionTimeout": {aws.String("18000")}},
InstanceIds: []*string{aws.String(instanceID)},
Parameters: map[string][]*string{"commands": {aws.String(initE2EDirCommand), aws.String(command)}, "executionTimeout": {aws.String(strconv.FormatFloat(timeout.Seconds(), 'f', 0, 64))}},
}

for _, opt := range opts {
opt(in)
}

var result *ssm.SendCommandOutput
r := retrier.New(300*time.Minute, retrier.WithRetryPolicy(func(totalRetries int, err error) (retry bool, wait time.Duration) {
r := retrier.New(timeout, retrier.WithRetryPolicy(func(totalRetries int, err error) (retry bool, wait time.Duration) {
if request.IsErrorThrottle(err) && totalRetries < 60 {
return true, 60 * time.Second
}
Expand All @@ -152,7 +160,7 @@ func sendCommand(service *ssm.SSM, logger logr.Logger, instanceId, command strin
if in.OutputS3BucketName != nil {
logger.V(4).Info(
"SSM command output to S3", "url",
fmt.Sprintf("s3://%s/%s/%s/%s/awsrunShellScript/0.awsrunShellScript/stderr", *in.OutputS3BucketName, *in.OutputS3KeyPrefix, *result.Command.CommandId, instanceId),
fmt.Sprintf("s3://%s/%s/%s/%s/awsrunShellScript/0.awsrunShellScript/stderr", *in.OutputS3BucketName, *in.OutputS3KeyPrefix, *result.Command.CommandId, instanceID),
)
}

Expand Down
6 changes: 3 additions & 3 deletions internal/test/e2e/artifacts.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ func (e *E2ESession) uploadGeneratedFilesFromInstance(testName string) {
e.generatedArtifactsBucketPath(), testName,
).recursive().String()

if err := ssm.Run(e.session, logr.Discard(), e.instanceId, command); err != nil {
if err := ssm.Run(e.session, logr.Discard(), e.instanceId, command, ssmTimeout); err != nil {
e.logger.Error(err, "error uploading log files from instance")
} else {
e.logger.V(1).Info("Successfully uploaded log files to S3")
Expand All @@ -34,7 +34,7 @@ func (e *E2ESession) uploadDiagnosticArchiveFromInstance(testName string) {
e.generatedArtifactsBucketPath(), testName,
).recursive().exclude("*").include(bundleNameFormat).String()

if err := ssm.Run(e.session, logr.Discard(), e.instanceId, command); err != nil {
if err := ssm.Run(e.session, logr.Discard(), e.instanceId, command, ssmTimeout); err != nil {
e.logger.Error(err, "error uploading diagnostic bundle from instance")
} else {
e.logger.V(1).Info("Successfully uploaded diagnostic bundle files to S3")
Expand All @@ -48,7 +48,7 @@ func (e *E2ESession) uploadJUnitReportFromInstance(testName string) {
e.generatedArtifactsBucketPath(), testName,
).recursive().exclude("*").include(junitFile).String()

if err := ssm.Run(e.session, logr.Discard(), e.instanceId, command); err != nil {
if err := ssm.Run(e.session, logr.Discard(), e.instanceId, command, ssmTimeout); err != nil {
e.logger.Error(err, "error uploading JUnit report from instance")
} else {
e.logger.V(1).Info("Successfully uploaded JUnit report files to S3")
Expand Down
6 changes: 3 additions & 3 deletions internal/test/e2e/fluxGit.go
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ func (e *E2ESession) writeFileToInstance(file fileFromBytes) error {
e.logger.V(1).Info("Writing bytes to file in instance", "file", file.dstPath)

command := fmt.Sprintf("echo $'%s' >> %s && chmod %d %[2]s", file.contentString(), file.dstPath, file.permission)
if err := ssm.Run(e.session, logr.Discard(), e.instanceId, command); err != nil {
if err := ssm.Run(e.session, logr.Discard(), e.instanceId, command, ssmTimeout); err != nil {
return fmt.Errorf("writing file in instance: %v", err)
}
e.logger.V(1).Info("Successfully wrote file", "file", file.dstPath)
Expand All @@ -94,7 +94,7 @@ func (e *E2ESession) downloadFileInInstance(file s3Files) error {
e.logger.V(1).Info("Downloading from s3 in instance", "file", file.key)

command := fmt.Sprintf("aws s3 cp s3://%s/%s %s && chmod %d %[3]s", e.storageBucket, file.key, file.dstPath, file.permission)
if err := ssm.Run(e.session, logr.Discard(), e.instanceId, command); err != nil {
if err := ssm.Run(e.session, logr.Discard(), e.instanceId, command, ssmTimeout); err != nil {
return fmt.Errorf("downloading file in instance: %v", err)
}
e.logger.V(1).Info("Successfully downloaded file", "file", file.key)
Expand All @@ -105,7 +105,7 @@ func (e *E2ESession) downloadFileInInstance(file s3Files) error {
func (e *E2ESession) setUpSshAgent(privateKeyFile string) error {
command := fmt.Sprintf("eval $(ssh-agent -s) ssh-add %s", privateKeyFile)

if err := ssm.Run(e.session, logr.Discard(), e.instanceId, command); err != nil {
if err := ssm.Run(e.session, logr.Discard(), e.instanceId, command, ssmTimeout); err != nil {
return fmt.Errorf("starting SSH agent on instance: %v", err)
}
e.logger.V(1).Info("Successfully started SSH agent on instance")
Expand Down
2 changes: 1 addition & 1 deletion internal/test/e2e/oidc.go
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@ func (e *E2ESession) downloadSignerKeyInInstance(folder string) (pathInInstance
e.logger.V(1).Info("Downloading from s3 in instance", "key", saSignerKey)
command := fmt.Sprintf("aws s3 cp s3://%s/%s ./%s", e.storageBucket, saSignerKey, saSignerPath)

if err = ssm.Run(e.session, logr.Discard(), e.instanceId, command); err != nil {
if err = ssm.Run(e.session, logr.Discard(), e.instanceId, command, ssmTimeout); err != nil {
return "", fmt.Errorf("downloading signer key in instance: %v", err)
}
e.logger.V(1).Info("Successfully downloaded signer key")
Expand Down
4 changes: 2 additions & 2 deletions internal/test/e2e/registryMirror.go
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ func (e *E2ESession) setupRegistryMirrorEnv(testRegex string) error {
func (e *E2ESession) mountRegistryCert(cert string, endpoint string) error {
command := fmt.Sprintf("sudo mkdir -p /etc/docker/certs.d/%s", endpoint)

if err := ssm.Run(e.session, logr.Discard(), e.instanceId, command); err != nil {
if err := ssm.Run(e.session, logr.Discard(), e.instanceId, command, ssmTimeout); err != nil {
return fmt.Errorf("creating directory in instance: %v", err)
}
decodedCert, err := base64.StdEncoding.DecodeString(cert)
Expand All @@ -86,7 +86,7 @@ func (e *E2ESession) mountRegistryCert(cert string, endpoint string) error {
}
command = fmt.Sprintf("sudo cat <<EOF>> /etc/docker/certs.d/%s/ca.crt\n%s\nEOF", endpoint, string(decodedCert))

if err := ssm.Run(e.session, logr.Discard(), e.instanceId, command); err != nil {
if err := ssm.Run(e.session, logr.Discard(), e.instanceId, command, ssmTimeout); err != nil {
return fmt.Errorf("mounting certificate in instance: %v", err)
}

Expand Down
11 changes: 10 additions & 1 deletion internal/test/e2e/run.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import (
"strconv"
"strings"
"sync"
"time"

"github.com/aws/aws-sdk-go/aws/session"
"github.com/go-logr/logr"
Expand All @@ -24,6 +25,13 @@ const (

maxIPPoolSize = 10
minIPPoolSize = 1

// Default timeout for E2E test instance.
e2eTimeout = 300 * time.Minute
e2eSSMTimeoutPadding = 10 * time.Minute

// Default timeout used for all SSM commands besides running the actual E2E test.
ssmTimeout = 10 * time.Minute
)

type ParallelRunConf struct {
Expand Down Expand Up @@ -205,7 +213,7 @@ func (e *E2ESession) runTests(regex string) (testCommandResult *testCommandResul
command := "GOVERSION=go1.16.6 gotestsum --junitfile=junit-testing.xml --raw-command --format=standard-verbose --hide-summary=all --ignore-non-json-output-lines -- test2json -t -p e2e ./bin/e2e.test -test.v"

if regex != "" {
command = fmt.Sprintf("%s -test.run \"%s\"", command, regex)
command = fmt.Sprintf("%s -test.run \"%s\" -test.timeout %s", command, regex, e2eTimeout)
}

command = e.commandWithEnvVars(command)
Expand All @@ -217,6 +225,7 @@ func (e *E2ESession) runTests(regex string) (testCommandResult *testCommandResul
e.logger.V(4),
e.instanceId,
command,
e2eTimeout+e2eSSMTimeoutPadding,
opt,
)
if err != nil {
Expand Down
8 changes: 4 additions & 4 deletions internal/test/e2e/setup.go
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ func (e *E2ESession) setup(regex string) error {
}

e.logger.V(1).Info("Waiting until SSM is ready")
err = ssm.WaitForSSMReady(e.session, e.instanceId)
err = ssm.WaitForSSMReady(e.session, e.instanceId, ssmTimeout)
if err != nil {
return fmt.Errorf("waiting for ssm in new instance: %v", err)
}
Expand Down Expand Up @@ -184,7 +184,7 @@ func (e *E2ESession) setup(regex string) error {
func (e *E2ESession) updateFSInotifyResources() error {
command := fmt.Sprintf("sudo sysctl fs.inotify.max_user_watches=%v && sudo sysctl fs.inotify.max_user_instances=%v", maxUserWatches, maxUserInstances)

if err := ssm.Run(e.session, logr.Discard(), e.instanceId, command); err != nil {
if err := ssm.Run(e.session, logr.Discard(), e.instanceId, command, ssmTimeout); err != nil {
return fmt.Errorf("updating fs inotify resources: %v", err)
}
e.logger.V(1).Info("Successfully updated the fs inotify user watches and instances")
Expand Down Expand Up @@ -230,7 +230,7 @@ func (e *E2ESession) downloadRequiredFileInInstance(file string) error {
e.logger.V(1).Info("Downloading from s3 in instance", "file", file)

command := fmt.Sprintf("aws s3 cp s3://%s/%s/%[3]s ./bin/ && chmod 645 ./bin/%[3]s", e.storageBucket, e.jobId, file)
if err := ssm.Run(e.session, logr.Discard(), e.instanceId, command); err != nil {
if err := ssm.Run(e.session, logr.Discard(), e.instanceId, command, ssmTimeout); err != nil {
return fmt.Errorf("downloading file in instance: %v", err)
}
e.logger.V(1).Info("Successfully downloaded file")
Expand All @@ -251,7 +251,7 @@ func (e *E2ESession) downloadRequiredFilesInInstance() error {
func (e *E2ESession) createTestNameFile(testName string) error {
command := fmt.Sprintf("echo \"%s\" > %s", testName, testNameFile)

if err := ssm.Run(e.session, logr.Discard(), e.instanceId, command); err != nil {
if err := ssm.Run(e.session, logr.Discard(), e.instanceId, command, ssmTimeout); err != nil {
return fmt.Errorf("creating test name file in instance: %v", err)
}
e.logger.V(1).Info("Successfully created test name file")
Expand Down
2 changes: 1 addition & 1 deletion internal/test/e2e/tinkerbell.go
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ func (e *E2ESession) setTinkerbellBootstrapIPInInstance(tinkInterface string) er
e.logger.V(1).Info("Setting Tinkerbell Bootstrap IP in instance")

command := fmt.Sprintf("export T_TINKERBELL_BOOTSTRAP_IP=$(/sbin/ip -o -4 addr list %s | awk '{print $4}' | cut -d/ -f1) && echo T_TINKERBELL_BOOTSTRAP_IP=\"$T_TINKERBELL_BOOTSTRAP_IP\" | tee -a /etc/environment", tinkInterface)
if err := ssm.Run(e.session, logr.Discard(), e.instanceId, command); err != nil {
if err := ssm.Run(e.session, logr.Discard(), e.instanceId, command, ssmTimeout); err != nil {
return fmt.Errorf("setting tinkerbell boostrap ip: %v", err)
}

Expand Down