From 463d3d7aa0c8d14b345d10094941f47471653fb6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alejandro=20L=C3=B3pez?= Date: Tue, 9 Aug 2022 15:14:36 +0200 Subject: [PATCH 1/3] feat(aws-asg): add `retry_attempts` configuration to customize the maximum waiting time The maximum waiting time for checking the status of the ASG is 2 minutes, which is not configurable. When using the ASG target plugin and downscaling, the ELB connection draining could take more than 2 minutes. If this happens, Nomad will purge the nodes without waiting for the ASG to remove the instances. Logs: ``` 2022-07-28T15:48:51.939Z [ERROR] internal_plugin.aws-asg: failed to ensure all activities completed: action=scale_in asg_name=preproduction/nomad error="reached retry limit" 2022-07-28T15:48:52.123Z [INFO] internal_plugin.aws-asg: successfully purged Nomad node: node_id=34862212-f89a-b107-903d-f4036a44543d nomad_evals=["88a3c562-31e6-f303-cae3-1289bd924958", "b0b08d76-0c2c-e5f3-1b8d-d6f032cd89aa"] ``` So, this commit adds a new config property called `retry_attempts`, which is the maximum number of attempts the plugin will make to ensure that the ASG has completed all its actions. Apart from that, if this PR is accepted, I will update the [documentation](https://www.nomadproject.io/tools/autoscaling/plugins/target/aws-asg#policy-configuration-options) accordingly. Resolves #518 --- plugins/builtin/target/aws-asg/plugin/aws.go | 25 ++++++++++++++++--- .../builtin/target/aws-asg/plugin/plugin.go | 4 ++- 2 files changed, 25 insertions(+), 4 deletions(-) diff --git a/plugins/builtin/target/aws-asg/plugin/aws.go b/plugins/builtin/target/aws-asg/plugin/aws.go index 1f2a7b28..1c7b6b09 100644 --- a/plugins/builtin/target/aws-asg/plugin/aws.go +++ b/plugins/builtin/target/aws-asg/plugin/aws.go @@ -4,6 +4,7 @@ import ( "context" "errors" "fmt" + "strconv" "time" "github.com/aws/aws-sdk-go-v2/aws" @@ -18,10 +19,18 @@ import ( const ( defaultRetryInterval = 10 * time.Second - defaultRetryLimit = 15 nodeAttrAWSInstanceID = "unique.platform.aws.instance-id" ) +func getConfigValue(config map[string]string, key string, defaultValue string) string { + value, ok := config[key] + if !ok { + return defaultValue + } + + return value +} + // setupAWSClients takes the passed config mapping and instantiates the // required AWS service clients. func (t *TargetPlugin) setupAWSClients(config map[string]string) error { @@ -300,7 +309,12 @@ func (t *TargetPlugin) ensureActivitiesComplete(ctx context.Context, asg string, return false, fmt.Errorf("waiting for %v activities to finish", len(ids)) } - return retry(ctx, defaultRetryInterval, defaultRetryLimit, f) + retryLimit, err := strconv.Atoi(getConfigValue(t.config, configKeyRetryAttempts, configValueRetryAttemptsDefault)) + if err != nil { + return err + } + + return retry(ctx, defaultRetryInterval, retryLimit, f) } func (t *TargetPlugin) ensureASGInstancesCount(ctx context.Context, desired int64, asgName string) error { @@ -317,7 +331,12 @@ func (t *TargetPlugin) ensureASGInstancesCount(ctx context.Context, desired int6 return false, fmt.Errorf("AutoScaling Group at %v instances of desired %v", asg.Instances, desired) } - return retry(ctx, defaultRetryInterval, defaultRetryLimit, f) + retryLimit, err := strconv.Atoi(getConfigValue(t.config, configKeyRetryAttempts, configValueRetryAttemptsDefault)) + if err != nil { + return err + } + + return retry(ctx, defaultRetryInterval, retryLimit, f) } // awsNodeIDMap is used to identify the AWS InstanceID of a Nomad node using diff --git a/plugins/builtin/target/aws-asg/plugin/plugin.go b/plugins/builtin/target/aws-asg/plugin/plugin.go index 3e9b741d..6e259f75 100644 --- a/plugins/builtin/target/aws-asg/plugin/plugin.go +++ b/plugins/builtin/target/aws-asg/plugin/plugin.go @@ -28,10 +28,12 @@ const ( configKeySessionToken = "aws_session_token" configKeyASGName = "aws_asg_name" configKeyCredentialProvider = "aws_credential_provider" + configKeyRetryAttempts = "retry_attempts" // configValues are the default values used when a configuration key is not // supplied by the operator that are specific to the plugin. - configValueRegionDefault = "us-east-1" + configValueRegionDefault = "us-east-1" + configValueRetryAttemptsDefault = "15" // credentialProvider are the valid options for the aws_credential_provider // configuration key. From 6817c5b2ea8ae9648df5ad95a4331ee8c20ca61d Mon Sep 17 00:00:00 2001 From: Luiz Aoqui Date: Thu, 1 Dec 2022 18:01:55 -0500 Subject: [PATCH 2/3] aws: store retry attempts value in the plugin By reading and parsing the value of `retry_attempts` earlier in `SetConfig` we can detect invalid configuration values (such as non-interger strings) earlier. --- plugins/builtin/target/aws-asg/plugin/aws.go | 24 ++----------------- .../builtin/target/aws-asg/plugin/plugin.go | 19 +++++++++++++++ 2 files changed, 21 insertions(+), 22 deletions(-) diff --git a/plugins/builtin/target/aws-asg/plugin/aws.go b/plugins/builtin/target/aws-asg/plugin/aws.go index 1c7b6b09..6ba75078 100644 --- a/plugins/builtin/target/aws-asg/plugin/aws.go +++ b/plugins/builtin/target/aws-asg/plugin/aws.go @@ -4,7 +4,6 @@ import ( "context" "errors" "fmt" - "strconv" "time" "github.com/aws/aws-sdk-go-v2/aws" @@ -22,15 +21,6 @@ const ( nodeAttrAWSInstanceID = "unique.platform.aws.instance-id" ) -func getConfigValue(config map[string]string, key string, defaultValue string) string { - value, ok := config[key] - if !ok { - return defaultValue - } - - return value -} - // setupAWSClients takes the passed config mapping and instantiates the // required AWS service clients. func (t *TargetPlugin) setupAWSClients(config map[string]string) error { @@ -309,12 +299,7 @@ func (t *TargetPlugin) ensureActivitiesComplete(ctx context.Context, asg string, return false, fmt.Errorf("waiting for %v activities to finish", len(ids)) } - retryLimit, err := strconv.Atoi(getConfigValue(t.config, configKeyRetryAttempts, configValueRetryAttemptsDefault)) - if err != nil { - return err - } - - return retry(ctx, defaultRetryInterval, retryLimit, f) + return retry(ctx, defaultRetryInterval, t.retryAttempts, f) } func (t *TargetPlugin) ensureASGInstancesCount(ctx context.Context, desired int64, asgName string) error { @@ -331,12 +316,7 @@ func (t *TargetPlugin) ensureASGInstancesCount(ctx context.Context, desired int6 return false, fmt.Errorf("AutoScaling Group at %v instances of desired %v", asg.Instances, desired) } - retryLimit, err := strconv.Atoi(getConfigValue(t.config, configKeyRetryAttempts, configValueRetryAttemptsDefault)) - if err != nil { - return err - } - - return retry(ctx, defaultRetryInterval, retryLimit, f) + return retry(ctx, defaultRetryInterval, t.retryAttempts, f) } // awsNodeIDMap is used to identify the AWS InstanceID of a Nomad node using diff --git a/plugins/builtin/target/aws-asg/plugin/plugin.go b/plugins/builtin/target/aws-asg/plugin/plugin.go index 6e259f75..f372ecf8 100644 --- a/plugins/builtin/target/aws-asg/plugin/plugin.go +++ b/plugins/builtin/target/aws-asg/plugin/plugin.go @@ -60,6 +60,10 @@ type TargetPlugin struct { logger hclog.Logger asg *autoscaling.Client + // retryAttempts is the number of times operations such as wating for a + // given ASG state should be retried. + retryAttempts int + // clusterUtils provides general cluster scaling utilities for querying the // state of nodes pools and performing scaling tasks. clusterUtils *scaleutils.ClusterScaleUtils @@ -91,6 +95,12 @@ func (t *TargetPlugin) SetConfig(config map[string]string) error { t.clusterUtils = clusterUtils t.clusterUtils.ClusterNodeIDLookupFunc = awsNodeIDMap + retryLimit, err := strconv.Atoi(getConfigValue(config, configKeyRetryAttempts, configValueRetryAttemptsDefault)) + if err != nil { + return err + } + t.retryAttempts = retryLimit + return nil } @@ -222,3 +232,12 @@ func processLastActivity(activity types.Activity, status *sdk.TargetStatus) { status.Meta[sdk.TargetStatusMetaKeyLastEvent] = strconv.FormatInt(activity.EndTime.UnixNano(), 10) } } + +func getConfigValue(config map[string]string, key string, defaultValue string) string { + value, ok := config[key] + if !ok { + return defaultValue + } + + return value +} From f203f6e78f1d1709a85e0d34375f4d6e3e97508e Mon Sep 17 00:00:00 2001 From: Luiz Aoqui Date: Thu, 1 Dec 2022 18:05:44 -0500 Subject: [PATCH 3/3] changelog: add entry for #594 --- CHANGELOG.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index f4b6e0ca..471bb3ad 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,8 @@ ## UNRELEASED +IMPROVEMENTS: + * plugin/target/aws: Add new configuration `retry_attempts` to account for potentially slow ASG update operations [[GH-594](https://github.com/hashicorp/nomad-autoscaler/pull/594)] + ## 0.3.7 (June 10, 2022) IMPROVEMENTS: