Skip to content

Commit

Permalink
Make retryable errors configurable (#1232)
Browse files Browse the repository at this point in the history
This change adds the retryable_errors configuration attribute
which enables users to specify their own custom list of
retryable errors
  • Loading branch information
infraredgirl authored Oct 9, 2020
1 parent f8a3361 commit 7f0920a
Show file tree
Hide file tree
Showing 16 changed files with 208 additions and 15 deletions.
5 changes: 5 additions & 0 deletions cli/cli_app.go
Original file line number Diff line number Diff line change
Expand Up @@ -326,6 +326,11 @@ func RunTerragrunt(terragruntOptions *options.TerragruntOptions) error {
terragruntOptions.DownloadDir = terragruntConfig.DownloadDir
}

// Override the default value of retryable errors using the value set in the config file
if terragruntConfig.RetryableErrors != nil {
terragruntOptions.RetryableErrors = terragruntConfig.RetryableErrors
}

if sourceUrl := getTerraformSourceUrl(terragruntOptions, terragruntConfig); sourceUrl != "" {
if err := downloadTerraformSource(sourceUrl, terragruntOptions, terragruntConfig); err != nil {
return err
Expand Down
10 changes: 10 additions & 0 deletions config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ type TerragruntConfig struct {
Locals map[string]interface{}
TerragruntDependencies []Dependency
GenerateConfigs map[string]codegen.GenerateConfig
RetryableErrors []string

// Indicates whether or not this is the result of a partial evaluation
IsPartial bool
Expand All @@ -65,6 +66,7 @@ type terragruntConfigFile struct {
IamRole *string `hcl:"iam_role,attr"`
TerragruntDependencies []Dependency `hcl:"dependency,block"`
GenerateBlocks []terragruntGenerateBlock `hcl:"generate,block"`
RetryableErrors []string `hcl:"retryable_errors,optional"`

// This struct is used for validating and parsing the entire terragrunt config. Since locals are evaluated in a
// completely separate cycle, it should not be evaluated here. Otherwise, we can't support self referencing other
Expand Down Expand Up @@ -568,6 +570,10 @@ func mergeConfigWithIncludedConfig(config *TerragruntConfig, includedConfig *Ter
includedConfig.TerraformBinary = config.TerraformBinary
}

if config.RetryableErrors != nil {
includedConfig.RetryableErrors = config.RetryableErrors
}

if config.TerragruntVersionConstraint != "" {
includedConfig.TerragruntVersionConstraint = config.TerragruntVersionConstraint
}
Expand Down Expand Up @@ -731,6 +737,10 @@ func convertToTerragruntConfig(
terragruntConfig.TerraformBinary = *terragruntConfigFromFile.TerraformBinary
}

if terragruntConfigFromFile.RetryableErrors != nil {
terragruntConfig.RetryableErrors = terragruntConfigFromFile.RetryableErrors
}

if terragruntConfigFromFile.DownloadDir != nil {
terragruntConfig.DownloadDir = *terragruntConfigFromFile.DownloadDir
}
Expand Down
8 changes: 8 additions & 0 deletions config/config_as_cty.go
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,14 @@ func terragruntConfigAsCty(config *TerragruntConfig) (cty.Value, error) {
output["generate"] = generateCty
}

retryableCty, err := gostructToCty(config.RetryableErrors)
if err != nil {
return cty.NilVal, err
}
if retryableCty != cty.NilVal {
output["retryable_errors"] = retryableCty
}

inputsCty, err := convertToCtyWithJson(config.Inputs)
if err != nil {
return cty.NilVal, err
Expand Down
2 changes: 2 additions & 0 deletions config/config_as_cty_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,8 @@ func terragruntConfigStructFieldToMapKey(t *testing.T, fieldName string) (string
return "generate", true
case "IsPartial":
return "", false
case "RetryableErrors":
return "retryable_errors", true
default:
t.Fatalf("Unknown struct property: %s", fieldName)
// This should not execute
Expand Down
56 changes: 50 additions & 6 deletions config/config_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ func TestParseTerragruntJsonConfigRemoteStateMinimalConfig(t *testing.T) {
require.NoError(t, err)

assert.Nil(t, terragruntConfig.Terraform)

assert.Nil(t, terragruntConfig.RetryableErrors)
assert.Empty(t, terragruntConfig.IamRole)

if assert.NotNil(t, terragruntConfig.RemoteState) {
Expand Down Expand Up @@ -109,7 +109,7 @@ remote_state {
}

assert.Nil(t, terragruntConfig.Terraform)

assert.Nil(t, terragruntConfig.RetryableErrors)
assert.Empty(t, terragruntConfig.IamRole)

if assert.NotNil(t, terragruntConfig.RemoteState) {
Expand Down Expand Up @@ -145,7 +145,7 @@ func TestParseTerragruntJsonConfigRemoteStateFullConfig(t *testing.T) {
}

assert.Nil(t, terragruntConfig.Terraform)

assert.Nil(t, terragruntConfig.RetryableErrors)
assert.Empty(t, terragruntConfig.IamRole)

if assert.NotNil(t, terragruntConfig.RemoteState) {
Expand All @@ -158,6 +158,48 @@ func TestParseTerragruntJsonConfigRemoteStateFullConfig(t *testing.T) {
}
}

// TestParseTerragruntHclConfigRetryableErrors verifies that a retryable_errors
// list written in HCL is parsed into TerragruntConfig.RetryableErrors, while
// the Terraform block and IAM role stay unset.
func TestParseTerragruntHclConfigRetryableErrors(t *testing.T) {
	t.Parallel()

	const hclConfig = `
retryable_errors = [
"My own little error",
"Another one of my errors"
]
`
	wantErrors := []string{"My own little error", "Another one of my errors"}

	parsed, err := ParseConfigString(hclConfig, mockOptionsForTest(t), nil, DefaultTerragruntConfigPath)
	require.NoError(t, err)

	assert.Nil(t, parsed.Terraform)
	assert.Empty(t, parsed.IamRole)

	// Only compare the contents once the list is known to be present.
	if !assert.NotNil(t, parsed.RetryableErrors) {
		return
	}
	assert.Equal(t, wantErrors, parsed.RetryableErrors)
}

// TestParseTerragruntJsonConfigRetryableErrors verifies that a retryable_errors
// list written in the JSON config format is parsed into
// TerragruntConfig.RetryableErrors, while the Terraform block and IAM role
// stay unset.
func TestParseTerragruntJsonConfigRetryableErrors(t *testing.T) {
	t.Parallel()

	const jsonConfig = `
{
"retryable_errors": [
"My own little error"
]
}
`
	wantErrors := []string{"My own little error"}

	parsed, err := ParseConfigString(jsonConfig, mockOptionsForTest(t), nil, DefaultTerragruntJsonConfigPath)
	require.NoError(t, err)

	assert.Nil(t, parsed.Terraform)
	assert.Empty(t, parsed.IamRole)

	// Only compare the contents once the list is known to be present.
	if !assert.NotNil(t, parsed.RetryableErrors) {
		return
	}
	assert.Equal(t, wantErrors, parsed.RetryableErrors)
}

func TestParseIamRole(t *testing.T) {
t.Parallel()

Expand All @@ -171,6 +213,7 @@ func TestParseIamRole(t *testing.T) {
assert.Nil(t, terragruntConfig.RemoteState)
assert.Nil(t, terragruntConfig.Terraform)
assert.Nil(t, terragruntConfig.Dependencies)
assert.Nil(t, terragruntConfig.RetryableErrors)

assert.Equal(t, "terragrunt-iam-role", terragruntConfig.IamRole)
}
Expand All @@ -191,6 +234,7 @@ dependencies {

assert.Nil(t, terragruntConfig.RemoteState)
assert.Nil(t, terragruntConfig.Terraform)
assert.Nil(t, terragruntConfig.RetryableErrors)

assert.Empty(t, terragruntConfig.IamRole)

Expand All @@ -215,7 +259,7 @@ dependencies {

assert.Nil(t, terragruntConfig.RemoteState)
assert.Nil(t, terragruntConfig.Terraform)

assert.Nil(t, terragruntConfig.RetryableErrors)
assert.Empty(t, terragruntConfig.IamRole)

if assert.NotNil(t, terragruntConfig.Dependencies) {
Expand Down Expand Up @@ -254,7 +298,7 @@ dependencies {
require.NotNil(t, terragruntConfig.Terraform)
require.NotNil(t, terragruntConfig.Terraform.Source)
assert.Equal(t, "foo", *terragruntConfig.Terraform.Source)

assert.Nil(t, terragruntConfig.RetryableErrors)
assert.Empty(t, terragruntConfig.IamRole)

if assert.NotNil(t, terragruntConfig.RemoteState) {
Expand Down Expand Up @@ -302,7 +346,7 @@ func TestParseTerragruntJsonConfigRemoteStateDynamoDbTerraformConfigAndDependenc
require.NotNil(t, terragruntConfig.Terraform)
require.NotNil(t, terragruntConfig.Terraform.Source)
assert.Equal(t, "foo", *terragruntConfig.Terraform.Source)

assert.Nil(t, terragruntConfig.RetryableErrors)
assert.Empty(t, terragruntConfig.IamRole)

if assert.NotNil(t, terragruntConfig.RemoteState) {
Expand Down
32 changes: 25 additions & 7 deletions docs/_docs/02_features/auto-retry.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,16 +18,34 @@ Terraform can fail with transient errors which can be addressed by simply retryi

**Example**

$ terragrunt apply
...
Initializing provider plugins...
- Checking for available provider plugins on https://releases.hashicorp.com...
Error installing provider "template": error fetching checksums: Get https://releases.hashicorp.com/terraform-provider-template/1.0.0/terraform-provider-template_1.0.0_SHA256SUMS: net/http: TLS handshake timeout.
```
$ terragrunt apply
...
Initializing provider plugins...
- Checking for available provider plugins on https://releases.hashicorp.com...
Error installing provider "template": error fetching checksums: Get https://releases.hashicorp.com/terraform-provider-template/1.0.0/terraform-provider-template_1.0.0_SHA256SUMS: net/http: TLS handshake timeout.
```

Terragrunt sees this error, and knows it is a transient error that can addressed by re-running the `apply` command.
Terragrunt sees this error, and knows it is a transient error that can be addressed by re-running the `apply` command.

`auto-retry` will try a maximum of three times to re-run the command, at which point it will deem the error as not transient, and accept the terraform failure. Retries will occur when the error is encountered, pausing for 5 seconds between retries.

Known errors that `auto-retry` will rerun, are maintained in the `TerragruntOptions.RetryableErrors` array. Future upgrades to `terragrunt` may include the ability to configure `auto-retry` by specifying additional error strings and configuring max retries and retry intervals the `terragrunt` config (PRs welcome\!).
Terragrunt has a small list of default known errors built-in. You can override these defaults with your own custom retryable errors in your `terragrunt.hcl` configuration:
```hcl
retryable_errors = [
"a regex to match the error",
"another regex"
]
```

For example:
```hcl
retryable_errors = [
"(?s).*Error installing provider.*tcp.*connection reset by peer.*",
"(?s).*ssh_exchange_identification.*Connection closed by remote host.*"
]
```

Future upgrades to `terragrunt` may include the ability to configure max retries and retry intervals in the `terragrunt` config (PRs welcome\!).

To disable `auto-retry`, use the `--terragrunt-no-auto-retry` command line option or set the `TERRAGRUNT_AUTO_RETRY` environment variable to `false`.
15 changes: 15 additions & 0 deletions docs/_docs/04_reference/config-blocks-and-attributes.md
Original file line number Diff line number Diff line change
Expand Up @@ -538,6 +538,7 @@ EOF
- [terraform_binary](#terraform_binary)
- [terraform_version_constraint](#terraform_version_constraint)
- [terragrunt_version_constraint](#terragrunt_version_constraint)
- [retryable_errors](#retryable_errors)


### inputs
Expand Down Expand Up @@ -695,3 +696,17 @@ Example:
```hcl
terragrunt_version_constraint = ">= 0.23"
```

### retryable_errors

The terragrunt `retryable_errors` list can be used to override the default list of retryable errors with your own custom list.
To learn more about the `retryable_errors` attribute, see the [auto-retry feature overview](/docs/features/auto-retry).

Example:

```hcl
retryable_errors = [
"(?s).*Error installing provider.*tcp.*connection reset by peer.*",
"(?s).*ssh_exchange_identification.*Connection closed by remote host.*"
]
```
3 changes: 2 additions & 1 deletion options/auto_retry_options.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ const DEFAULT_SLEEP = 5 * time.Second

// List of recurring transient errors encountered when calling terraform
// If any of these match, we'll retry the command
var RETRYABLE_ERRORS = []string{
var DEFAULT_RETRYABLE_ERRORS = []string{
"(?s).*Failed to load state.*tcp.*timeout.*",
"(?s).*Failed to load backend.*TLS handshake timeout.*",
"(?s).*Creating metric alarm failed.*request to update this alarm is in progress.*",
Expand All @@ -18,4 +18,5 @@ var RETRYABLE_ERRORS = []string{
"NoSuchBucket: The specified bucket does not exist",
"(?s).*Error creating SSM parameter: TooManyUpdates:.*",
"(?s).*app.terraform.io.*: 429 Too Many Requests.*",
"(?s).*ssh_exchange_identification.*Connection closed by remote host.*",
}
2 changes: 1 addition & 1 deletion options/options.go
Original file line number Diff line number Diff line change
Expand Up @@ -180,7 +180,7 @@ func NewTerragruntOptions(terragruntConfigPath string) (*TerragruntOptions, erro
AutoRetry: true,
MaxRetryAttempts: DEFAULT_MAX_RETRY_ATTEMPTS,
Sleep: DEFAULT_SLEEP,
RetryableErrors: util.CloneStringList(RETRYABLE_ERRORS),
RetryableErrors: util.CloneStringList(DEFAULT_RETRYABLE_ERRORS),
ExcludeDirs: []string{},
IncludeDirs: []string{},
StrictInclude: false,
Expand Down
16 changes: 16 additions & 0 deletions test/fixture-auto-retry/custom-errors-not-set/main.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
// Random hex id passed to script.sh, which uses it as the name of a marker
// file under /tmp.
resource "random_id" "filename" {
byte_length = 8
}

// Runs script.sh on every apply: the timestamp() trigger changes each run,
// so the null_resource is always recreated.
resource "null_resource" "tf_retryable_error" {
triggers = {
always_recreate = timestamp()
}

provisioner "local-exec" {
// The command fails with the custom error "My own little error" the first time it's run.
// This fixture's terragrunt.hcl sets no retryable_errors, so the integration test expects
// terragrunt NOT to retry and the apply to fail.
command = "${path.module}/script.sh ${random_id.filename.hex}"
interpreter = ["/bin/bash", "-c"]
}
}
13 changes: 13 additions & 0 deletions test/fixture-auto-retry/custom-errors-not-set/script.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#!/bin/bash
#
# Simulates a transient failure keyed on the first argument:
#   - first call for a given argument: creates the marker file /tmp/<arg>,
#     prints "My own little error" and exits 1;
#   - next call with the same argument: prints "Success", removes the marker
#     file and exits 0.

MARKER="/tmp/$1"

# No marker yet: this is the first attempt, so record it and fail.
if ! test -f "$MARKER"; then
  touch "$MARKER"
  echo "My own little error"
  exit 1
fi

# Marker present: a previous attempt already failed, so succeed and clean up.
echo "Success"
rm "$MARKER"
exit 0
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# No retryable_errors set
16 changes: 16 additions & 0 deletions test/fixture-auto-retry/custom-errors/main.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
// Random hex id passed to script.sh, which uses it as the name of a marker
// file under /tmp.
resource "random_id" "filename" {
byte_length = 8
}

// Runs script.sh on every apply: the timestamp() trigger changes each run,
// so the null_resource is always recreated.
resource "null_resource" "tf_retryable_error" {
triggers = {
always_recreate = timestamp()
}

provisioner "local-exec" {
// The command will fail with a custom retryable error ("My own little error") that matches the
// retryable_errors list in this fixture's terragrunt.hcl the first time it's run,
// and succeed on the 2nd run
command = "${path.module}/script.sh ${random_id.filename.hex}"
interpreter = ["/bin/bash", "-c"]
}
}
13 changes: 13 additions & 0 deletions test/fixture-auto-retry/custom-errors/script.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#!/bin/bash
#
# Simulates a transient failure keyed on the first argument:
#   - first call for a given argument: creates the marker file /tmp/<arg>,
#     prints "My own little error" and exits 1;
#   - next call with the same argument: prints "Success", removes the marker
#     file and exits 0.

MARKER="/tmp/$1"

# No marker yet: this is the first attempt, so record it and fail.
if ! test -f "$MARKER"; then
  touch "$MARKER"
  echo "My own little error"
  exit 1
fi

# Marker present: a previous attempt already failed, so succeed and clean up.
echo "Success"
rm "$MARKER"
exit 0
3 changes: 3 additions & 0 deletions test/fixture-auto-retry/custom-errors/terragrunt.hcl
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Custom list of retryable errors for this fixture. The single entry matches
# the message that script.sh prints on its first (failing) run.
retryable_errors = [
"My own little error"
]
28 changes: 28 additions & 0 deletions test/integration_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,8 @@ const (
TEST_FIXTURE_EXIT_CODE = "fixture-exit-code"
TEST_FIXTURE_AUTO_RETRY_RERUN = "fixture-auto-retry/re-run"
TEST_FIXTURE_AUTO_RETRY_EXHAUST = "fixture-auto-retry/exhaust"
TEST_FIXTURE_AUTO_RETRY_CUSTOM_ERRORS = "fixture-auto-retry/custom-errors"
TEST_FIXTURE_AUTO_RETRY_CUSTOM_ERRORS_NOT_SET = "fixture-auto-retry/custom-errors-not-set"
TEST_FIXTURE_AUTO_RETRY_APPLY_ALL_RETRIES = "fixture-auto-retry/apply-all"
TEST_FIXTURE_AWS_PROVIDER_PATCH = "fixture-aws-provider-patch"
TEST_FIXTURE_INPUTS = "fixture-inputs"
Expand Down Expand Up @@ -1198,6 +1200,32 @@ func TestAutoRetryExhaustRetries(t *testing.T) {
assert.NotContains(t, out.String(), "Apply complete!")
}

// TestAutoRetryCustomRetryableErrors applies the custom-errors fixture and
// expects the apply to end successfully even though the custom error message
// shows up in the output.
func TestAutoRetryCustomRetryableErrors(t *testing.T) {
	t.Parallel()

	var stdout bytes.Buffer

	rootPath := copyEnvironment(t, TEST_FIXTURE_AUTO_RETRY_CUSTOM_ERRORS)
	modulePath := util.JoinPath(rootPath, TEST_FIXTURE_AUTO_RETRY_CUSTOM_ERRORS)
	command := fmt.Sprintf("terragrunt apply --auto-approve --terragrunt-non-interactive --terragrunt-working-dir %s", modulePath)

	err := runTerragruntCommand(t, command, &stdout, os.Stderr)
	applyOutput := stdout.String()

	assert.Nil(t, err)
	// The error must have been emitted, and the apply must still have completed.
	assert.Contains(t, applyOutput, "My own little error")
	assert.Contains(t, applyOutput, "Apply complete!")
}

// TestAutoRetryCustomRetryableErrorsFailsWhenRetryableErrorsNotSet applies the
// custom-errors-not-set fixture and expects the apply to fail: the custom
// error message appears in the output and the apply never completes.
func TestAutoRetryCustomRetryableErrorsFailsWhenRetryableErrorsNotSet(t *testing.T) {
	t.Parallel()

	var stdout bytes.Buffer

	rootPath := copyEnvironment(t, TEST_FIXTURE_AUTO_RETRY_CUSTOM_ERRORS_NOT_SET)
	modulePath := util.JoinPath(rootPath, TEST_FIXTURE_AUTO_RETRY_CUSTOM_ERRORS_NOT_SET)
	command := fmt.Sprintf("terragrunt apply --auto-approve --terragrunt-non-interactive --terragrunt-working-dir %s", modulePath)

	err := runTerragruntCommand(t, command, &stdout, os.Stderr)
	applyOutput := stdout.String()

	assert.NotNil(t, err)
	// The error must have been emitted, and the apply must not have completed.
	assert.Contains(t, applyOutput, "My own little error")
	assert.NotContains(t, applyOutput, "Apply complete!")
}

func TestAutoRetryFlagWithRecoverableError(t *testing.T) {
t.Parallel()

Expand Down

0 comments on commit 7f0920a

Please sign in to comment.