Skip to content

Commit

Permalink
feat: add configurable backoff and retries for Zarf operations
Browse files — browse the repository at this point in the history
Signed-off-by: Eddie Zaneski <eddiezane@defenseunicorns.com>
Co-authored-by: Wayne Starr <racer159@live.com>
  • Loading branch information
eddiezane and Racer159 committed Mar 2, 2024
1 parent 4d8b833 commit 07b5228
Show file tree
Hide file tree
Showing 13 changed files with 114 additions and 86 deletions.
4 changes: 3 additions & 1 deletion src/cmd/common/viper.go
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,7 @@ const (
VPkgDeploySget = "package.deploy.sget"
VPkgDeploySkipWebhooks = "package.deploy.skip_webhooks"
VPkgDeployTimeout = "package.deploy.timeout"
VPkgRetries = "package.deploy.retries"

// Package publish config keys

Expand Down Expand Up @@ -184,7 +185,8 @@ func setDefaults() {

// Package defaults that are non-zero values
v.SetDefault(VPkgOCIConcurrency, 3)
v.SetDefault(VPkgRetries, config.ZarfDefaultRetries)

// Deploy opts that are non-zero values
v.SetDefault(VPkgDeployTimeout, config.ZarfDefaultHelmTimeout)
v.SetDefault(VPkgDeployTimeout, config.ZarfDefaultTimeout)
}
6 changes: 6 additions & 0 deletions src/cmd/dev.go
Original file line number Diff line number Diff line change
Expand Up @@ -282,6 +282,12 @@ func bindDevDeployFlags(v *viper.Viper) {

devDeployFlags.StringToStringVar(&pkgConfig.PkgOpts.SetVariables, "deploy-set", v.GetStringMapString(common.VPkgDeploySet), lang.CmdPackageDeployFlagSet)

// Always require adopt-existing-resources flag (no viper)
devDeployFlags.BoolVar(&pkgConfig.DeployOpts.AdoptExistingResources, "adopt-existing-resources", false, lang.CmdPackageDeployFlagAdoptExistingResources)
devDeployFlags.BoolVar(&pkgConfig.DeployOpts.SkipWebhooks, "skip-webhooks", v.GetBool(common.VPkgDeploySkipWebhooks), lang.CmdPackageDeployFlagSkipWebhooks)
devDeployFlags.DurationVar(&pkgConfig.DeployOpts.Timeout, "timeout", v.GetDuration(common.VPkgDeployTimeout), lang.CmdPackageDeployFlagTimeout)

devDeployFlags.IntVar(&pkgConfig.PkgOpts.Retries, "retries", v.GetInt(common.VPkgRetries), lang.CmdPackageFlagRetries)
devDeployFlags.StringVar(&pkgConfig.PkgOpts.OptionalComponents, "components", v.GetString(common.VPkgDeployComponents), lang.CmdPackageDeployFlagComponents)

devDeployFlags.BoolVar(&pkgConfig.CreateOpts.NoYOLO, "no-yolo", v.GetBool(common.VDevDeployNoYolo), lang.CmdDevDeployFlagNoYolo)
Expand Down
3 changes: 1 addition & 2 deletions src/cmd/initialize.go
Original file line number Diff line number Diff line change
Expand Up @@ -215,11 +215,10 @@ func init() {
// Flags that control how a deployment proceeds
// Always require adopt-existing-resources flag (no viper)
initCmd.Flags().BoolVar(&pkgConfig.DeployOpts.AdoptExistingResources, "adopt-existing-resources", false, lang.CmdPackageDeployFlagAdoptExistingResources)

initCmd.Flags().BoolVar(&pkgConfig.DeployOpts.SkipWebhooks, "skip-webhooks", v.GetBool(common.VPkgDeploySkipWebhooks), lang.CmdPackageDeployFlagSkipWebhooks)

initCmd.Flags().DurationVar(&pkgConfig.DeployOpts.Timeout, "timeout", v.GetDuration(common.VPkgDeployTimeout), lang.CmdPackageDeployFlagTimeout)

initCmd.Flags().IntVar(&pkgConfig.PkgOpts.Retries, "retries", v.GetInt(common.VPkgRetries), lang.CmdPackageFlagRetries)
initCmd.Flags().StringVarP(&pkgConfig.PkgOpts.PublicKeyPath, "key", "k", v.GetString(common.VPkgPublicKey), lang.CmdPackageFlagFlagPublicKey)

initCmd.Flags().SortFlags = true
Expand Down
6 changes: 4 additions & 2 deletions src/cmd/package.go
Original file line number Diff line number Diff line change
Expand Up @@ -364,6 +364,8 @@ func bindCreateFlags(v *viper.Viper) {
createFlags.StringVarP(&pkgConfig.CreateOpts.SigningKeyPath, "key", "k", v.GetString(common.VPkgCreateSigningKey), lang.CmdPackageCreateFlagDeprecatedKey)
createFlags.StringVar(&pkgConfig.CreateOpts.SigningKeyPassword, "key-pass", v.GetString(common.VPkgCreateSigningKeyPassword), lang.CmdPackageCreateFlagDeprecatedKeyPassword)

createFlags.IntVar(&pkgConfig.PkgOpts.Retries, "retries", v.GetInt(common.VPkgRetries), lang.CmdPackageFlagRetries)

createFlags.MarkHidden("output-directory")
createFlags.MarkHidden("key")
createFlags.MarkHidden("key-pass")
Expand All @@ -377,11 +379,10 @@ func bindDeployFlags(v *viper.Viper) {

// Always require adopt-existing-resources flag (no viper)
deployFlags.BoolVar(&pkgConfig.DeployOpts.AdoptExistingResources, "adopt-existing-resources", false, lang.CmdPackageDeployFlagAdoptExistingResources)

deployFlags.BoolVar(&pkgConfig.DeployOpts.SkipWebhooks, "skip-webhooks", v.GetBool(common.VPkgDeploySkipWebhooks), lang.CmdPackageDeployFlagSkipWebhooks)

deployFlags.DurationVar(&pkgConfig.DeployOpts.Timeout, "timeout", v.GetDuration(common.VPkgDeployTimeout), lang.CmdPackageDeployFlagTimeout)

deployFlags.IntVar(&pkgConfig.PkgOpts.Retries, "retries", v.GetInt(common.VPkgRetries), lang.CmdPackageFlagRetries)
deployFlags.StringToStringVar(&pkgConfig.PkgOpts.SetVariables, "set", v.GetStringMapString(common.VPkgDeploySet), lang.CmdPackageDeployFlagSet)
deployFlags.StringVar(&pkgConfig.PkgOpts.OptionalComponents, "components", v.GetString(common.VPkgDeployComponents), lang.CmdPackageDeployFlagComponents)
deployFlags.StringVar(&pkgConfig.PkgOpts.Shasum, "shasum", v.GetString(common.VPkgDeployShasum), lang.CmdPackageDeployFlagShasum)
Expand All @@ -403,6 +404,7 @@ func bindMirrorFlags(v *viper.Viper) {

mirrorFlags.BoolVar(&pkgConfig.MirrorOpts.NoImgChecksum, "no-img-checksum", false, lang.CmdPackageMirrorFlagNoChecksum)

mirrorFlags.IntVar(&pkgConfig.PkgOpts.Retries, "retries", v.GetInt(common.VPkgRetries), lang.CmdPackageFlagRetries)
mirrorFlags.StringVar(&pkgConfig.PkgOpts.OptionalComponents, "components", v.GetString(common.VPkgDeployComponents), lang.CmdPackageMirrorFlagComponents)

// Flags for using an external Git server
Expand Down
7 changes: 5 additions & 2 deletions src/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -99,8 +99,11 @@ var (
operationStartTime = time.Now().Unix()
dataInjectionMarker = ".zarf-injection-%d"

ZarfDefaultCachePath = filepath.Join("~", ".zarf-cache")
ZarfDefaultHelmTimeout = 15 * time.Minute
ZarfDefaultCachePath = filepath.Join("~", ".zarf-cache")

// Default Time Vars
ZarfDefaultTimeout = 15 * time.Minute
ZarfDefaultRetries = 3
)

// GetArch returns the arch based on a priority list with options for overriding.
Expand Down
1 change: 1 addition & 0 deletions src/config/lang/english.go
Original file line number Diff line number Diff line change
Expand Up @@ -232,6 +232,7 @@ $ zarf init --artifact-push-password={PASSWORD} --artifact-push-username={USERNA
CmdPackageShort = "Zarf package commands for creating, deploying, and inspecting packages"
CmdPackageFlagConcurrency = "Number of concurrent layer operations to perform when interacting with a remote package."
CmdPackageFlagFlagPublicKey = "Path to public key file for validating signed packages"
CmdPackageFlagRetries = "Number of retries to perform for Zarf deploy operations like git/image pushes or Helm installs"

CmdPackageCreateShort = "Creates a Zarf package from a given directory or the current directory"
CmdPackageCreateLong = "Builds an archive of resources and dependencies defined by the 'zarf.yaml' in the specified directory.\n" +
Expand Down
97 changes: 48 additions & 49 deletions src/internal/packager/helm/chart.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,9 @@
package helm

import (
"errors"
"fmt"
"github.com/defenseunicorns/zarf/src/pkg/utils/helpers"
"time"

"github.com/Masterminds/semver/v3"
Expand All @@ -24,9 +26,6 @@ import (
"helm.sh/helm/v3/pkg/storage/driver"
)

// Set the default number of Helm install/upgrade attempts to 3
const defaultHelmAttempts = 3

// InstallOrUpgradeChart performs a helm install of the given chart.
func (h *Helm) InstallOrUpgradeChart() (types.ConnectStrings, string, error) {
fromMessage := h.chart.URL
Expand Down Expand Up @@ -63,49 +62,18 @@ func (h *Helm) InstallOrUpgradeChart() (types.ConnectStrings, string, error) {
return nil, "", fmt.Errorf("unable to create helm renderer: %w", err)
}

attempt := 0
for {
attempt++

histClient := action.NewHistory(h.actionConfig)
// TODO: DO_NOT_SUBMIT
// Can this client be reused? It wasn't before
histClient := action.NewHistory(h.actionConfig)
tryHelm := func(attempt int) (error, bool) {
var err error
releases, histErr := histClient.Run(h.chart.ReleaseName)

if attempt > 3 {
previouslyDeployedVersion := 0

// Check for previous releases that successfully deployed
for _, release := range releases {
if release.Info.Status == "deployed" {
previouslyDeployedVersion = release.Version
}
}

// On total failure try to rollback (if there was a previously deployed version) or uninstall.
if previouslyDeployedVersion > 0 {
spinner.Updatef("Performing chart rollback")

err = h.rollbackChart(h.chart.ReleaseName, previouslyDeployedVersion)
if err != nil {
return nil, "", fmt.Errorf("unable to upgrade chart after %d attempts and unable to rollback: %w", defaultHelmAttempts, err)
}

return nil, "", fmt.Errorf("unable to upgrade chart after %d attempts", defaultHelmAttempts)
}

spinner.Updatef("Performing chart uninstall")
_, err = h.uninstallChart(h.chart.ReleaseName)
if err != nil {
return nil, "", fmt.Errorf("unable to install chart after %d attempts and unable to uninstall: %w", defaultHelmAttempts, err)
}

return nil, "", fmt.Errorf("unable to install chart after %d attempts", defaultHelmAttempts)
}

spinner.Updatef("Attempt %d of %d to install chart", attempt, defaultHelmAttempts)
spinner.Updatef("Attempt %d of %d to install chart", attempt, h.retries)

spinner.Updatef("Checking for existing helm deployment")

if histErr == driver.ErrReleaseNotFound {
if errors.Is(histErr, driver.ErrReleaseNotFound) {
// No prior release, try to install it.
spinner.Updatef("Attempting chart installation")

Expand All @@ -119,19 +87,50 @@ func (h *Helm) InstallOrUpgradeChart() (types.ConnectStrings, string, error) {
output, err = h.upgradeChart(lastRelease, postRender)
} else {
// 😭 things aren't working
return nil, "", fmt.Errorf("unable to verify the chart installation status: %w", histErr)
return fmt.Errorf("unable to verify the chart installation status: %w", histErr), true
}

if err != nil {
message.Warnf("Unable to complete helm chart install/upgrade, waiting 10 seconds and trying again: %s", err.Error())
return fmt.Errorf("unable to complete the helm chart install/upgrade: %w", err), false
}

// Simply wait for dust to settle and try again.
time.Sleep(10 * time.Second)
} else {
message.Debug(output.Info.Description)
spinner.Success()
break
message.Debug(output.Info.Description)
spinner.Success()
return nil, false
}

err = helpers.Retry(tryHelm, h.retries, 5*time.Second, message.Warnf)
if err != nil {
// Try to rollback any deployed releases
releases, _ := histClient.Run(h.chart.ReleaseName)
previouslyDeployedVersion := 0

// Check for previous releases that successfully deployed
for _, release := range releases {
if release.Info.Status == "deployed" {
previouslyDeployedVersion = release.Version
}
}

// On total failure try to rollback (if there was a previously deployed version) or uninstall.
if previouslyDeployedVersion > 0 {
spinner.Updatef("Performing chart rollback")

err = h.rollbackChart(h.chart.ReleaseName, previouslyDeployedVersion)
if err != nil {
return nil, "", fmt.Errorf("unable to upgrade chart after %d attempts and unable to rollback: %w", h.retries, err)
}

return nil, "", fmt.Errorf("unable to upgrade chart after %d attempts", h.retries)
}

spinner.Updatef("Performing chart uninstall")
_, err = h.uninstallChart(h.chart.ReleaseName)
if err != nil {
return nil, "", fmt.Errorf("unable to install chart after %d attempts and unable to uninstall: %w", h.retries, err)
}

return nil, "", fmt.Errorf("unable to install chart after %d attempts", h.retries)
}

// return any collected connect strings for zarf connect.
Expand Down
13 changes: 8 additions & 5 deletions src/internal/packager/helm/common.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@ import (
"time"

"github.com/defenseunicorns/zarf/src/config"
"github.com/defenseunicorns/zarf/src/pkg/message"
"github.com/defenseunicorns/zarf/src/pkg/cluster"
"github.com/defenseunicorns/zarf/src/pkg/message"
"github.com/defenseunicorns/zarf/src/types"
"helm.sh/helm/v3/pkg/action"
"helm.sh/helm/v3/pkg/chart"
Expand All @@ -33,6 +33,7 @@ type Helm struct {
component types.ZarfComponent
cluster *cluster.Cluster
timeout time.Duration
retries int

kubeVersion string

Expand All @@ -52,7 +53,7 @@ func New(chart types.ZarfChart, chartPath string, valuesPath string, mods ...Mod
chart: chart,
chartPath: chartPath,
valuesPath: valuesPath,
timeout: config.ZarfDefaultHelmTimeout,
timeout: config.ZarfDefaultTimeout,
}

for _, mod := range mods {
Expand All @@ -67,7 +68,8 @@ func NewClusterOnly(cfg *types.PackagerConfig, cluster *cluster.Cluster) *Helm {
return &Helm{
cfg: cfg,
cluster: cluster,
timeout: config.ZarfDefaultHelmTimeout,
timeout: config.ZarfDefaultTimeout,
retries: config.ZarfDefaultRetries,
}
}

Expand Down Expand Up @@ -118,7 +120,7 @@ func NewFromZarfManifest(manifest types.ZarfManifest, manifestPath, packageName,
NoWait: manifest.NoWait,
},
chartOverride: tmpChart,
timeout: config.ZarfDefaultHelmTimeout,
timeout: config.ZarfDefaultTimeout,
}

for _, mod := range mods {
Expand All @@ -131,13 +133,14 @@ func NewFromZarfManifest(manifest types.ZarfManifest, manifestPath, packageName,
}

// WithDeployInfo adds the necessary information to deploy a given chart
func WithDeployInfo(component types.ZarfComponent, cfg *types.PackagerConfig, cluster *cluster.Cluster, valuesOverrides map[string]any, timeout time.Duration) Modifier {
func WithDeployInfo(component types.ZarfComponent, cfg *types.PackagerConfig, cluster *cluster.Cluster, valuesOverrides map[string]any, timeout time.Duration, retries int) Modifier {
return func(h *Helm) {
h.component = component
h.cfg = cfg
h.cluster = cluster
h.valuesOverrides = valuesOverrides
h.timeout = timeout
h.retries = retries
}
}

Expand Down
8 changes: 4 additions & 4 deletions src/pkg/packager/create_stages.go
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,7 @@ func (p *Packager) assemble() error {
var pulled []images.ImgInfo
var err error

doPull := func() error {
doPull := func(_ int) (error, bool) {
imgConfig := images.ImageConfig{
ImagesPath: p.layout.Images.Base,
ImageList: imageList,
Expand All @@ -158,11 +158,11 @@ func (p *Packager) assemble() error {
}

pulled, err = imgConfig.PullAll()
return err
return err, false
}

if err := helpers.Retry(doPull, 3, 5*time.Second, message.Warnf); err != nil {
return fmt.Errorf("unable to pull images after 3 attempts: %w", err)
if err := helpers.Retry(doPull, p.cfg.PkgOpts.Retries, 5*time.Second, message.Warnf); err != nil {
return fmt.Errorf("unable to pull images after %d attempts: %w", p.cfg.PkgOpts.Retries, err)
}

for _, imgInfo := range pulled {
Expand Down
28 changes: 15 additions & 13 deletions src/pkg/packager/deploy.go
Original file line number Diff line number Diff line change
Expand Up @@ -468,16 +468,16 @@ func (p *Packager) pushImagesToRegistry(componentImages []string, noImgChecksum
Architectures: []string{p.cfg.Pkg.Metadata.Architecture, p.cfg.Pkg.Build.Architecture},
}

return helpers.Retry(func() error {
return imgConfig.PushToZarfRegistry()
}, 3, 5*time.Second, message.Warnf)
return helpers.Retry(func(_ int) (error, bool) {
return imgConfig.PushToZarfRegistry(), false
}, p.cfg.PkgOpts.Retries, 5*time.Second, message.Warnf)
}

// Push all of the components git repos to the configured git server.
func (p *Packager) pushReposToRepository(reposPath string, repos []string) error {
for _, repoURL := range repos {
// Create an anonymous function to push the repo to the Zarf git server
tryPush := func() error {
tryPush := func(_ int) (error, bool) {
gitClient := git.New(p.cfg.State.GitServer)
svcInfo, _ := k8s.ServiceInfoFromServiceURL(gitClient.Server.Address)

Expand All @@ -489,30 +489,30 @@ func (p *Packager) pushReposToRepository(reposPath string, repos []string) error
if !p.isConnectedToCluster() {
err := p.connectToCluster(5 * time.Second)
if err != nil {
return err
return err, false
}
}

tunnel, err = p.cluster.NewTunnel(svcInfo.Namespace, k8s.SvcResource, svcInfo.Name, "", 0, svcInfo.Port)
if err != nil {
return err
return err, false
}

_, err = tunnel.Connect()
if err != nil {
return err
return err, false
}
defer tunnel.Close()
gitClient.Server.Address = tunnel.HTTPEndpoint()

return tunnel.Wrap(func() error { return gitClient.PushRepo(repoURL, reposPath) })
return tunnel.Wrap(func() error { return gitClient.PushRepo(repoURL, reposPath) }), false
}

return gitClient.PushRepo(repoURL, reposPath)
return gitClient.PushRepo(repoURL, reposPath), false
}

// Try repo push up to 3 times
if err := helpers.Retry(tryPush, 3, 5*time.Second, message.Warnf); err != nil {
// Try repo push up to retry limit
if err := helpers.Retry(tryPush, p.cfg.PkgOpts.Retries, 5*time.Second, message.Warnf); err != nil {
return fmt.Errorf("unable to push repo %s to the Git Server: %w", repoURL, err)
}
}
Expand Down Expand Up @@ -549,7 +549,8 @@ func (p *Packager) installChartAndManifests(componentPaths *layout.ComponentPath
p.cfg,
p.cluster,
valuesOverrides,
p.cfg.DeployOpts.Timeout),
p.cfg.DeployOpts.Timeout,
p.cfg.PkgOpts.Retries),
)

addedConnectStrings, installedChartName, err := helmCfg.InstallOrUpgradeChart()
Expand Down Expand Up @@ -596,7 +597,8 @@ func (p *Packager) installChartAndManifests(componentPaths *layout.ComponentPath
p.cfg,
p.cluster,
nil,
p.cfg.DeployOpts.Timeout),
p.cfg.DeployOpts.Timeout,
p.cfg.PkgOpts.Retries),
)
if err != nil {
return installedCharts, err
Expand Down
Loading

0 comments on commit 07b5228

Please sign in to comment.