🌟 enhancement(VSecM Sentinel): refactored forever loops (#946)

updated infinite loops with early crashes instead Signed-off-by: Volkan Özçelik <ovolkan@vmware.com>
vmware-tanzu · Apr 26, 2024 · 3ec6188 · 3ec6188
1 parent 77cbb17
commit 3ec6188
Show file tree

Hide file tree

Showing 7 changed files with 92 additions and 167 deletions.
diff --git a/app/sentinel/background/initialization/connectivity.go b/app/sentinel/background/initialization/connectivity.go
@@ -13,94 +13,85 @@ package initialization
 import (
  "context"
  "github.com/pkg/errors"
+ "github.com/spiffe/go-spiffe/v2/workloadapi"
  "time"
 
  "github.com/vmware-tanzu/secrets-manager/app/sentinel/internal/safe"
  "github.com/vmware-tanzu/secrets-manager/core/backoff"
- "github.com/vmware-tanzu/secrets-manager/core/env"
  log "github.com/vmware-tanzu/secrets-manager/core/log/std"
  "github.com/vmware-tanzu/secrets-manager/core/spiffe"
 )
 
 func ensureApiConnectivity(ctx context.Context, cid *string) {
- terminateAsap := env.TerminateSentinelOnInitCommandConnectivityFailure()
-
  log.TraceLn(cid, "Before checking api connectivity")
 
- for {
- s := backoffStrategy()
-
- err := backoff.Retry("RunInitCommands:CheckConnectivity", func() error {
- log.TraceLn(cid, "RunInitCommands:CheckConnectivity: checking connectivity to safe")
+ s := backoffStrategy()
+ err := backoff.Retry("RunInitCommands:CheckConnectivity", func() error {
+ log.TraceLn(cid, "RunInitCommands:CheckConnectivity: checking connectivity to safe")
 
- src, acquired := spiffe.AcquireSourceForSentinel(ctx)
- if !acquired {
- log.TraceLn(cid, "RunInitCommands:CheckConnectivity: failed to acquire source.")
- if terminateAsap {
- panic("RunInitCommands:CheckConnectivity: failed to acquire source")
- }
+ src, acquired := spiffe.AcquireSourceForSentinel(ctx)
+ if !acquired {
+ log.TraceLn(cid, "RunInitCommands:CheckConnectivity: failed to acquire source.")
 
-  return errors.New("RunInitCommands:CheckConnectivity: failed to acquire source")
-  }
+ return errors.New("RunInitCommands:CheckConnectivity: failed to acquire source")
+ }
 
-  log.TraceLn(cid, "RunInitCommands:CheckConnectivity: acquired source successfully")
+ log.TraceLn(cid, "RunInitCommands:CheckConnectivity: acquired source successfully")
 
- if err := safe.Check(ctx, src); err != nil {
- log.TraceLn(cid, "RunInitCommands:CheckConnectivity: failed to verify connection to safe:", err.Error())
- if terminateAsap {
- panic("RunInitCommands:CheckConnectivity: failed to verify connection to safe")
- }
+ if err := safe.Check(ctx, src); err != nil {
+ log.TraceLn(cid, "RunInitCommands:CheckConnectivity: failed to verify connection to safe:", err.Error())
 
-  return errors.Wrap(err, "RunInitCommands:CheckConnectivity: cannot establish connection to safe 001")
-  }
+ return errors.Wrap(err, "RunInitCommands:CheckConnectivity: cannot establish connection to safe 001")
+ }
 
-  log.TraceLn(cid, "RunInitCommands:CheckConnectivity: success")
-  return nil
-  }, s)
+ log.TraceLn(cid, "RunInitCommands:CheckConnectivity: success")
+ return nil
+ }, s)
 
- if err == nil {
- log.TraceLn(cid, "exiting backoffs")
- break
- }
+ if err == nil {
+ log.TraceLn(cid, "exiting backoffs")
+ return
  }
+
+ // I shouldn't be here.
+ panic("RunInitCommands:CheckConnectivity: failed to verify connection to safe")
 }
 
-func ensureSourceAcquisition(ctx context.Context, cid *string) {
+func ensureSourceAcquisition(ctx context.Context, cid *string) *workloadapi.X509Source {
  // If `true`, instead of retrying with a backoff, kill the pod, and let the
  // deployment controller restart it to initiate a new retry.
- terminateAsap := env.TerminateSentinelOnInitCommandConnectivityFailure()
 
- waitInterval := env.InitCommandRunnerWaitIntervalForSentinel()
- time.Sleep(waitInterval)
+ log.TraceLn(cid, "RunInitCommands: acquiring source 001")
 
- for {
- log.TraceLn(cid, "RunInitCommands: acquiring source 001")
+ s := backoff.Strategy{
+ MaxRetries: 20,
+ Delay: 1000,
+ Exponential: true,
+ MaxDuration: 30 * time.Second,
+ }
 
- s := backoff.Strategy{
- MaxRetries: 20,
- Delay: 1000,
- Exponential: true,
- MaxDuration: 30 * time.Second,
- }
+ var src *workloadapi.X509Source
 
- err := backoff.Retry("RunInitCommands:AcquireSource", func() error {
- log.TraceLn(cid, "RunInitCommands:AcquireSource: acquireSourceForSentinel: 000")
- _, acquired := spiffe.AcquireSourceForSentinel(ctx)
- if !acquired {
- log.TraceLn(cid, "RunInitCommands:AcquireSource: failed to acquire source.")
- if terminateAsap {
- panic("RunInitCommands:AcquireSource: failed to acquire source")
- }
+ err := backoff.Retry("RunInitCommands:AcquireSource", func() error {
+ log.TraceLn(cid, "RunInitCommands:AcquireSource: acquireSourceForSentinel: 000")
 
-  return errors.New("RunInitCommands:AcquireSource: failed to acquire source 000")
-  }
+ acq, acquired := spiffe.AcquireSourceForSentinel(ctx)
+ src = acq
 
-  return nil
- }, s)
+ if !acquired {
+  log.TraceLn(cid, "RunInitCommands:AcquireSource: failed to acquire source.")
 
- if err == nil {
- log.TraceLn(cid, "RunInitCommands:AcquireSource: got source. breaking.")
- break
+ return errors.New("RunInitCommands:AcquireSource: failed to acquire source 000")
  }
+
+ return nil
+ }, s)
+
+ if err == nil {
+ log.TraceLn(cid, "RunInitCommands:AcquireSource: got source. breaking.")
+ return src
  }
+
+ // I shouldn't be here.
+ panic("RunInitCommands:AcquireSource: failed to acquire source")
 }
diff --git a/app/sentinel/background/initialization/io.go b/app/sentinel/background/initialization/io.go
@@ -45,15 +45,9 @@ func parseCommandsFile(ctx context.Context, cid *string, scanner *bufio.Scanner)
  log.TraceLn(cid, "Before parsing commands 002")
 
  sc := entity.SentinelCommand{}
- terminateAsap := env.TerminateSentinelOnInitCommandConnectivityFailure()
 
  if scanner == nil {
- if terminateAsap {
- log.ErrorLn(cid, "RunInitCommands: error scanning commands file")
- panic("RunInitCommands: error scanning commands file")
- }
-
- return
+ panic("RunInitCommands: error scanning commands file")
  }
 
  log.TraceLn(cid, "beginning scan")
@@ -97,10 +91,8 @@ dance:
  "RunInitCommands:ProcessCommandBlock:error:",
  err.Error(),
  )
- if terminateAsap {
- panic("RunInitCommands:ProcessCommandBlock failed")
- }
  }
+
  return err
  }, s)
 
@@ -110,9 +102,10 @@ dance:
  "RunInitCommands: error processing command block: ",
  err.Error(),
  )
- if terminateAsap {
- panic("RunInitCommands: error processing command block")
- }
+
+ // If command failed, then the initialization is not totally successful.
+ // Thus, it is best to crash the container to restart the initialization.
+ panic("RunInitCommands:ProcessCommandBlock failed")
  }
 
  log.TraceLn(cid, "scanner: after delimiter")
@@ -181,8 +174,9 @@ dance:
  "RunInitCommands: Error reading initialization file: ",
  err.Error(),
  )
- if terminateAsap {
- panic("RunInitCommands: Error reading initialization file")
- }
+
+ // If command failed, then the initialization is not totally successful.
+ // Thus, it is best to crash the container to restart the initialization.
+ panic("RunInitCommands: Error in scanning the file")
  }
 }
diff --git a/app/sentinel/background/initialization/keystone.go b/app/sentinel/background/initialization/keystone.go
@@ -15,13 +15,10 @@ import (
 
  "github.com/vmware-tanzu/secrets-manager/core/backoff"
  entity "github.com/vmware-tanzu/secrets-manager/core/entity/data/v1"
- "github.com/vmware-tanzu/secrets-manager/core/env"
  log "github.com/vmware-tanzu/secrets-manager/core/log/std"
 )
 
 func markKeystone(ctx context.Context, cid *string) bool {
- terminateAsap := env.TerminateSentinelOnInitCommandConnectivityFailure()
-
  s := backoffStrategy()
  err := backoff.Retry("RunInitCommands:MarkKeystone", func() error {
  log.TraceLn(cid, "RunInitCommands:MarkKeystone: retrying with exponential backoff")
@@ -33,22 +30,13 @@ func markKeystone(ctx context.Context, cid *string) bool {
  Secret: "keystone-init",
  })
 
- if err != nil {
- if terminateAsap {
- panic("RunInitCommands: error setting keystone secret")
- }
- }
-
  return err
  }, s)
 
- if err != nil {
- log.ErrorLn(cid, "RunInitCommands: error setting keystone secret: ", err.Error())
- if terminateAsap {
- panic("RunInitCommands: error setting keystone secret")
- }
- return false
+ if err == nil {
+ return true
  }
 
- return true
+ log.ErrorLn(cid, "RunInitCommands: error setting keystone secret: ", err.Error())
+ panic("RunInitCommands: error setting keystone secret")
 }
diff --git a/app/sentinel/background/initialization/run.go b/app/sentinel/background/initialization/run.go
@@ -12,7 +12,6 @@ package initialization
 
 import (
  "context"
- "github.com/spiffe/go-spiffe/v2/workloadapi"
  "os"
  "time"
 
@@ -47,9 +46,19 @@ import (
 // If the file cannot be opened, the function logs an informational message and
 // returns early. Errors encountered while reading the file or closing it are
 // logged as errors.
-func RunInitCommands(ctx context.Context, source *workloadapi.X509Source) {
+func RunInitCommands(ctx context.Context) {
  cid := ctx.Value("correlationId").(*string)
 
+ waitInterval := env.InitCommandRunnerWaitBeforeExecIntervalForSentinel()
+ time.Sleep(waitInterval)
+
+ // Ensure that we can acquire a source before proceeding.
+ source := ensureSourceAcquisition(ctx, cid)
+
+ // Now, we are sure that we can acquire a source.
+ // Try to do a VSecM Safe API request with the source.
+ ensureApiConnectivity(ctx, cid)
+
  // No need to proceed if initialization has been completed already.
  if initCommandsExecutedAlready(ctx, source) {
  log.TraceLn(cid, "RunInitCommands: executed already. exiting")
@@ -58,12 +67,6 @@ func RunInitCommands(ctx context.Context, source *workloadapi.X509Source) {
 
  log.TraceLn(cid, "RunInitCommands: starting the init flow")
 
- // Ensure that we can acquire a source before proceeding.
- ensureSourceAcquisition(ctx, cid)
- // Now, we are sure that we can acquire a source.
- // Try to do a VSecM Safe API request with the source.
- ensureApiConnectivity(ctx, cid)
-
  // Now we know that we can establish a connection to VSecM Safe
  // and execute API requests. So, we can safely run init commands.
 
@@ -100,13 +103,14 @@ func RunInitCommands(ctx context.Context, source *workloadapi.X509Source) {
  if !success {
  log.TraceLn(cid, "RunInitCommands: failed to mark keystone. exiting")
 
- // If we cannot set the keystone secret, we should not proceed.
+ // If we cannot set the keystone secret, better to retry everything.
+ panic("RunInitCommands: failed to set keystone secret")
  return
  }
 
  // Wait before notifying Keystone. This way, if there are things that
  // take time to reconcile, they have a chance to do so.
- waitInterval := env.InitCommandRunnerWaitIntervalBeforeInitComplete()
+ waitInterval = env.InitCommandRunnerWaitIntervalBeforeInitComplete()
  time.Sleep(waitInterval)
 
  // Everything is set up.

diff --git a/app/sentinel/background/initialization/validation.go b/app/sentinel/background/initialization/validation.go
@@ -23,24 +23,24 @@ func initCommandsExecutedAlready(ctx context.Context, src *workloadapi.X509Sourc
 
  log.TraceLn(cid, "check:initCommandsExecutedAlready")
 
- s := backoffStrategy()
  initialized := false
- for {
- err := backoff.Retry("RunInitCommands:CheckConnectivity", func() error {
- i, err := safe.CheckInitialization(ctx, src)
- if err != nil {
- return err
- }
- initialized = i
- return nil
- }, s)
 
+ s := backoffStrategy()
+ err := backoff.Retry("RunInitCommands:CheckConnectivity", func() error {
+ i, err := safe.CheckInitialization(ctx, src)
  if err != nil {
- log.ErrorLn(cid, "check:backoff:error", err.Error())
- continue
+ return err
  }
 
- log.TraceLn(cid, "check:return initialized:", initialized)
+ initialized = i
+
+ return nil
+ }, s)
+
+ if err == nil {
  return initialized
  }
+
+ // I shouldn't be here.
+ panic("RunInitCommands:initCommandsExecutedAlready: failed to check command initialization")
 }