Delay the restart of application when a status report of failure is given #25339

Merged (5 commits, Apr 28, 2021)
Changes from 2 commits
1 change: 1 addition & 0 deletions x-pack/elastic-agent/CHANGELOG.next.asciidoc
@@ -56,6 +56,7 @@
- Fix issue with status and inspect inside of container {pull}25204[25204]
- Remove FLEET_SERVER_POLICY_NAME env variable as it was not used {pull}25149[25149]
- Reduce log level for listener cleanup to debug {pull}25274[25274]
+- Delay the restart of application when a status report of failure is given {pull}25339[25339]

==== New features

7 changes: 0 additions & 7 deletions x-pack/elastic-agent/pkg/agent/cmd/enroll_cmd.go
@@ -613,13 +613,6 @@ func waitForFleetServer(ctx context.Context, agentSubproc <-chan *os.ProcessStat
        }
        resChan <- waitResult{enrollmentToken: token}
        break
-    } else if app.Status == proto.Status_FAILED {
-        // app completely failed; exit now
-        if app.Message != "" {
-            log.Infof("Fleet Server - %s", app.Message)
-        }
-        resChan <- waitResult{err: errors.New(app.Message)}
-        break
    }
    if app.Message != "" {
        appMsg := fmt.Sprintf("Fleet Server - %s", app.Message)
4 changes: 3 additions & 1 deletion x-pack/elastic-agent/pkg/core/plugin/process/app.go
@@ -58,7 +58,9 @@ type Application struct {

    logger *logger.Logger

-    appLock sync.Mutex
+    appLock          sync.Mutex
+    restartCanceller context.CancelFunc
+    restartConfig    map[string]interface{}
}

// ArgsDecorator decorates arguments before calling an application
82 changes: 71 additions & 11 deletions x-pack/elastic-agent/pkg/core/plugin/process/status.go
@@ -5,7 +5,9 @@
package process

import (
+    "context"
    "fmt"
+    "time"

    "gopkg.in/yaml.v2"

@@ -15,6 +17,11 @@ import (
    "github.com/elastic/beats/v7/x-pack/elastic-agent/pkg/core/state"
)

+const (
+    // FailedRestartTimeout is the amount of time an Application can sit in Failed status before it is restarted.
+    FailedRestartTimeout = 10 * time.Second
+)

// OnStatusChange is the handler called by the GRPC server code.
//
// It updates the status of the application and handles restarting the application if needed.
@@ -35,21 +42,74 @@ func (a *Application) OnStatusChange(s *server.ApplicationState, status proto.St
            return
        }

-        // kill the process
-        if a.state.ProcessInfo != nil {
-            _ = a.state.ProcessInfo.Process.Kill()
-            a.state.ProcessInfo = nil
-        }
-        ctx := a.startContext
-        tag := a.tag

        // it was marshalled to pass into the state, so unmarshall will always succeed
        var cfg map[string]interface{}
        _ = yaml.Unmarshal([]byte(s.Config()), &cfg)

Review comment (Contributor): Unrelated to this PR but I don't think we should swallow the errors here.

-        err := a.start(ctx, tag, cfg)
-        if err != nil {
-            a.setState(state.Crashed, fmt.Sprintf("failed to restart: %s", err), nil)
+        // start the failed timer
+        a.startFailedTimer(cfg)
+    } else {
+        a.stopFailedTimer()
    }
}

+// startFailedTimer starts a timer that will restart the application if it doesn't exit failed after a period of time.
+//
+// This does not grab the appLock, that must be managed by the caller.
+func (a *Application) startFailedTimer(cfg map[string]interface{}) {
+    if a.restartCanceller != nil {
+        // already have running failed timer; just update config
+        a.restartConfig = cfg
+        return
+    }
+
+    ctx, cancel := context.WithCancel(a.startContext)
+    a.restartCanceller = cancel
+    a.restartConfig = cfg
+    t := time.NewTimer(FailedRestartTimeout)
+    go func() {
+        defer func() {
+            a.appLock.Lock()
+            a.restartCanceller = nil
+            a.restartConfig = nil
+            a.appLock.Unlock()
+        }()
+
+        select {
+        case <-ctx.Done():
+            return
+        case <-t.C:
+            a.restart(a.restartConfig)
+        }
+    }()
+}

+// stopFailedTimer stops the timer that would restart the application from reporting failure.
+//
+// This does not grab the appLock, that must be managed by the caller.
+func (a *Application) stopFailedTimer() {
+    if a.restartCanceller == nil {
+        return
+    }
+    a.restartCanceller()
+    a.restartCanceller = nil
+}

+// restart restarts the application
+func (a *Application) restart(cfg map[string]interface{}) {
+    a.appLock.Lock()
+    defer a.appLock.Unlock()
+
+    // kill the process
+    if a.state.ProcessInfo != nil {
+        _ = a.state.ProcessInfo.Process.Kill()
+        a.state.ProcessInfo = nil
+    }
+    ctx := a.startContext
+    tag := a.tag
+
+    err := a.start(ctx, tag, cfg)
+    if err != nil {
+        a.setState(state.Crashed, fmt.Sprintf("failed to restart: %s", err), nil)

Review comment (Contributor): Can we add here a bit more info which process (name?) failed to restart?

Reply (Contributor Author): That is there, that is managed inside of the setState.
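
If the process name ever needed to appear in the failure message itself, one purely illustrative variant (assuming the Application type exposes a Name() accessor, which this diff does not show) would be:

    a.setState(state.Crashed, fmt.Sprintf("failed to restart %s: %s", a.Name(), err), nil)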

+    }
+}
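
For readers who want to see the delayed-restart mechanism in isolation, below is a minimal, self-contained sketch of the same cancellable-timer pattern. All names are hypothetical stand-ins rather than the actual elastic-agent types, and the timeout is shortened so the example runs quickly:

    package main

    import (
        "context"
        "fmt"
        "sync"
        "time"
    )

    // failedRestartTimeout mirrors FailedRestartTimeout from the PR: how long a
    // component may stay failed before a restart is attempted.
    const failedRestartTimeout = 2 * time.Second

    // app is a stand-in for the Application type; it only tracks the cancel
    // function of a pending delayed restart.
    type app struct {
        mu               sync.Mutex
        restartCanceller context.CancelFunc
    }

    // onFailure schedules a restart after failedRestartTimeout unless the failure
    // clears first (onHealthy cancels the pending timer).
    func (a *app) onFailure() {
        a.mu.Lock()
        defer a.mu.Unlock()
        if a.restartCanceller != nil {
            return // a delayed restart is already pending
        }
        ctx, cancel := context.WithCancel(context.Background())
        a.restartCanceller = cancel
        t := time.NewTimer(failedRestartTimeout)
        go func() {
            defer func() {
                a.mu.Lock()
                a.restartCanceller = nil
                a.mu.Unlock()
            }()
            select {
            case <-ctx.Done():
                t.Stop()
                fmt.Println("failure cleared before the timeout; restart cancelled")
            case <-t.C:
                fmt.Println("still failed after the timeout; restarting now")
            }
        }()
    }

    // onHealthy cancels any pending delayed restart.
    func (a *app) onHealthy() {
        a.mu.Lock()
        defer a.mu.Unlock()
        if a.restartCanceller != nil {
            a.restartCanceller()
            a.restartCanceller = nil
        }
    }

    func main() {
        a := &app{}
        a.onFailure()               // a failure is reported...
        time.Sleep(time.Second)     // ...but it recovers before the timeout
        a.onHealthy()
        time.Sleep(2 * time.Second) // give the goroutine time to print
    }

The key design point, matching the PR, is that a repeated failure report while a timer is already pending does not re-arm the timer, and a recovery cancels it before the restart fires.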