cvo: When the CVO restarts, perform one final sync to write status
When we upgrade, the CVO causes itself to restart by updating its
deployment. The CVO is signalled with SIGTERM and then releases the
leader lease. However, there is no guarantee that the latest status
of the CVO has been flushed to the ClusterVersion object, which can
mean the "verified: true" flag that the sync worker calculates when
it retrieves the payload never gets written. The new CVO pod loads
its state from the payload and so doesn't have the verified flag.

While in the future we may want to completely decouple verification
from payload retrieval (a background worker that verifies available
updates as well as checking historical records), for now we need to
ensure the loaded state is persisted to the ClusterVersion object.
Since there may be useful human-readable information about the
payload that a failed new CVO pod might not get a chance to write,
alter the CVO sync loop to perform one final status sync during
shutdown, and increase the amount of time we wait before a hard
shutdown to 5s to give that sync more room to complete.
smarterclayton committed Apr 29, 2019
1 parent 51fef0b commit dbedb7a
Showing 2 changed files with 18 additions and 4 deletions.
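
The shape of the change is easier to see outside the diff. The following is a minimal, self-contained sketch of the shutdown pattern, not the CVO's actual code: a buffered channel stands in for the client-go workqueue, and names such as syncStatus and shutdownQueue are illustrative placeholders for the status write and the queue shutdown. The worker drains the queue until it is shut down, performs one last sync, and only then signals that shutdown is complete, which is what lets in-memory state like the verified flag reach the API server before the process exits.

package main

import (
	"fmt"
	"sync"
	"time"
)

func main() {
	queue := make(chan string, 16) // stand-in for the client-go workqueue
	var shutdownOnce sync.Once
	shutdownQueue := func() { shutdownOnce.Do(func() { close(queue) }) }

	syncStatus := func(key string) error {
		fmt.Println("writing status for", key) // a real operator would update the ClusterVersion object here
		return nil
	}

	stopCh := make(chan struct{})
	workerStopCh := make(chan struct{})

	// worker: drain the queue, then flush status one final time before exiting
	go func() {
		defer close(workerStopCh)
		for key := range queue {
			_ = syncStatus(key)
		}
		_ = syncStatus("version") // final sync so the latest in-memory state is persisted
	}()

	queue <- "version"

	// simulate SIGTERM arriving shortly after startup
	go func() {
		time.Sleep(100 * time.Millisecond)
		close(stopCh)
	}()

	<-stopCh
	shutdownQueue() // stop the queue exactly once
	<-workerStopCh  // wait for the final sync before releasing the lease and exiting
}

The sync.Once here mirrors the shutdownOnce in the diff below: the queue must be shut down exactly once, whether shutdown happens via the deferred call or the explicit call made before waiting on workerStopCh.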
20 changes: 17 additions & 3 deletions pkg/cvo/cvo.go
@@ -226,8 +226,11 @@ func (optr *Operator) InitializeFromPayload(restConfig *rest.Config, burstRestCo
// Run runs the cluster version operator until stopCh is completed. Workers is ignored for now.
func (optr *Operator) Run(ctx context.Context, workers int) {
	defer utilruntime.HandleCrash()
	defer optr.queue.ShutDown()
	// TODO: when Kube 77170 is fixed we can remove the use of the once here
	var shutdownOnce sync.Once
	defer shutdownOnce.Do(func() { optr.queue.ShutDown() })
	stopCh := ctx.Done()
	workerStopCh := make(chan struct{})

	glog.Infof("Starting ClusterVersionOperator with minimum reconcile period %s", optr.minimumUpdateCheckInterval)
	defer glog.Info("Shutting down ClusterVersionOperator")
@@ -243,11 +246,22 @@ func (optr *Operator) Run(ctx context.Context, workers int) {
	// start the config sync loop, and have it notify the queue when new status is detected
	go runThrottledStatusNotifier(stopCh, optr.statusInterval, 2, optr.configSync.StatusCh(), func() { optr.queue.Add(optr.queueKey()) })
	go optr.configSync.Start(ctx, 16)

	go wait.Until(func() { optr.worker(optr.queue, optr.sync) }, time.Second, stopCh)
	go wait.Until(func() { optr.worker(optr.availableUpdatesQueue, optr.availableUpdatesSync) }, time.Second, stopCh)
	go wait.Until(func() {
		defer close(workerStopCh)

		// run the worker, then when the queue is closed sync one final time to flush any pending status
		optr.worker(optr.queue, optr.sync)
		if err := optr.sync(optr.queueKey()); err != nil {
			utilruntime.HandleError(fmt.Errorf("unable to perform final sync: %v", err))
		}
	}, time.Second, stopCh)

	<-stopCh

	// stop the queue, then wait for the worker to exit
	shutdownOnce.Do(func() { optr.queue.ShutDown() })
	<-workerStopCh
}

func (optr *Operator) queueKey() string {
2 changes: 1 addition & 1 deletion pkg/start/start.go
@@ -132,7 +132,7 @@ func (o *Options) Run() error {

	// exit after 2s no matter what
	select {
	case <-time.After(2 * time.Second):
	case <-time.After(5 * time.Second):
		glog.Fatalf("Exiting")
	case <-ch:
		glog.Fatalf("Received shutdown signal twice, exiting")
