cvo: When the CVO restarts, perform one final sync to write status
When we upgrade, the CVO causes itself to restart by updating its
deployment. The CVO is signalled with SIGTERM and then releases the
leader lease. However, there is no guarantee that the latest status
of the CVO has been flushed to the ClusterVersion object, which can
mean the "verified: true" flag that the sync worker calculates when
it retrieves the payload never gets written. The new CVO pod loads
its state from the payload and so doesn't have the verified flag.

While in the future we may want to completely decouple verification
from payload retrieval (a background worker that verifies available
updates as well as checking historical records), for now we need to
ensure the loaded state is persisted to the ClusterVersion object.
Since there may be useful human-readable information about the
payload that a failed new CVO pod might not get a chance to write,
alter the CVO sync loop to perform one final status sync during
shutdown, and increase the amount of time we wait before a hard
shutdown to 5s to give that sync more room to complete.
smarterclayton committed Apr 29, 2019
1 parent 51fef0b commit dbedb7a
Showing 2 changed files with 18 additions and 4 deletions.
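
The shape of the change is easier to see outside the diff. The following is a minimal, self-contained sketch of the shutdown pattern, not the CVO's actual code: a buffered channel stands in for the client-go workqueue, and names such as syncStatus and shutdownQueue are illustrative placeholders for the status write and the queue shutdown. The worker drains the queue until it is shut down, performs one last sync, and only then signals that shutdown is complete, which is what lets in-memory state like the verified flag reach the API server before the process exits.

package main

import (
	"fmt"
	"sync"
	"time"
)

func main() {
	queue := make(chan string, 16) // stand-in for the client-go workqueue
	var shutdownOnce sync.Once
	shutdownQueue := func() { shutdownOnce.Do(func() { close(queue) }) }

	syncStatus := func(key string) error {
		fmt.Println("writing status for", key) // a real operator would update the ClusterVersion object here
		return nil
	}

	stopCh := make(chan struct{})
	workerStopCh := make(chan struct{})

	// worker: drain the queue, then flush status one final time before exiting
	go func() {
		defer close(workerStopCh)
		for key := range queue {
			_ = syncStatus(key)
		}
		_ = syncStatus("version") // final sync so the latest in-memory state is persisted
	}()

	queue <- "version"

	// simulate SIGTERM arriving shortly after startup
	go func() {
		time.Sleep(100 * time.Millisecond)
		close(stopCh)
	}()

	<-stopCh
	shutdownQueue() // stop the queue exactly once
	<-workerStopCh  // wait for the final sync before releasing the lease and exiting
}

The sync.Once here mirrors the shutdownOnce in the diff below: the queue must be shut down exactly once, whether shutdown happens via the deferred call or the explicit call made before waiting on workerStopCh.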
20 changes: 17 additions & 3 deletions pkg/cvo/cvo.go
@@ -226,8 +226,11 @@ func (optr *Operator) InitializeFromPayload(restConfig *rest.Config, burstRestCo
// Run runs the cluster version operator until stopCh is completed. Workers is ignored for now.
func (optr *Operator) Run(ctx context.Context, workers int) {
	defer utilruntime.HandleCrash()
	defer optr.queue.ShutDown()
	// TODO: when Kube 77170 is fixed we can remove the use of the once here
	var shutdownOnce sync.Once
	defer shutdownOnce.Do(func() { optr.queue.ShutDown() })
	stopCh := ctx.Done()
	workerStopCh := make(chan struct{})

	glog.Infof("Starting ClusterVersionOperator with minimum reconcile period %s", optr.minimumUpdateCheckInterval)
	defer glog.Info("Shutting down ClusterVersionOperator")
@@ -243,11 +246,22 @@ func (optr *Operator) Run(ctx context.Context, workers int) {
	// start the config sync loop, and have it notify the queue when new status is detected
	go runThrottledStatusNotifier(stopCh, optr.statusInterval, 2, optr.configSync.StatusCh(), func() { optr.queue.Add(optr.queueKey()) })
	go optr.configSync.Start(ctx, 16)

	go wait.Until(func() { optr.worker(optr.queue, optr.sync) }, time.Second, stopCh)
	go wait.Until(func() { optr.worker(optr.availableUpdatesQueue, optr.availableUpdatesSync) }, time.Second, stopCh)
	go wait.Until(func() {
		defer close(workerStopCh)

		// run the worker, then when the queue is closed sync one final time to flush any pending status
		optr.worker(optr.queue, optr.sync)
		if err := optr.sync(optr.queueKey()); err != nil {
			utilruntime.HandleError(fmt.Errorf("unable to perform final sync: %v", err))
		}
	}, time.Second, stopCh)

	<-stopCh

	// stop the queue, then wait for the worker to exit
	shutdownOnce.Do(func() { optr.queue.ShutDown() })
	<-workerStopCh
}

func (optr *Operator) queueKey() string {
2 changes: 1 addition & 1 deletion pkg/start/start.go
@@ -132,7 +132,7 @@ func (o *Options) Run() error {

	// exit after 2s no matter what
	select {
	case <-time.After(2 * time.Second):
	case <-time.After(5 * time.Second):
		glog.Fatalf("Exiting")
	case <-ch:
		glog.Fatalf("Received shutdown signal twice, exiting")
