pkg/start: Release leader lease on graceful shutdown

So the incoming cluster-version operator doesn't need to wait for the outgoing operator's lease to expire, which can take a while [1]: I0802 10:06:01.056591 1 leaderelection.go:243] attempting to acquire leader lease openshift-cluster-version/version... ... I0802 10:07:42.632719 1 leaderelection.go:253] successfully acquired lease openshift-cluster-version/version and time out the: Cluster did not acknowledge request to upgrade in a reasonable time testcase [2]. Using ReleaseOnCancel has been the plan since 2b81f47 (cvo: Release our leader lease when we are gracefully terminated, 2019-01-16, openshift#87). I'm not clear on why it (sometimes?) doesn't work today. The discrepancy between the "exit after 2s no matter what" comment and the 5s After dates back to dbedb7a (cvo: When the CVO restarts, perform one final sync to write status, 2019-04-27, openshift#179), which bumped the After from 2s to 5s, but forgot to bump the comment. I'm removing that code here in favor of the two-minute timeout from b30aa0e (pkg/cvo/metrics: Graceful server shutdown, 2020-04-15, openshift#349). We still exit immediately on a second TERM, for folks who get impatient waiting for the graceful timeout. I'm also pushing the Background() initialization all the way up to the command-line handler, to make it more obvious that the context is scoped to the whole 'start' invocation. [1]: https://storage.googleapis.com/origin-ci-test/pr-logs/pull/25365/pull-ci-openshift-origin-master-e2e-gcp-upgrade/1289853267223777280/artifacts/e2e-gcp-upgrade/pods/openshift-cluster-version_cluster-version-operator-5b6ff896c6-57ppb_cluster-version-operator.log [2]: https://bugzilla.redhat.com/show_bug.cgi?id=1843505#c7
wking · Aug 4, 2020 · 6438ffc · 6438ffc
1 parent ed864d6
commit 6438ffc
Show file tree

Hide file tree

Showing 2 changed files with 26 additions and 63 deletions.
diff --git a/cmd/start.go b/cmd/start.go
@@ -1,6 +1,8 @@
 package main
 
 import (
+	"context"
+
 	"github.com/spf13/cobra"
 	"k8s.io/klog"
 
@@ -18,9 +20,10 @@ func init() {
 			// To help debugging, immediately log version
 			klog.Infof("%s", version.String)
 
-			if err := opts.Run(); err != nil {
+			if err := opts.Run(context.Background()); err != nil {
 				klog.Fatalf("error: %v", err)
 			}
+			klog.Info("Graceful shutdown complete.")
 		},
 	}
 

diff --git a/pkg/start/start.go b/pkg/start/start.go
@@ -97,7 +97,7 @@ func NewOptions() *Options {
 	}
 }
 
-func (o *Options) Run() error {
+func (o *Options) Run(ctx context.Context) error {
 	if o.NodeName == "" {
 		return fmt.Errorf("node-name is required")
 	}
@@ -133,29 +133,6 @@ func (o *Options) Run() error {
 		return err
 	}
 
-	// TODO: Kube 1.14 will contain a ReleaseOnCancel boolean on
-	//   LeaderElectionConfig that allows us to have the lock code
-	//   release the lease when this context is cancelled. At that
-	//   time we can remove our changes to OnStartedLeading.
-	ctx, cancel := context.WithCancel(context.Background())
-	defer cancel()
-	ch := make(chan os.Signal, 1)
-	defer func() { signal.Stop(ch) }()
-	signal.Notify(ch, os.Interrupt, syscall.SIGTERM)
-	go func() {
-		sig := <-ch
-		klog.Infof("Shutting down due to %s", sig)
-		cancel()
-
-		// exit after 2s no matter what
-		select {
-		case <-time.After(5 * time.Second):
-			klog.Fatalf("Exiting")
-		case <-ch:
-			klog.Fatalf("Received shutdown signal twice, exiting")
-		}
-	}()
-
 	o.run(ctx, controllerCtx, lock)
 	return nil
 }
@@ -187,6 +164,18 @@ func (o *Options) run(ctx context.Context, controllerCtx *Context, lock *resourc
 	defer runCancel()
 	shutdownContext, shutdownCancel := context.WithCancel(ctx)
 	defer shutdownCancel()
+
+	ch := make(chan os.Signal, 1)
+	defer func() { signal.Stop(ch) }()
+	signal.Notify(ch, os.Interrupt, syscall.SIGTERM)
+	go func() {
+		sig := <-ch
+		klog.Infof("Shutting down due to %s", sig)
+		runCancel()
+		sig <- ch
+		klog.Fatalf("Received shutdown signal twice, exiting: %s", sig)
+	}()
+
 	errorChannel := make(chan error, 1)
 	errorChannelCount := 0
 	if o.ListenAddr != "" {
@@ -204,46 +193,19 @@ func (o *Options) run(ctx context.Context, controllerCtx *Context, lock *resourc
 		}()
 	}
 
-	exit := make(chan struct{})
-	exitClose := sync.Once{}
-
-	// TODO: when we switch to graceful lock shutdown, this can be
-	// moved back inside RunOrDie
-	// TODO: properly wire ctx here
-	go leaderelection.RunOrDie(context.TODO(), leaderelection.LeaderElectionConfig{
-		Lock:          lock,
-		LeaseDuration: leaseDuration,
-		RenewDeadline: renewDeadline,
-		RetryPeriod:   retryPeriod,
+	go leaderelection.RunOrDie(runContext, leaderelection.LeaderElectionConfig{
+		Lock:            lock,
+		ReleaseOnCancel: true,
+		LeaseDuration:   leaseDuration,
+		RenewDeadline:   renewDeadline,
+		RetryPeriod:     retryPeriod,
 		Callbacks: leaderelection.LeaderCallbacks{
 			OnStartedLeading: func(localCtx context.Context) {
 				controllerCtx.Start(runContext)
-				select {
-				case <-runContext.Done():
-					// WARNING: this is not completely safe until we have Kube 1.14 and ReleaseOnCancel
-					//   and client-go ContextCancelable, which allows us to block new API requests before
-					//   we step down. However, the CVO isn't that sensitive to races and can tolerate
-					//   brief overlap.
-					klog.Infof("Stepping down as leader")
-					// give the controllers some time to shut down
-					time.Sleep(100 * time.Millisecond)
-					// if we still hold the leader lease, clear the owner identity (other lease watchers
-					// still have to wait for expiration) like the new ReleaseOnCancel code will do.
-					if err := lock.Update(localCtx, resourcelock.LeaderElectionRecord{}); err == nil {
-						// if we successfully clear the owner identity, we can safely delete the record
-						if err := lock.Client.ConfigMaps(lock.ConfigMapMeta.Namespace).Delete(localCtx, lock.ConfigMapMeta.Name, metav1.DeleteOptions{}); err != nil {
-							klog.Warningf("Unable to step down cleanly: %v", err)
-						}
-					}
-					klog.Infof("Finished shutdown")
-					exitClose.Do(func() { close(exit) })
-				case <-localCtx.Done():
-					// we will exit in OnStoppedLeading
-				}
 			},
 			OnStoppedLeading: func() {
-				klog.Warning("leaderelection lost")
-				exitClose.Do(func() { close(exit) })
+				klog.Info("Stopped leading; shutting down.")
+				runCancel()
 			},
 		},
 	})
@@ -270,13 +232,11 @@ func (o *Options) run(ctx context.Context, controllerCtx *Context, lock *resourc
 				errorChannelCount--
 				if err != nil {
 					klog.Error(err)
-					runCancel()
 				}
 			}
 		}
 	}
-
-	<-exit
+	klog.Info("Finished collecting operator goroutines.")
 }
 
 // createResourceLock initializes the lock.