From 440dac925d8e80573a5bbec2632a7f4d8fd6d031 Mon Sep 17 00:00:00 2001
From: "W. Trevor King"
Date: Wed, 17 Apr 2019 09:35:41 -0700
Subject: [PATCH] pkg/start/start: Drop bootstrapPodsRunningTimeout

And plumb through contexts from runCmdStart so we can drop the
context.TODO() calls.

bootstrapPodsRunningTimeout was added in d07548e31c (Add
--tear-down-event flag to delay tear down, 2019-01-24, #9), although
Stefan had no strong opinion on them at the time [1]. But as it
stands, a hung pod creates loops like [2]:

  $ tar xf log-bundle.tar.gz
  $ cd bootstrap/journals
  $ grep 'Started Bootstrap\|Error: error while checking pod status' bootkube.log
  Apr 16 17:46:23 ip-10-0-4-87 systemd[1]: Started Bootstrap a Kubernetes cluster.
  Apr 16 18:12:41 ip-10-0-4-87 bootkube.sh[1510]: Error: error while checking pod status: timed out waiting for the condition
  Apr 16 18:12:41 ip-10-0-4-87 bootkube.sh[1510]: Error: error while checking pod status: timed out waiting for the condition
  Apr 16 18:12:46 ip-10-0-4-87 systemd[1]: Started Bootstrap a Kubernetes cluster.
  Apr 16 18:33:02 ip-10-0-4-87 bootkube.sh[11418]: Error: error while checking pod status: timed out waiting for the condition
  Apr 16 18:33:02 ip-10-0-4-87 bootkube.sh[11418]: Error: error while checking pod status: timed out waiting for the condition
  Apr 16 18:33:07 ip-10-0-4-87 systemd[1]: Started Bootstrap a Kubernetes cluster.

Instead of having systemd keep kicking bootkube.sh (which in turn
keeps launching cluster-bootstrap), removing this timeout will just
leave cluster-bootstrap running while folks gather logs from the
broken cluster. And the less spurious-restart noise there is in those
logs, the easier it will be to find what actually broke.

[1]: https://github.com/openshift/cluster-bootstrap/pull/9#discussion_r250929330
[2]: https://bugzilla.redhat.com/show_bug.cgi?id=1700504#c14
---
 cmd/cluster-bootstrap/start.go |  5 ++++-
 pkg/start/start.go             | 17 ++++++-----------
 2 files changed, 10 insertions(+), 12 deletions(-)

diff --git a/cmd/cluster-bootstrap/start.go b/cmd/cluster-bootstrap/start.go
index 3934f7e65..bbee1e379 100644
--- a/cmd/cluster-bootstrap/start.go
+++ b/cmd/cluster-bootstrap/start.go
@@ -1,6 +1,7 @@
 package main
 
 import (
+	"context"
 	"errors"
 	"strings"
 
@@ -47,6 +48,8 @@ func init() {
 }
 
 func runCmdStart(cmd *cobra.Command, args []string) error {
+	ctx := context.Background()
+
 	podPrefixes, err := parsePodPrefixes(startOpts.requiredPodClauses)
 	if err != nil {
 		return err
@@ -64,7 +67,7 @@ func runCmdStart(cmd *cobra.Command, args []string) error {
 		return err
 	}
 
-	return bk.Run()
+	return bk.Run(ctx)
 }
 
 // parsePodPrefixes parses <ns>/<pod-prefix> or <name>:<ns>/<pod-prefix>|... into a map with
diff --git a/pkg/start/start.go b/pkg/start/start.go
index d9e473dd7..664af148c 100644
--- a/pkg/start/start.go
+++ b/pkg/start/start.go
@@ -22,8 +22,6 @@ import (
 )
 
 const (
-	// how long we wait until the bootstrap pods to be running
-	bootstrapPodsRunningTimeout = 20 * time.Minute
 	// how long we wait until the assets must all be created
 	assetsCreatedTimeout = 60 * time.Minute
 )
@@ -57,7 +55,7 @@ func NewStartCommand(config Config) (*startCommand, error) {
 	}, nil
 }
 
-func (b *startCommand) Run() error {
+func (b *startCommand) Run(ctx context.Context) error {
 	restConfig, err := clientcmd.BuildConfigFromFlags("", filepath.Join(b.assetDir, assetPathAdminKubeConfig))
 	if err != nil {
 		return err
@@ -121,13 +119,12 @@ func (b *startCommand) Run() error {
 		}()
 		return &done
 	}
-	ctx, cancel := context.WithTimeout(context.TODO(), bootstrapPodsRunningTimeout)
+	assetContext, cancel := context.WithTimeout(ctx, assetsCreatedTimeout)
 	defer cancel()
-	assetsDone := createAssetsInBackground(ctx, cancel, localClientConfig)
+	assetsDone := createAssetsInBackground(assetContext, cancel, localClientConfig)
 	if err = waitUntilPodsRunning(ctx, client, b.requiredPodPrefixes); err != nil {
 		return err
 	}
-	cancel()
 	assetsDone.Wait()
 
 	// notify installer that we are ready to tear down the temporary bootstrap control plane
@@ -137,14 +134,12 @@ func (b *startCommand) Run() error {
 	}
 
 	// continue with assets
-	ctx, cancel = context.WithTimeout(context.Background(), assetsCreatedTimeout)
-	defer cancel()
 	if b.earlyTearDown {
 		// switch over to ELB client and continue with the assets
-		assetsDone = createAssetsInBackground(ctx, cancel, restConfig)
+		assetsDone = createAssetsInBackground(assetContext, cancel, restConfig)
 	} else {
 		// we don't tear down the local control plane early. So we can keep using it and enjoy the speed up.
-		assetsDone = createAssetsInBackground(ctx, cancel, localClientConfig)
+		assetsDone = createAssetsInBackground(assetContext, cancel, localClientConfig)
 	}
 
 	// optionally wait for tear down event coming from the installer. This is necessary to
@@ -155,7 +150,7 @@ func (b *startCommand) Run() error {
 			return fmt.Errorf("tear down event name of format <namespace>/<name> expected, got: %q", b.waitForTearDownEvent)
 		}
 		ns, name := ss[0], ss[1]
-		if err := waitForEvent(context.TODO(), client, ns, name); err != nil {
+		if err := waitForEvent(ctx, client, ns, name); err != nil {
			return err
		}
		UserOutput("Got %s event.", b.waitForTearDownEvent)
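The shape of the change above is the usual Go context-plumbing pattern: the
command entry point owns a context.Background(), helpers take a ctx argument
instead of calling context.TODO(), and the only remaining deadline is the one
wrapped around asset creation. What follows is a minimal, self-contained
sketch of that pattern, not code from this repository: runStart, createAssets,
and waitUntilPodsRunning are illustrative stand-ins, and the real Run() also
handles client setup, tear-down events, and the early tear-down switch.

package main

import (
	"context"
	"fmt"
	"time"
)

// waitUntilPodsRunning stands in for the pod-readiness poll. It honors ctx
// cancellation but, as in the patch, has no timeout of its own: a hung pod
// keeps the process waiting instead of triggering systemd restart loops.
func waitUntilPodsRunning(ctx context.Context) error {
	select {
	case <-time.After(2 * time.Second): // pretend the required pods came up
		return nil
	case <-ctx.Done():
		return ctx.Err()
	}
}

// createAssets stands in for asset creation, the one step that keeps a
// deadline (assetsCreatedTimeout in the patch).
func createAssets(ctx context.Context) error {
	select {
	case <-time.After(1 * time.Second): // pretend the assets were created
		return nil
	case <-ctx.Done():
		return ctx.Err()
	}
}

// runStart mirrors the plumbing: the caller-owned ctx flows into every
// helper, and only asset creation gets a derived timeout context.
func runStart(ctx context.Context) error {
	assetCtx, cancel := context.WithTimeout(ctx, 60*time.Minute)
	defer cancel()

	// Create assets in the background, as createAssetsInBackground does.
	assetsDone := make(chan error, 1)
	go func() { assetsDone <- createAssets(assetCtx) }()

	// Wait for the required pods without a dedicated timeout; only
	// cancellation of ctx by the caller can interrupt the wait.
	if err := waitUntilPodsRunning(ctx); err != nil {
		return err
	}
	return <-assetsDone
}

func main() {
	if err := runStart(context.Background()); err != nil {
		fmt.Println("error:", err)
	}
}

Because the context now originates in one place, a later change could swap
context.Background() for a cancellable or signal-aware context at that single
call site rather than hunting down scattered context.TODO() calls.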