From 440dac925d8e80573a5bbec2632a7f4d8fd6d031 Mon Sep 17 00:00:00 2001
From: "W. Trevor King"
Date: Wed, 17 Apr 2019 09:35:41 -0700
Subject: [PATCH] pkg/start/start: Drop bootstrapPodsRunningTimeout

And plumb through contexts from runCmdStart so we can drop the
context.TODO() calls.

bootstrapPodsRunningTimeout was added in d07548e31c (Add
--tear-down-event flag to delay tear down, 2019-01-24, #9), although
Stefan had no strong opinion on them at the time [1]. But as it
stands, a hung pod creates loops like [2]:

  $ tar xf log-bundle.tar.gz
  $ cd bootstrap/journals
  $ grep 'Started Bootstrap\|Error: error while checking pod status' bootkube.log
  Apr 16 17:46:23 ip-10-0-4-87 systemd[1]: Started Bootstrap a Kubernetes cluster.
  Apr 16 18:12:41 ip-10-0-4-87 bootkube.sh[1510]: Error: error while checking pod status: timed out waiting for the condition
  Apr 16 18:12:41 ip-10-0-4-87 bootkube.sh[1510]: Error: error while checking pod status: timed out waiting for the condition
  Apr 16 18:12:46 ip-10-0-4-87 systemd[1]: Started Bootstrap a Kubernetes cluster.
  Apr 16 18:33:02 ip-10-0-4-87 bootkube.sh[11418]: Error: error while checking pod status: timed out waiting for the condition
  Apr 16 18:33:02 ip-10-0-4-87 bootkube.sh[11418]: Error: error while checking pod status: timed out waiting for the condition
  Apr 16 18:33:07 ip-10-0-4-87 systemd[1]: Started Bootstrap a Kubernetes cluster.

Instead of having systemd keep kicking bootkube.sh (which in turn
keeps launching cluster-bootstrap), removing this timeout will just
leave cluster-bootstrap running while folks gather logs from the
broken cluster. And the less spurious-restart noise there is in those
logs, the easier it will be to find what actually broke.

[1]: https://github.com/openshift/cluster-bootstrap/pull/9#discussion_r250929330
[2]: https://bugzilla.redhat.com/show_bug.cgi?id=1700504#c14
---
 cmd/cluster-bootstrap/start.go |  5 ++++-
 pkg/start/start.go             | 17 ++++++-----------
 2 files changed, 10 insertions(+), 12 deletions(-)

diff --git a/cmd/cluster-bootstrap/start.go b/cmd/cluster-bootstrap/start.go
index 3934f7e65..bbee1e379 100644
--- a/cmd/cluster-bootstrap/start.go
+++ b/cmd/cluster-bootstrap/start.go
@@ -1,6 +1,7 @@
 package main
 
 import (
+	"context"
 	"errors"
 	"strings"
 
@@ -47,6 +48,8 @@ func init() {
 }
 
 func runCmdStart(cmd *cobra.Command, args []string) error {
+	ctx := context.Background()
+
 	podPrefixes, err := parsePodPrefixes(startOpts.requiredPodClauses)
 	if err != nil {
 		return err
@@ -64,7 +67,7 @@ func runCmdStart(cmd *cobra.Command, args []string) error {
 		return err
 	}
 
-	return bk.Run()
+	return bk.Run(ctx)
 }
 
 // parsePodPrefixes parses <ns>/<pod-prefix> or <name>:<ns>/<pod-prefix>|... into a map with
diff --git a/pkg/start/start.go b/pkg/start/start.go
index d9e473dd7..664af148c 100644
--- a/pkg/start/start.go
+++ b/pkg/start/start.go
@@ -22,8 +22,6 @@ import (
 )
 
 const (
-	// how long we wait until the bootstrap pods to be running
-	bootstrapPodsRunningTimeout = 20 * time.Minute
 	// how long we wait until the assets must all be created
 	assetsCreatedTimeout = 60 * time.Minute
 )
@@ -57,7 +55,7 @@ func NewStartCommand(config Config) (*startCommand, error) {
 	}, nil
 }
 
-func (b *startCommand) Run() error {
+func (b *startCommand) Run(ctx context.Context) error {
 	restConfig, err := clientcmd.BuildConfigFromFlags("", filepath.Join(b.assetDir, assetPathAdminKubeConfig))
 	if err != nil {
 		return err
@@ -121,13 +119,12 @@ func (b *startCommand) Run() error {
 		}()
 		return &done
 	}
-	ctx, cancel := context.WithTimeout(context.TODO(), bootstrapPodsRunningTimeout)
+	assetContext, cancel := context.WithTimeout(ctx, assetsCreatedTimeout)
 	defer cancel()
-	assetsDone := createAssetsInBackground(ctx, cancel, localClientConfig)
+	assetsDone := createAssetsInBackground(assetContext, cancel, localClientConfig)
 	if err = waitUntilPodsRunning(ctx, client, b.requiredPodPrefixes); err != nil {
 		return err
 	}
-	cancel()
 	assetsDone.Wait()
 
 	// notify installer that we are ready to tear down the temporary bootstrap control plane
@@ -137,14 +134,12 @@ func (b *startCommand) Run() error {
 	}
 
 	// continue with assets
-	ctx, cancel = context.WithTimeout(context.Background(), assetsCreatedTimeout)
-	defer cancel()
 	if b.earlyTearDown {
 		// switch over to ELB client and continue with the assets
-		assetsDone = createAssetsInBackground(ctx, cancel, restConfig)
+		assetsDone = createAssetsInBackground(assetContext, cancel, restConfig)
 	} else {
 		// we don't tear down the local control plane early. So we can keep using it and enjoy the speed up.
-		assetsDone = createAssetsInBackground(ctx, cancel, localClientConfig)
+		assetsDone = createAssetsInBackground(assetContext, cancel, localClientConfig)
 	}
 
 	// optionally wait for tear down event coming from the installer. This is necessary to
@@ -155,7 +150,7 @@ func (b *startCommand) Run() error {
 			return fmt.Errorf("tear down event name of format <namespace>/<name> expected, got: %q", b.waitForTearDownEvent)
 		}
 		ns, name := ss[0], ss[1]
-		if err := waitForEvent(context.TODO(), client, ns, name); err != nil {
+		if err := waitForEvent(ctx, client, ns, name); err != nil {
			return err
		}
		UserOutput("Got %s event.", b.waitForTearDownEvent)
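The shape of the change above is the usual Go context-plumbing pattern: the
command entry point owns a context.Background(), helpers take a ctx argument
instead of calling context.TODO(), and the only remaining deadline is the one
wrapped around asset creation. What follows is a minimal, self-contained
sketch of that pattern, not code from this repository: runStart, createAssets,
and waitUntilPodsRunning are illustrative stand-ins, and the real Run() also
handles client setup, tear-down events, and the early tear-down switch.

package main

import (
	"context"
	"fmt"
	"time"
)

// waitUntilPodsRunning stands in for the pod-readiness poll. It honors ctx
// cancellation but, as in the patch, has no timeout of its own: a hung pod
// keeps the process waiting instead of triggering systemd restart loops.
func waitUntilPodsRunning(ctx context.Context) error {
	select {
	case <-time.After(2 * time.Second): // pretend the required pods came up
		return nil
	case <-ctx.Done():
		return ctx.Err()
	}
}

// createAssets stands in for asset creation, the one step that keeps a
// deadline (assetsCreatedTimeout in the patch).
func createAssets(ctx context.Context) error {
	select {
	case <-time.After(1 * time.Second): // pretend the assets were created
		return nil
	case <-ctx.Done():
		return ctx.Err()
	}
}

// runStart mirrors the plumbing: the caller-owned ctx flows into every
// helper, and only asset creation gets a derived timeout context.
func runStart(ctx context.Context) error {
	assetCtx, cancel := context.WithTimeout(ctx, 60*time.Minute)
	defer cancel()

	// Create assets in the background, as createAssetsInBackground does.
	assetsDone := make(chan error, 1)
	go func() { assetsDone <- createAssets(assetCtx) }()

	// Wait for the required pods without a dedicated timeout; only
	// cancellation of ctx by the caller can interrupt the wait.
	if err := waitUntilPodsRunning(ctx); err != nil {
		return err
	}
	return <-assetsDone
}

func main() {
	if err := runStart(context.Background()); err != nil {
		fmt.Println("error:", err)
	}
}

Because the context now originates in one place, a later change could swap
context.Background() for a cancellable or signal-aware context at that single
call site rather than hunting down scattered context.TODO() calls.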