Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding 'wait not ready' to prevent premature upgrade moves #1542

Merged
merged 5 commits into from
Mar 28, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 21 additions & 10 deletions pkg/clustermanager/cluster_manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import (
"errors"
"fmt"
"reflect"
"strings"
"time"

eksdv1alpha1 "github.com/aws/eks-distro-build-tooling/release/api/v1alpha1"
Expand All @@ -29,15 +30,16 @@ import (
)

const (
maxRetries = 30
backOffPeriod = 5 * time.Second
machineMaxWait = 10 * time.Minute
machineBackoff = 1 * time.Second
machinesMinWait = 30 * time.Minute
moveCAPIWait = 15 * time.Minute
ctrlPlaneWaitStr = "60m"
etcdWaitStr = "60m"
deploymentWaitStr = "30m"
maxRetries = 30
backOffPeriod = 5 * time.Second
machineMaxWait = 10 * time.Minute
machineBackoff = 1 * time.Second
machinesMinWait = 30 * time.Minute
moveCAPIWait = 15 * time.Minute
ctrlPlaneWaitStr = "60m"
etcdWaitStr = "60m"
deploymentWaitStr = "30m"
ctrlPlaneInProgressStr = "1m"
)

type ClusterManager struct {
Expand All @@ -59,6 +61,7 @@ type ClusterClient interface {
ApplyKubeSpecFromBytesWithNamespace(ctx context.Context, cluster *types.Cluster, data []byte, namespace string) error
ApplyKubeSpecFromBytesForce(ctx context.Context, cluster *types.Cluster, data []byte) error
WaitForControlPlaneReady(ctx context.Context, cluster *types.Cluster, timeout string, newClusterName string) error
WaitForControlPlaneNotReady(ctx context.Context, cluster *types.Cluster, timeout string, newClusterName string) error
WaitForManagedExternalEtcdReady(ctx context.Context, cluster *types.Cluster, timeout string, newClusterName string) error
GetWorkloadKubeconfig(ctx context.Context, clusterName string, cluster *types.Cluster) ([]byte, error)
GetEksaGitOpsConfig(ctx context.Context, gitOpsConfigName string, kubeconfigFile string, namespace string) (*v1alpha1.GitOpsConfig, error)
Expand Down Expand Up @@ -345,7 +348,6 @@ func (c *ClusterManager) UpgradeCluster(ctx context.Context, managementCluster,
if err = c.writeCAPISpecFile(newClusterSpec.Cluster.Name, templater.AppendYamlResources(cpContent, mdContent)); err != nil {
return err
}

err = c.Retrier.Retry(
func() error {
return c.clusterClient.ApplyKubeSpecFromBytesWithNamespace(ctx, managementCluster, cpContent, constants.EksaSystemNamespace)
Expand All @@ -365,6 +367,15 @@ func (c *ClusterManager) UpgradeCluster(ctx context.Context, managementCluster,
logger.V(3).Info("External etcd is ready")
}

logger.V(3).Info("Waiting for control plane upgrade to be in progress")
err = c.clusterClient.WaitForControlPlaneNotReady(ctx, managementCluster, ctrlPlaneInProgressStr, newClusterSpec.Cluster.Name)
if err != nil {
if !strings.Contains(fmt.Sprint(err), "timed out waiting for the condition on clusters") {
return fmt.Errorf("error waiting for control plane not ready: %v", err)
} else {
logger.V(3).Info("Timed out while waiting for control plane to be in progress, likely caused by no control plane upgrade")
}
}
logger.V(3).Info("Run post control plane upgrade operations")
err = provider.RunPostControlPlaneUpgrade(ctx, currentSpec, newClusterSpec, workloadCluster, managementCluster)
if err != nil {
Expand Down
4 changes: 4 additions & 0 deletions pkg/clustermanager/cluster_manager_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -479,6 +479,7 @@ func TestClusterManagerUpgradeWorkloadClusterSuccess(t *testing.T) {
tt.mocks.client.EXPECT().ApplyKubeSpecFromBytesWithNamespace(tt.ctx, mCluster, test.OfType("[]uint8"), constants.EksaSystemNamespace).Times(2)
tt.mocks.provider.EXPECT().RunPostControlPlaneUpgrade(tt.ctx, tt.clusterSpec, tt.clusterSpec, wCluster, mCluster)
tt.mocks.client.EXPECT().WaitForControlPlaneReady(tt.ctx, mCluster, "60m", clusterName).MaxTimes(2)
tt.mocks.client.EXPECT().WaitForControlPlaneNotReady(tt.ctx, mCluster, "1m", clusterName)
tt.mocks.client.EXPECT().GetMachines(tt.ctx, mCluster, mCluster.Name).Return([]types.Machine{}, nil).Times(2)
tt.mocks.provider.EXPECT().MachineDeploymentsToDelete(wCluster, tt.clusterSpec, tt.clusterSpec.DeepCopy()).Return([]string{})
tt.mocks.client.EXPECT().WaitForDeployment(tt.ctx, wCluster, "30m", "Available", gomock.Any(), gomock.Any()).MaxTimes(10)
Expand Down Expand Up @@ -513,6 +514,7 @@ func TestClusterManagerUpgradeWorkloadClusterWaitForMachinesTimeout(t *testing.T
tt.mocks.client.EXPECT().ApplyKubeSpecFromBytesWithNamespace(ctx, mCluster, test.OfType("[]uint8"), constants.EksaSystemNamespace)
tt.mocks.provider.EXPECT().RunPostControlPlaneUpgrade(tt.ctx, tt.clusterSpec, tt.clusterSpec, wCluster, mCluster)
tt.mocks.client.EXPECT().WaitForControlPlaneReady(ctx, mCluster, "60m", clusterName)
tt.mocks.client.EXPECT().WaitForControlPlaneNotReady(tt.ctx, mCluster, "1m", clusterName)
tt.mocks.writer.EXPECT().Write(clusterName+"-eks-a-cluster.yaml", gomock.Any(), gomock.Not(gomock.Nil()))
// Fail once
tt.mocks.client.EXPECT().GetMachines(ctx, mCluster, mCluster.Name).Times(1).Return(nil, errors.New("error get machines"))
Expand Down Expand Up @@ -556,6 +558,7 @@ func TestClusterManagerCreateWorkloadClusterWaitForMachinesFailedWithUnhealthyNo
tt.mocks.client.EXPECT().ApplyKubeSpecFromBytesWithNamespace(tt.ctx, mCluster, test.OfType("[]uint8"), constants.EksaSystemNamespace)
tt.mocks.provider.EXPECT().RunPostControlPlaneUpgrade(tt.ctx, tt.clusterSpec, tt.clusterSpec, wCluster, mCluster)
tt.mocks.client.EXPECT().WaitForControlPlaneReady(tt.ctx, mCluster, "60m", clusterName).MaxTimes(5)
tt.mocks.client.EXPECT().WaitForControlPlaneNotReady(tt.ctx, mCluster, "1m", clusterName)
tt.mocks.client.EXPECT().WaitForDeployment(tt.ctx, wCluster, "30m", "Available", gomock.Any(), gomock.Any()).MaxTimes(10)
tt.mocks.writer.EXPECT().Write(clusterName+"-eks-a-cluster.yaml", gomock.Any(), gomock.Not(gomock.Nil()))
// Return a machine with no nodeRef the rest of the retries
Expand Down Expand Up @@ -584,6 +587,7 @@ func TestClusterManagerUpgradeWorkloadClusterWaitForCAPITimeout(t *testing.T) {
tt.mocks.client.EXPECT().ApplyKubeSpecFromBytesWithNamespace(tt.ctx, mCluster, test.OfType("[]uint8"), constants.EksaSystemNamespace).Times(2)
tt.mocks.provider.EXPECT().RunPostControlPlaneUpgrade(tt.ctx, tt.clusterSpec, tt.clusterSpec, wCluster, mCluster)
tt.mocks.client.EXPECT().WaitForControlPlaneReady(tt.ctx, mCluster, "60m", clusterName).MaxTimes(2)
tt.mocks.client.EXPECT().WaitForControlPlaneNotReady(tt.ctx, mCluster, "1m", clusterName)
tt.mocks.client.EXPECT().GetMachines(tt.ctx, mCluster, mCluster.Name).Return([]types.Machine{}, nil).Times(2)
tt.mocks.provider.EXPECT().MachineDeploymentsToDelete(wCluster, tt.clusterSpec, tt.clusterSpec.DeepCopy()).Return([]string{})
tt.mocks.client.EXPECT().WaitForDeployment(tt.ctx, wCluster, "30m", "Available", gomock.Any(), gomock.Any()).Return(errors.New("time out"))
Expand Down
14 changes: 14 additions & 0 deletions pkg/clustermanager/mocks/client_and_networking.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

9 changes: 4 additions & 5 deletions pkg/executables/executables.go
Original file line number Diff line number Diff line change
Expand Up @@ -84,18 +84,17 @@ func execute(ctx context.Context, cli string, in []byte, args ...string) (stdout
cmd := exec.CommandContext(ctx, cli, args...)
logger.V(6).Info("Executing command", "cmd", redactCreds(cmd.String()))
cmd.Stdout = &stdout
if logger.MaxLogging() {
cmd.Stderr = os.Stderr
} else {
cmd.Stderr = &stderr
}
cmd.Stderr = &stderr
if len(in) != 0 {
cmd.Stdin = bytes.NewReader(in)
}

err = cmd.Run()
if err != nil {
if stderr.Len() > 0 {
if logger.MaxLogging() {
logger.V(logger.MaxLoggingLevel()).Info(cli, "stderr", stderr.String())
}
return stdout, errors.New(stderr.String())
} else {
if !logger.MaxLogging() {
Expand Down
4 changes: 4 additions & 0 deletions pkg/executables/kubectl.go
Original file line number Diff line number Diff line change
Expand Up @@ -262,6 +262,10 @@ func (k *Kubectl) WaitForControlPlaneReady(ctx context.Context, cluster *types.C
return k.Wait(ctx, cluster.KubeconfigFile, timeout, "ControlPlaneReady", fmt.Sprintf("%s/%s", capiClustersResourceType, newClusterName), constants.EksaSystemNamespace)
}

// WaitForControlPlaneNotReady delegates to Wait, asking it to wait for the
// condition "ControlPlaneReady=false" on the resource
// "<capiClustersResourceType>/<newClusterName>" in
// constants.EksaSystemNamespace. It uses the kubeconfig file of cluster and
// passes timeout through unchanged. The error returned by Wait, if any, is
// returned as-is.
func (k *Kubectl) WaitForControlPlaneNotReady(ctx context.Context, cluster *types.Cluster, timeout string, newClusterName string) error {
	const condition = "ControlPlaneReady=false"
	resource := fmt.Sprintf("%s/%s", capiClustersResourceType, newClusterName)
	return k.Wait(ctx, cluster.KubeconfigFile, timeout, condition, resource, constants.EksaSystemNamespace)
}

// WaitForManagedExternalEtcdReady delegates to Wait, asking it to wait for the
// condition "ManagedEtcdReady" on the resource
// "clusters.<clusterv1 API group>/<newClusterName>" in
// constants.EksaSystemNamespace. It uses the kubeconfig file of cluster and
// passes timeout through unchanged. The error returned by Wait, if any, is
// returned as-is.
func (k *Kubectl) WaitForManagedExternalEtcdReady(ctx context.Context, cluster *types.Cluster, timeout string, newClusterName string) error {
	const condition = "ManagedEtcdReady"
	resource := fmt.Sprintf("clusters.%s/%s", clusterv1.GroupVersion.Group, newClusterName)
	return k.Wait(ctx, cluster.KubeconfigFile, timeout, condition, resource, constants.EksaSystemNamespace)
}
Expand Down
4 changes: 4 additions & 0 deletions pkg/logger/logger.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,10 @@ func MaxLogging() bool {
return l.V(maxLogging).Enabled()
}

// MaxLoggingLevel returns the package's maxLogging verbosity level, the same
// level MaxLogging checks for being enabled. Callers can pass it to V to emit
// a message only at that verbosity.
func MaxLoggingLevel() int {
	var level int = maxLogging
	return level
}

// Fatal is equivalent to Get().Error() followed by a call to os.Exit(1).
func Fatal(err error, msg string) {
l.Error(err, msg)
Expand Down