diff --git a/pkg/clustermanager/cluster_manager.go b/pkg/clustermanager/cluster_manager.go
index 64ca06bd6a19..96509df1e7c7 100644
--- a/pkg/clustermanager/cluster_manager.go
+++ b/pkg/clustermanager/cluster_manager.go
@@ -6,6 +6,7 @@ import (
 	"errors"
 	"fmt"
 	"reflect"
+	"strings"
 	"time"
 
 	eksdv1alpha1 "github.com/aws/eks-distro-build-tooling/release/api/v1alpha1"
@@ -29,15 +30,16 @@ import (
 )
 
 const (
-	maxRetries        = 30
-	backOffPeriod     = 5 * time.Second
-	machineMaxWait    = 10 * time.Minute
-	machineBackoff    = 1 * time.Second
-	machinesMinWait   = 30 * time.Minute
-	moveCAPIWait      = 15 * time.Minute
-	ctrlPlaneWaitStr  = "60m"
-	etcdWaitStr       = "60m"
-	deploymentWaitStr = "30m"
+	maxRetries             = 30
+	backOffPeriod          = 5 * time.Second
+	machineMaxWait         = 10 * time.Minute
+	machineBackoff         = 1 * time.Second
+	machinesMinWait        = 30 * time.Minute
+	moveCAPIWait           = 15 * time.Minute
+	ctrlPlaneWaitStr       = "60m"
+	etcdWaitStr            = "60m"
+	deploymentWaitStr      = "30m"
+	ctrlPlaneInProgressStr = "1m"
 )
 
 type ClusterManager struct {
@@ -59,6 +61,7 @@ type ClusterClient interface {
 	ApplyKubeSpecFromBytesWithNamespace(ctx context.Context, cluster *types.Cluster, data []byte, namespace string) error
 	ApplyKubeSpecFromBytesForce(ctx context.Context, cluster *types.Cluster, data []byte) error
 	WaitForControlPlaneReady(ctx context.Context, cluster *types.Cluster, timeout string, newClusterName string) error
+	WaitForControlPlaneNotReady(ctx context.Context, cluster *types.Cluster, timeout string, newClusterName string) error
 	WaitForManagedExternalEtcdReady(ctx context.Context, cluster *types.Cluster, timeout string, newClusterName string) error
 	GetWorkloadKubeconfig(ctx context.Context, clusterName string, cluster *types.Cluster) ([]byte, error)
 	GetEksaGitOpsConfig(ctx context.Context, gitOpsConfigName string, kubeconfigFile string, namespace string) (*v1alpha1.GitOpsConfig, error)
@@ -345,7 +348,6 @@ func (c *ClusterManager) UpgradeCluster(ctx context.Context, managementCluster,
 	if err = c.writeCAPISpecFile(newClusterSpec.Cluster.Name, templater.AppendYamlResources(cpContent, mdContent)); err != nil {
 		return err
 	}
-
 	err = c.Retrier.Retry(
 		func() error {
 			return c.clusterClient.ApplyKubeSpecFromBytesWithNamespace(ctx, managementCluster, cpContent, constants.EksaSystemNamespace)
@@ -365,6 +367,14 @@ func (c *ClusterManager) UpgradeCluster(ctx context.Context, managementCluster,
 		logger.V(3).Info("External etcd is ready")
 	}
 
+	logger.V(3).Info("Waiting for control plane upgrade to be in progress")
+	err = c.clusterClient.WaitForControlPlaneNotReady(ctx, managementCluster, ctrlPlaneInProgressStr, newClusterSpec.Cluster.Name)
+	if err != nil {
+		if !strings.Contains(err.Error(), "timed out waiting for the condition on clusters") {
+			return fmt.Errorf("error waiting for control plane not ready: %v", err)
+		}
+		logger.V(3).Info("Timed out while waiting for control plane to be in progress, likely caused by no control plane upgrade")
+	}
 	logger.V(3).Info("Run post control plane upgrade operations")
 	err = provider.RunPostControlPlaneUpgrade(ctx, currentSpec, newClusterSpec, workloadCluster, managementCluster)
 	if err != nil {
diff --git a/pkg/clustermanager/cluster_manager_test.go b/pkg/clustermanager/cluster_manager_test.go
index a17b08b7f945..982113db7903 100644
--- a/pkg/clustermanager/cluster_manager_test.go
+++ b/pkg/clustermanager/cluster_manager_test.go
@@ -479,6 +479,7 @@ func TestClusterManagerUpgradeWorkloadClusterSuccess(t *testing.T) {
 	tt.mocks.client.EXPECT().ApplyKubeSpecFromBytesWithNamespace(tt.ctx, mCluster, test.OfType("[]uint8"), constants.EksaSystemNamespace).Times(2)
 	tt.mocks.provider.EXPECT().RunPostControlPlaneUpgrade(tt.ctx, tt.clusterSpec, tt.clusterSpec, wCluster, mCluster)
 	tt.mocks.client.EXPECT().WaitForControlPlaneReady(tt.ctx, mCluster, "60m", clusterName).MaxTimes(2)
+	tt.mocks.client.EXPECT().WaitForControlPlaneNotReady(tt.ctx, mCluster, "1m", clusterName)
 	tt.mocks.client.EXPECT().GetMachines(tt.ctx, mCluster, mCluster.Name).Return([]types.Machine{}, nil).Times(2)
 	tt.mocks.provider.EXPECT().MachineDeploymentsToDelete(wCluster, tt.clusterSpec, tt.clusterSpec.DeepCopy()).Return([]string{})
 	tt.mocks.client.EXPECT().WaitForDeployment(tt.ctx, wCluster, "30m", "Available", gomock.Any(), gomock.Any()).MaxTimes(10)
@@ -513,6 +514,7 @@ func TestClusterManagerUpgradeWorkloadClusterWaitForMachinesTimeout(t *testing.T
 	tt.mocks.client.EXPECT().ApplyKubeSpecFromBytesWithNamespace(ctx, mCluster, test.OfType("[]uint8"), constants.EksaSystemNamespace)
 	tt.mocks.provider.EXPECT().RunPostControlPlaneUpgrade(tt.ctx, tt.clusterSpec, tt.clusterSpec, wCluster, mCluster)
 	tt.mocks.client.EXPECT().WaitForControlPlaneReady(ctx, mCluster, "60m", clusterName)
+	tt.mocks.client.EXPECT().WaitForControlPlaneNotReady(tt.ctx, mCluster, "1m", clusterName)
 	tt.mocks.writer.EXPECT().Write(clusterName+"-eks-a-cluster.yaml", gomock.Any(), gomock.Not(gomock.Nil()))
 	// Fail once
 	tt.mocks.client.EXPECT().GetMachines(ctx, mCluster, mCluster.Name).Times(1).Return(nil, errors.New("error get machines"))
@@ -556,6 +558,7 @@ func TestClusterManagerCreateWorkloadClusterWaitForMachinesFailedWithUnhealthyNo
 	tt.mocks.client.EXPECT().ApplyKubeSpecFromBytesWithNamespace(tt.ctx, mCluster, test.OfType("[]uint8"), constants.EksaSystemNamespace)
 	tt.mocks.provider.EXPECT().RunPostControlPlaneUpgrade(tt.ctx, tt.clusterSpec, tt.clusterSpec, wCluster, mCluster)
 	tt.mocks.client.EXPECT().WaitForControlPlaneReady(tt.ctx, mCluster, "60m", clusterName).MaxTimes(5)
+	tt.mocks.client.EXPECT().WaitForControlPlaneNotReady(tt.ctx, mCluster, "1m", clusterName)
 	tt.mocks.client.EXPECT().WaitForDeployment(tt.ctx, wCluster, "30m", "Available", gomock.Any(), gomock.Any()).MaxTimes(10)
 	tt.mocks.writer.EXPECT().Write(clusterName+"-eks-a-cluster.yaml", gomock.Any(), gomock.Not(gomock.Nil()))
 	// Return a machine with no nodeRef the rest of the retries
@@ -584,6 +587,7 @@ func TestClusterManagerUpgradeWorkloadClusterWaitForCAPITimeout(t *testing.T) {
 	tt.mocks.client.EXPECT().ApplyKubeSpecFromBytesWithNamespace(tt.ctx, mCluster, test.OfType("[]uint8"), constants.EksaSystemNamespace).Times(2)
 	tt.mocks.provider.EXPECT().RunPostControlPlaneUpgrade(tt.ctx, tt.clusterSpec, tt.clusterSpec, wCluster, mCluster)
 	tt.mocks.client.EXPECT().WaitForControlPlaneReady(tt.ctx, mCluster, "60m", clusterName).MaxTimes(2)
+	tt.mocks.client.EXPECT().WaitForControlPlaneNotReady(tt.ctx, mCluster, "1m", clusterName)
 	tt.mocks.client.EXPECT().GetMachines(tt.ctx, mCluster, mCluster.Name).Return([]types.Machine{}, nil).Times(2)
 	tt.mocks.provider.EXPECT().MachineDeploymentsToDelete(wCluster, tt.clusterSpec, tt.clusterSpec.DeepCopy()).Return([]string{})
 	tt.mocks.client.EXPECT().WaitForDeployment(tt.ctx, wCluster, "30m", "Available", gomock.Any(), gomock.Any()).Return(errors.New("time out"))
diff --git a/pkg/clustermanager/mocks/client_and_networking.go b/pkg/clustermanager/mocks/client_and_networking.go
index e8e666d2750a..3496fe3cf825 100644
--- a/pkg/clustermanager/mocks/client_and_networking.go
+++ b/pkg/clustermanager/mocks/client_and_networking.go
@@ -524,6 +524,20 @@ func (mr *MockClusterClientMockRecorder) ValidateWorkerNodes(arg0, arg1, arg2 in
 	return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "ValidateWorkerNodes", reflect.TypeOf((*MockClusterClient)(nil).ValidateWorkerNodes), arg0, arg1, arg2)
 }
 
+// WaitForControlPlaneNotReady mocks base method.
+func (m *MockClusterClient) WaitForControlPlaneNotReady(arg0 context.Context, arg1 *types.Cluster, arg2, arg3 string) error {
+	m.ctrl.T.Helper()
+	ret := m.ctrl.Call(m, "WaitForControlPlaneNotReady", arg0, arg1, arg2, arg3)
+	ret0, _ := ret[0].(error)
+	return ret0
+}
+
+// WaitForControlPlaneNotReady indicates an expected call of WaitForControlPlaneNotReady.
+func (mr *MockClusterClientMockRecorder) WaitForControlPlaneNotReady(arg0, arg1, arg2, arg3 interface{}) *gomock.Call {
+	mr.mock.ctrl.T.Helper()
+	return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "WaitForControlPlaneNotReady", reflect.TypeOf((*MockClusterClient)(nil).WaitForControlPlaneNotReady), arg0, arg1, arg2, arg3)
+}
+
 // WaitForControlPlaneReady mocks base method.
 func (m *MockClusterClient) WaitForControlPlaneReady(arg0 context.Context, arg1 *types.Cluster, arg2, arg3 string) error {
 	m.ctrl.T.Helper()
diff --git a/pkg/executables/executables.go b/pkg/executables/executables.go
index 48ea449ba9ee..e4cbf7c5492a 100644
--- a/pkg/executables/executables.go
+++ b/pkg/executables/executables.go
@@ -84,11 +84,7 @@ func execute(ctx context.Context, cli string, in []byte, args ...string) (stdout
 	cmd := exec.CommandContext(ctx, cli, args...)
 	logger.V(6).Info("Executing command", "cmd", redactCreds(cmd.String()))
 	cmd.Stdout = &stdout
-	if logger.MaxLogging() {
-		cmd.Stderr = os.Stderr
-	} else {
-		cmd.Stderr = &stderr
-	}
+	cmd.Stderr = &stderr
 	if len(in) != 0 {
 		cmd.Stdin = bytes.NewReader(in)
 	}
@@ -96,6 +92,9 @@
 	err = cmd.Run()
 	if err != nil {
 		if stderr.Len() > 0 {
+			if logger.MaxLogging() {
+				logger.V(logger.MaxLoggingLevel()).Info(cli, "stderr", stderr.String())
+			}
 			return stdout, errors.New(stderr.String())
 		} else {
 			if !logger.MaxLogging() {
diff --git a/pkg/executables/kubectl.go b/pkg/executables/kubectl.go
index 1b232aa68d83..f3c07d0db8d0 100644
--- a/pkg/executables/kubectl.go
+++ b/pkg/executables/kubectl.go
@@ -262,6 +262,10 @@ func (k *Kubectl) WaitForControlPlaneReady(ctx context.Context, cluster *types.C
 	return k.Wait(ctx, cluster.KubeconfigFile, timeout, "ControlPlaneReady", fmt.Sprintf("%s/%s", capiClustersResourceType, newClusterName), constants.EksaSystemNamespace)
 }
 
+func (k *Kubectl) WaitForControlPlaneNotReady(ctx context.Context, cluster *types.Cluster, timeout string, newClusterName string) error {
+	return k.Wait(ctx, cluster.KubeconfigFile, timeout, "ControlPlaneReady=false", fmt.Sprintf("%s/%s", capiClustersResourceType, newClusterName), constants.EksaSystemNamespace)
+}
+
 func (k *Kubectl) WaitForManagedExternalEtcdReady(ctx context.Context, cluster *types.Cluster, timeout string, newClusterName string) error {
 	return k.Wait(ctx, cluster.KubeconfigFile, timeout, "ManagedEtcdReady", fmt.Sprintf("clusters.%s/%s", clusterv1.GroupVersion.Group, newClusterName), constants.EksaSystemNamespace)
 }
diff --git a/pkg/logger/logger.go b/pkg/logger/logger.go
index 577fe422da72..714185f4643a 100644
--- a/pkg/logger/logger.go
+++ b/pkg/logger/logger.go
@@ -35,6 +35,11 @@ func MaxLogging() bool {
 	return l.V(maxLogging).Enabled()
 }
 
+// MaxLoggingLevel returns the verbosity level at which MaxLogging reports true.
+func MaxLoggingLevel() int {
+	return maxLogging
+}
+
 // Fatal is equivalent to Get().Error() followed by a call to os.Exit(1).
 func Fatal(err error, msg string) {
 	l.Error(err, msg)