Add flag to set machine health check timeout (aws#3918) (aws#4123)
Before this change, an unhealthy machine health check would time out
after five minutes. This leads to an endless loop for a RollingUpgrade
on a slow system. Disabling the health checks circumvents this issue,
but is not ideal. This change adds a --unhealthy-machine-timeout flag
which customers can use to make the upgrade process more reliable
without sacrificing health checks.
msanjaq authored and panktishah26 committed Nov 23, 2022
1 parent b5bca1a commit 1affe1f
Showing 6 changed files with 107 additions and 50 deletions.
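For context, here is a minimal standalone Go sketch (not part of this commit) of what the new flag plumbing does: the string supplied to --unhealthy-machine-timeout (for example "30m") is parsed into a time.Duration, which the CLI then hands to the cluster manager through the new WithUnhealthyMachineTimeout option. The value and printing below are illustrative only.

package main

import (
	"fmt"
	"time"
)

func main() {
	// "30m" stands in for a user-supplied --unhealthy-machine-timeout value.
	timeout, err := time.ParseDuration("30m")
	if err != nil {
		fmt.Printf("invalid unhealthy-machine-timeout: %v\n", err)
		return
	}
	// In the real change this duration is passed to
	// clustermanager.WithUnhealthyMachineTimeout(timeout) when the cluster
	// manager options are built; here we simply print it.
	fmt.Println("unhealthy machine timeout:", timeout) // 30m0s
}

If parsing fails, buildClusterManagerOpts returns an error naming the offending flag, as shown in the options.go diff below.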
1 change: 1 addition & 0 deletions cmd/eksctl-anywhere/cmd/constants.go
@@ -6,6 +6,7 @@ const (
cpWaitTimeoutFlag = "control-plane-wait-timeout"
externalEtcdWaitTimeoutFlag = "external-etcd-wait-timeout"
perMachineWaitTimeoutFlag = "per-machine-wait-timeout"
unhealthyMachineTimeoutFlag = "unhealthy-machine-timeout"
)

type Operation int
10 changes: 10 additions & 0 deletions cmd/eksctl-anywhere/cmd/options.go
@@ -26,6 +26,7 @@ type timeoutOptions struct {
cpWaitTimeout string
externalEtcdWaitTimeout string
perMachineWaitTimeout string
unhealthyMachineTimeout string
}

func applyTimeoutFlags(flagSet *pflag.FlagSet, t *timeoutOptions) {
@@ -37,6 +38,9 @@ func applyTimeoutFlags(flagSet *pflag.FlagSet, t *timeoutOptions) {

flagSet.StringVar(&t.perMachineWaitTimeout, perMachineWaitTimeoutFlag, clustermanager.DefaultMaxWaitPerMachine.String(), "Override the default machine wait timeout (10m)/per machine ")
markFlagHidden(flagSet, perMachineWaitTimeoutFlag)

flagSet.StringVar(&t.unhealthyMachineTimeout, unhealthyMachineTimeoutFlag, clustermanager.DefaultUnhealthyMachineTimeout.String(), "Override the default unhealthy machine timeout (5m) ")
markFlagHidden(flagSet, unhealthyMachineTimeoutFlag)
}

func buildClusterManagerOpts(t timeoutOptions) ([]clustermanager.ClusterManagerOpt, error) {
@@ -55,10 +59,16 @@ func buildClusterManagerOpts(t timeoutOptions) ([]clustermanager.ClusterManagerO
return nil, fmt.Errorf(timeoutErrorTemplate, perMachineWaitTimeoutFlag, err)
}

unhealthyMachineTimeout, err := time.ParseDuration(t.unhealthyMachineTimeout)
if err != nil {
return nil, fmt.Errorf(timeoutErrorTemplate, unhealthyMachineTimeoutFlag, err)
}

return []clustermanager.ClusterManagerOpt{
clustermanager.WithControlPlaneWaitTimeout(cpWaitTimeout),
clustermanager.WithExternalEtcdWaitTimeout(externalEtcdWaitTimeout),
clustermanager.WithMachineMaxWait(perMachineWaitTimeout),
clustermanager.WithUnhealthyMachineTimeout(unhealthyMachineTimeout),
}, nil
}

33 changes: 17 additions & 16 deletions pkg/clusterapi/machine_health_check.go
@@ -15,13 +15,12 @@ import (
)

const (
unhealthyConditionTimeout = 5 * time.Minute
machineHealthCheckKind = "MachineHealthCheck"
maxUnhealthyControlPlane = "100%"
maxUnhealthyWorker = "40%"
machineHealthCheckKind = "MachineHealthCheck"
maxUnhealthyControlPlane = "100%"
maxUnhealthyWorker = "40%"
)

func machineHealthCheck(clusterName string) *clusterv1.MachineHealthCheck {
func machineHealthCheck(clusterName string, unhealthyTimeout time.Duration) *clusterv1.MachineHealthCheck {
return &clusterv1.MachineHealthCheck{
TypeMeta: metav1.TypeMeta{
APIVersion: clusterAPIVersion,
@@ -40,38 +39,40 @@ func machineHealthCheck(clusterName string) *clusterv1.MachineHealthCheck {
{
Type: corev1.NodeReady,
Status: corev1.ConditionUnknown,
Timeout: metav1.Duration{Duration: unhealthyConditionTimeout},
Timeout: metav1.Duration{Duration: unhealthyTimeout},
},
{
Type: corev1.NodeReady,
Status: corev1.ConditionFalse,
Timeout: metav1.Duration{Duration: unhealthyConditionTimeout},
Timeout: metav1.Duration{Duration: unhealthyTimeout},
},
},
},
}
}

func MachineHealthCheckForControlPlane(clusterSpec *cluster.Spec) *clusterv1.MachineHealthCheck {
mhc := machineHealthCheck(ClusterName(clusterSpec.Cluster))
// MachineHealthCheckForControlPlane creates MachineHealthCheck resources for the control plane.
func MachineHealthCheckForControlPlane(clusterSpec *cluster.Spec, unhealthyTimeout time.Duration) *clusterv1.MachineHealthCheck {
mhc := machineHealthCheck(ClusterName(clusterSpec.Cluster), unhealthyTimeout)
mhc.SetName(ControlPlaneMachineHealthCheckName(clusterSpec))
mhc.Spec.Selector.MatchLabels[clusterv1.MachineControlPlaneLabelName] = ""
maxUnhealthy := intstr.Parse(maxUnhealthyControlPlane)
mhc.Spec.MaxUnhealthy = &maxUnhealthy
return mhc
}

func MachineHealthCheckForWorkers(clusterSpec *cluster.Spec) []*clusterv1.MachineHealthCheck {
// MachineHealthCheckForWorkers creates MachineHealthCheck resources for the workers.
func MachineHealthCheckForWorkers(clusterSpec *cluster.Spec, unhealthyTimeout time.Duration) []*clusterv1.MachineHealthCheck {
m := make([]*clusterv1.MachineHealthCheck, 0, len(clusterSpec.Cluster.Spec.WorkerNodeGroupConfigurations))
for _, workerNodeGroupConfig := range clusterSpec.Cluster.Spec.WorkerNodeGroupConfigurations {
mhc := machineHealthCheckForWorker(clusterSpec, workerNodeGroupConfig)
mhc := machineHealthCheckForWorker(clusterSpec, workerNodeGroupConfig, unhealthyTimeout)
m = append(m, mhc)
}
return m
}

func machineHealthCheckForWorker(clusterSpec *cluster.Spec, workerNodeGroupConfig v1alpha1.WorkerNodeGroupConfiguration) *clusterv1.MachineHealthCheck {
mhc := machineHealthCheck(ClusterName(clusterSpec.Cluster))
func machineHealthCheckForWorker(clusterSpec *cluster.Spec, workerNodeGroupConfig v1alpha1.WorkerNodeGroupConfiguration, unhealthyTimeout time.Duration) *clusterv1.MachineHealthCheck {
mhc := machineHealthCheck(ClusterName(clusterSpec.Cluster), unhealthyTimeout)
mhc.SetName(WorkerMachineHealthCheckName(clusterSpec, workerNodeGroupConfig))
mhc.Spec.Selector.MatchLabels[clusterv1.MachineDeploymentLabelName] = MachineDeploymentName(clusterSpec, workerNodeGroupConfig)
maxUnhealthy := intstr.Parse(maxUnhealthyWorker)
@@ -80,11 +81,11 @@ func machineHealthCheckForWorker(clusterSpec *cluster.Spec, workerNodeGroupConfi
}

// MachineHealthCheckObjects creates MachineHealthCheck resources for control plane and all the worker node groups.
func MachineHealthCheckObjects(clusterSpec *cluster.Spec) []runtime.Object {
mhcWorkers := MachineHealthCheckForWorkers(clusterSpec)
func MachineHealthCheckObjects(clusterSpec *cluster.Spec, unhealthyTimeout time.Duration) []runtime.Object {
mhcWorkers := MachineHealthCheckForWorkers(clusterSpec, unhealthyTimeout)
o := make([]runtime.Object, 0, len(mhcWorkers)+1)
for _, item := range mhcWorkers {
o = append(o, item)
}
return append(o, MachineHealthCheckForControlPlane(clusterSpec))
return append(o, MachineHealthCheckForControlPlane(clusterSpec, unhealthyTimeout))
}
48 changes: 31 additions & 17 deletions pkg/clusterapi/machine_health_check_test.go
@@ -17,9 +17,18 @@ import (
)

func TestMachineHealthCheckForControlPlane(t *testing.T) {
tt := newApiBuilerTest(t)
timeouts := []time.Duration{5 * time.Minute, time.Hour, 30 * time.Second}
for _, timeout := range timeouts {
tt := newApiBuilerTest(t)
want := expectedMachineHealthCheckForControlPlane(timeout)
got := clusterapi.MachineHealthCheckForControlPlane(tt.clusterSpec, timeout)
tt.Expect(got).To(Equal(want))
}
}

func expectedMachineHealthCheckForControlPlane(timeout time.Duration) *clusterv1.MachineHealthCheck {
maxUnhealthy := intstr.Parse("100%")
want := &clusterv1.MachineHealthCheck{
return &clusterv1.MachineHealthCheck{
TypeMeta: metav1.TypeMeta{
APIVersion: "cluster.x-k8s.io/v1beta1",
Kind: "MachineHealthCheck",
@@ -40,25 +49,32 @@ func TestMachineHealthCheckForControlPlane(t *testing.T) {
{
Type: corev1.NodeReady,
Status: corev1.ConditionUnknown,
Timeout: metav1.Duration{Duration: 5 * time.Minute},
Timeout: metav1.Duration{Duration: timeout},
},
{
Type: corev1.NodeReady,
Status: corev1.ConditionFalse,
Timeout: metav1.Duration{Duration: 5 * time.Minute},
Timeout: metav1.Duration{Duration: timeout},
},
},
},
}
got := clusterapi.MachineHealthCheckForControlPlane(tt.clusterSpec)
tt.Expect(got).To(Equal(want))
}

func TestMachineHealthCheckForWorkers(t *testing.T) {
tt := newApiBuilerTest(t)
tt.clusterSpec.Cluster.Spec.WorkerNodeGroupConfigurations = []v1alpha1.WorkerNodeGroupConfiguration{*tt.workerNodeGroupConfig}
timeouts := []time.Duration{5 * time.Minute, time.Hour, 30 * time.Second}
for _, timeout := range timeouts {
tt := newApiBuilerTest(t)
tt.clusterSpec.Cluster.Spec.WorkerNodeGroupConfigurations = []v1alpha1.WorkerNodeGroupConfiguration{*tt.workerNodeGroupConfig}
want := expectedMachineHealthCheckForWorkers(timeout)
got := clusterapi.MachineHealthCheckForWorkers(tt.clusterSpec, timeout)
tt.Expect(got).To(Equal(want))
}
}

func expectedMachineHealthCheckForWorkers(timeout time.Duration) []*clusterv1.MachineHealthCheck {
maxUnhealthy := intstr.Parse("40%")
want := []*clusterv1.MachineHealthCheck{
return []*clusterv1.MachineHealthCheck{
{
TypeMeta: metav1.TypeMeta{
APIVersion: "cluster.x-k8s.io/v1beta1",
@@ -80,29 +96,27 @@ func TestMachineHealthCheckForWorkers(t *testing.T) {
{
Type: corev1.NodeReady,
Status: corev1.ConditionUnknown,
Timeout: metav1.Duration{Duration: 5 * time.Minute},
Timeout: metav1.Duration{Duration: timeout},
},
{
Type: corev1.NodeReady,
Status: corev1.ConditionFalse,
Timeout: metav1.Duration{Duration: 5 * time.Minute},
Timeout: metav1.Duration{Duration: timeout},
},
},
},
},
}

got := clusterapi.MachineHealthCheckForWorkers(tt.clusterSpec)
tt.Expect(got).To(Equal(want))
}

func TestMachineHealthCheckObjects(t *testing.T) {
tt := newApiBuilerTest(t)
tt.clusterSpec.Cluster.Spec.WorkerNodeGroupConfigurations = []v1alpha1.WorkerNodeGroupConfiguration{*tt.workerNodeGroupConfig}
timeout := 5 * time.Minute

wantWN := clusterapi.MachineHealthCheckForWorkers(tt.clusterSpec)
wantCP := clusterapi.MachineHealthCheckForControlPlane(tt.clusterSpec)
wantWN := clusterapi.MachineHealthCheckForWorkers(tt.clusterSpec, timeout)
wantCP := clusterapi.MachineHealthCheckForControlPlane(tt.clusterSpec, timeout)

got := clusterapi.MachineHealthCheckObjects(tt.clusterSpec)
got := clusterapi.MachineHealthCheckObjects(tt.clusterSpec, timeout)
tt.Expect(got).To(Equal([]runtime.Object{wantWN[0], wantCP}))
}
32 changes: 23 additions & 9 deletions pkg/clustermanager/cluster_manager.go
@@ -39,18 +39,23 @@ import (
)

const (
maxRetries = 30
defaultBackOffPeriod = 5 * time.Second
machineBackoff = 1 * time.Second
defaultMachinesMinWait = 30 * time.Minute
moveCAPIWait = 15 * time.Minute
DefaultMaxWaitPerMachine = 10 * time.Minute
clusterWaitStr = "60m"
maxRetries = 30
defaultBackOffPeriod = 5 * time.Second
machineBackoff = 1 * time.Second
defaultMachinesMinWait = 30 * time.Minute
moveCAPIWait = 15 * time.Minute
// DefaultMaxWaitPerMachine is the default maximum time the cluster manager will wait per machine.
DefaultMaxWaitPerMachine = 10 * time.Minute
clusterWaitStr = "60m"
// DefaultControlPlaneWait is the default time the cluster manager will wait for the control plane to be ready.
DefaultControlPlaneWait = 60 * time.Minute
deploymentWaitStr = "30m"
controlPlaneInProgressStr = "1m"
etcdInProgressStr = "1m"
DefaultEtcdWait = 60 * time.Minute
// DefaultEtcdWait is the default time the cluster manager will wait for etcd to be ready.
DefaultEtcdWait = 60 * time.Minute
// DefaultUnhealthyMachineTimeout is the default timeout for an unhealthy machine health check.
DefaultUnhealthyMachineTimeout = 5 * time.Minute
)

var eksaClusterResourceType = fmt.Sprintf("clusters.%s", v1alpha1.GroupVersion.Group)
@@ -68,6 +73,7 @@ type ClusterManager struct {
awsIamAuth AwsIamAuth
controlPlaneWaitTimeout time.Duration
externalEtcdWaitTimeout time.Duration
unhealthyMachineTimeout time.Duration
}

type ClusterClient interface {
@@ -149,6 +155,7 @@ func New(clusterClient ClusterClient, networking Networking, writer filewriter.F
awsIamAuth: awsIamAuth,
controlPlaneWaitTimeout: DefaultControlPlaneWait,
externalEtcdWaitTimeout: DefaultEtcdWait,
unhealthyMachineTimeout: DefaultUnhealthyMachineTimeout,
}

for _, o := range opts {
@@ -188,6 +195,13 @@ func WithMachineMinWait(machineMinWait time.Duration) ClusterManagerOpt {
}
}

// WithUnhealthyMachineTimeout sets the timeout of an unhealthy machine health check.
func WithUnhealthyMachineTimeout(timeout time.Duration) ClusterManagerOpt {
return func(c *ClusterManager) {
c.unhealthyMachineTimeout = timeout
}
}

func WithRetrier(retrier *retrier.Retrier) ClusterManagerOpt {
return func(c *ClusterManager) {
c.clusterClient.Retrier = retrier
@@ -663,7 +677,7 @@ func (c *ClusterManager) InstallStorageClass(ctx context.Context, cluster *types
}

func (c *ClusterManager) InstallMachineHealthChecks(ctx context.Context, clusterSpec *cluster.Spec, workloadCluster *types.Cluster) error {
mhc, err := templater.ObjectsToYaml(clusterapi.MachineHealthCheckObjects(clusterSpec)...)
mhc, err := templater.ObjectsToYaml(clusterapi.MachineHealthCheckObjects(clusterSpec, c.unhealthyMachineTimeout)...)
if err != nil {
return err
}
33 changes: 25 additions & 8 deletions pkg/clustermanager/cluster_manager_test.go
@@ -1306,24 +1306,25 @@ func TestClusterManagerCreateEKSAResourcesFailure(t *testing.T) {
tt.Expect(c.CreateEKSAResources(ctx, tt.cluster, tt.clusterSpec, datacenterConfig, machineConfigs)).NotTo(Succeed())
}

var wantMHC = []byte(`apiVersion: cluster.x-k8s.io/v1beta1
func expectedMachineHealthCheck(timeout time.Duration) []byte {
healthCheck := fmt.Sprintf(`apiVersion: cluster.x-k8s.io/v1beta1
kind: MachineHealthCheck
metadata:
creationTimestamp: null
name: fluxTestCluster-worker-1-worker-unhealthy
namespace: eksa-system
spec:
clusterName: fluxTestCluster
maxUnhealthy: 40%
maxUnhealthy: 40%%
selector:
matchLabels:
cluster.x-k8s.io/deployment-name: fluxTestCluster-worker-1
unhealthyConditions:
- status: Unknown
timeout: 5m0s
timeout: %[1]s
type: Ready
- status: "False"
timeout: 5m0s
timeout: %[1]s
type: Ready
status:
currentHealthy: 0
@@ -1339,29 +1340,44 @@ metadata:
namespace: eksa-system
spec:
clusterName: fluxTestCluster
maxUnhealthy: 100%
maxUnhealthy: 100%%
selector:
matchLabels:
cluster.x-k8s.io/control-plane: ""
unhealthyConditions:
- status: Unknown
timeout: 5m0s
timeout: %[1]s
type: Ready
- status: "False"
timeout: 5m0s
timeout: %[1]s
type: Ready
status:
currentHealthy: 0
expectedMachines: 0
remediationsAllowed: 0
---
`)
`, timeout)
return []byte(healthCheck)
}

func TestInstallMachineHealthChecks(t *testing.T) {
ctx := context.Background()
tt := newTest(t)
tt.clusterSpec.Cluster.Spec.WorkerNodeGroupConfigurations[0].Name = "worker-1"
wantMHC := expectedMachineHealthCheck(clustermanager.DefaultUnhealthyMachineTimeout)
tt.mocks.client.EXPECT().ApplyKubeSpecFromBytes(ctx, tt.cluster, wantMHC)

if err := tt.clusterManager.InstallMachineHealthChecks(ctx, tt.clusterSpec, tt.cluster); err != nil {
t.Errorf("ClusterManager.InstallMachineHealthChecks() error = %v, wantErr nil", err)
}
}

func TestInstallMachineHealthChecksWithTimeoutOverride(t *testing.T) {
ctx := context.Background()
tt := newTest(t, clustermanager.WithUnhealthyMachineTimeout(30*time.Minute))
tt.clusterSpec.Cluster.Spec.WorkerNodeGroupConfigurations[0].Name = "worker-1"
wantMHC := expectedMachineHealthCheck(30 * time.Minute)
tt.mocks.client.EXPECT().ApplyKubeSpecFromBytes(ctx, tt.cluster, wantMHC)

if err := tt.clusterManager.InstallMachineHealthChecks(ctx, tt.clusterSpec, tt.cluster); err != nil {
Expand All @@ -1374,6 +1390,7 @@ func TestInstallMachineHealthChecksApplyError(t *testing.T) {
tt := newTest(t, clustermanager.WithRetrier(retrier.NewWithMaxRetries(1, 0)))
tt.clusterSpec.Cluster.Spec.WorkerNodeGroupConfigurations[0].Name = "worker-1"
tt.clusterManager.Retrier = retrier.NewWithMaxRetries(2, 1*time.Microsecond)
wantMHC := expectedMachineHealthCheck(clustermanager.DefaultUnhealthyMachineTimeout)
tt.mocks.client.EXPECT().ApplyKubeSpecFromBytes(ctx, tt.cluster, wantMHC).Return(errors.New("apply error")).MaxTimes(2)

if err := tt.clusterManager.InstallMachineHealthChecks(ctx, tt.clusterSpec, tt.cluster); err == nil {
