Add flag to set machine health check timeout (#3918) #4123

Merged (1 commit) on Nov 21, 2022
1 change: 1 addition & 0 deletions cmd/eksctl-anywhere/cmd/constants.go
@@ -6,6 +6,7 @@ const (
cpWaitTimeoutFlag = "control-plane-wait-timeout"
externalEtcdWaitTimeoutFlag = "external-etcd-wait-timeout"
perMachineWaitTimeoutFlag = "per-machine-wait-timeout"
+unhealthyMachineTimeoutFlag = "unhealthy-machine-timeout"
)

type Operation int
10 changes: 10 additions & 0 deletions cmd/eksctl-anywhere/cmd/options.go
@@ -26,6 +26,7 @@ type timeoutOptions struct {
cpWaitTimeout string
externalEtcdWaitTimeout string
perMachineWaitTimeout string
+unhealthyMachineTimeout string
}

func applyTimeoutFlags(flagSet *pflag.FlagSet, t *timeoutOptions) {
@@ -37,6 +38,9 @@ func applyTimeoutFlags(flagSet *pflag.FlagSet, t *timeoutOptions) {

flagSet.StringVar(&t.perMachineWaitTimeout, perMachineWaitTimeoutFlag, clustermanager.DefaultMaxWaitPerMachine.String(), "Override the default machine wait timeout (10m)/per machine ")
markFlagHidden(flagSet, perMachineWaitTimeoutFlag)
+
+flagSet.StringVar(&t.unhealthyMachineTimeout, unhealthyMachineTimeoutFlag, clustermanager.DefaultUnhealthyMachineTimeout.String(), "Override the default unhealthy machine timeout (5m) ")
+markFlagHidden(flagSet, unhealthyMachineTimeoutFlag)
}

func buildClusterManagerOpts(t timeoutOptions) ([]clustermanager.ClusterManagerOpt, error) {
@@ -55,10 +59,16 @@ func buildClusterManagerOpts(t timeoutOptions) ([]clustermanager.ClusterManagerOpt, error) {
return nil, fmt.Errorf(timeoutErrorTemplate, perMachineWaitTimeoutFlag, err)
}

+unhealthyMachineTimeout, err := time.ParseDuration(t.unhealthyMachineTimeout)
+if err != nil {
+return nil, fmt.Errorf(timeoutErrorTemplate, unhealthyMachineTimeoutFlag, err)
+}
+
return []clustermanager.ClusterManagerOpt{
clustermanager.WithControlPlaneWaitTimeout(cpWaitTimeout),
clustermanager.WithExternalEtcdWaitTimeout(externalEtcdWaitTimeout),
clustermanager.WithMachineMaxWait(perMachineWaitTimeout),
+clustermanager.WithUnhealthyMachineTimeout(unhealthyMachineTimeout),
}, nil
}

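Since the new flag is registered with markFlagHidden, it won't appear in help output, but it can still be supplied on the relevant eksctl anywhere commands (e.g. `--unhealthy-machine-timeout 30m`). The value goes through time.ParseDuration, so any Go duration string works; a minimal standalone sketch, not part of the diff:

package main

import (
	"fmt"
	"time"
)

func main() {
	// Any Go duration string is a valid override; a malformed value is
	// reported through timeoutErrorTemplate in buildClusterManagerOpts.
	for _, v := range []string{"30m", "90s", "1h30m"} {
		d, err := time.ParseDuration(v)
		if err != nil {
			panic(err)
		}
		fmt.Printf("%s -> %s\n", v, d)
	}
}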
33 changes: 17 additions & 16 deletions pkg/clusterapi/machine_health_check.go
@@ -15,13 +15,12 @@ import (
)

const (
-unhealthyConditionTimeout = 5 * time.Minute
-machineHealthCheckKind = "MachineHealthCheck"
-maxUnhealthyControlPlane = "100%"
-maxUnhealthyWorker = "40%"
+machineHealthCheckKind = "MachineHealthCheck"
+maxUnhealthyControlPlane = "100%"
+maxUnhealthyWorker = "40%"
)

-func machineHealthCheck(clusterName string) *clusterv1.MachineHealthCheck {
+func machineHealthCheck(clusterName string, unhealthyTimeout time.Duration) *clusterv1.MachineHealthCheck {
return &clusterv1.MachineHealthCheck{
TypeMeta: metav1.TypeMeta{
APIVersion: clusterAPIVersion,
@@ -40,38 +39,40 @@ func machineHealthCheck(clusterName string) *clusterv1.MachineHealthCheck {
{
Type: corev1.NodeReady,
Status: corev1.ConditionUnknown,
-Timeout: metav1.Duration{Duration: unhealthyConditionTimeout},
+Timeout: metav1.Duration{Duration: unhealthyTimeout},
},
{
Type: corev1.NodeReady,
Status: corev1.ConditionFalse,
-Timeout: metav1.Duration{Duration: unhealthyConditionTimeout},
+Timeout: metav1.Duration{Duration: unhealthyTimeout},
},
},
},
}
}

-func MachineHealthCheckForControlPlane(clusterSpec *cluster.Spec) *clusterv1.MachineHealthCheck {
-mhc := machineHealthCheck(ClusterName(clusterSpec.Cluster))
+// MachineHealthCheckForControlPlane creates MachineHealthCheck resources for the control plane.
+func MachineHealthCheckForControlPlane(clusterSpec *cluster.Spec, unhealthyTimeout time.Duration) *clusterv1.MachineHealthCheck {
+mhc := machineHealthCheck(ClusterName(clusterSpec.Cluster), unhealthyTimeout)
mhc.SetName(ControlPlaneMachineHealthCheckName(clusterSpec))
mhc.Spec.Selector.MatchLabels[clusterv1.MachineControlPlaneLabelName] = ""
maxUnhealthy := intstr.Parse(maxUnhealthyControlPlane)
mhc.Spec.MaxUnhealthy = &maxUnhealthy
return mhc
}

-func MachineHealthCheckForWorkers(clusterSpec *cluster.Spec) []*clusterv1.MachineHealthCheck {
+// MachineHealthCheckForWorkers creates MachineHealthCheck resources for the workers.
+func MachineHealthCheckForWorkers(clusterSpec *cluster.Spec, unhealthyTimeout time.Duration) []*clusterv1.MachineHealthCheck {
m := make([]*clusterv1.MachineHealthCheck, 0, len(clusterSpec.Cluster.Spec.WorkerNodeGroupConfigurations))
for _, workerNodeGroupConfig := range clusterSpec.Cluster.Spec.WorkerNodeGroupConfigurations {
-mhc := machineHealthCheckForWorker(clusterSpec, workerNodeGroupConfig)
+mhc := machineHealthCheckForWorker(clusterSpec, workerNodeGroupConfig, unhealthyTimeout)
m = append(m, mhc)
}
return m
}

-func machineHealthCheckForWorker(clusterSpec *cluster.Spec, workerNodeGroupConfig v1alpha1.WorkerNodeGroupConfiguration) *clusterv1.MachineHealthCheck {
-mhc := machineHealthCheck(ClusterName(clusterSpec.Cluster))
+func machineHealthCheckForWorker(clusterSpec *cluster.Spec, workerNodeGroupConfig v1alpha1.WorkerNodeGroupConfiguration, unhealthyTimeout time.Duration) *clusterv1.MachineHealthCheck {
+mhc := machineHealthCheck(ClusterName(clusterSpec.Cluster), unhealthyTimeout)
mhc.SetName(WorkerMachineHealthCheckName(clusterSpec, workerNodeGroupConfig))
mhc.Spec.Selector.MatchLabels[clusterv1.MachineDeploymentLabelName] = MachineDeploymentName(clusterSpec, workerNodeGroupConfig)
maxUnhealthy := intstr.Parse(maxUnhealthyWorker)
@@ -80,11 +81,11 @@ func machineHealthCheckForWorker(clusterSpec *cluster.Spec, workerNodeGroupConfig v1alpha1.WorkerNodeGroupConfiguration) *clusterv1.MachineHealthCheck {
}

// MachineHealthCheckObjects creates MachineHealthCheck resources for control plane and all the worker node groups.
-func MachineHealthCheckObjects(clusterSpec *cluster.Spec) []runtime.Object {
-mhcWorkers := MachineHealthCheckForWorkers(clusterSpec)
+func MachineHealthCheckObjects(clusterSpec *cluster.Spec, unhealthyTimeout time.Duration) []runtime.Object {
+mhcWorkers := MachineHealthCheckForWorkers(clusterSpec, unhealthyTimeout)
o := make([]runtime.Object, 0, len(mhcWorkers)+1)
for _, item := range mhcWorkers {
o = append(o, item)
}
-return append(o, MachineHealthCheckForControlPlane(clusterSpec))
+return append(o, MachineHealthCheckForControlPlane(clusterSpec, unhealthyTimeout))
}
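For orientation, a sketch of the new parameter at a call site, mirroring how InstallMachineHealthChecks uses it further down; the import paths assume the eks-anywhere module layout and are not part of this diff:

package sketch

import (
	"time"

	"github.com/aws/eks-anywhere/pkg/cluster"
	"github.com/aws/eks-anywhere/pkg/clusterapi"
	"github.com/aws/eks-anywhere/pkg/templater"
)

// renderMachineHealthChecks renders the control-plane and worker MHC
// manifests, all sharing one unhealthy-machine timeout.
func renderMachineHealthChecks(spec *cluster.Spec, timeout time.Duration) ([]byte, error) {
	objects := clusterapi.MachineHealthCheckObjects(spec, timeout)
	return templater.ObjectsToYaml(objects...)
}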
48 changes: 31 additions & 17 deletions pkg/clusterapi/machine_health_check_test.go
@@ -17,9 +17,18 @@ import (
)

func TestMachineHealthCheckForControlPlane(t *testing.T) {
-tt := newApiBuilerTest(t)
+timeouts := []time.Duration{5 * time.Minute, time.Hour, 30 * time.Second}
+for _, timeout := range timeouts {
+tt := newApiBuilerTest(t)
+want := expectedMachineHealthCheckForControlPlane(timeout)
+got := clusterapi.MachineHealthCheckForControlPlane(tt.clusterSpec, timeout)
+tt.Expect(got).To(Equal(want))
+}
}

+func expectedMachineHealthCheckForControlPlane(timeout time.Duration) *clusterv1.MachineHealthCheck {
maxUnhealthy := intstr.Parse("100%")
-want := &clusterv1.MachineHealthCheck{
+return &clusterv1.MachineHealthCheck{
TypeMeta: metav1.TypeMeta{
APIVersion: "cluster.x-k8s.io/v1beta1",
Kind: "MachineHealthCheck",
@@ -40,25 +49,32 @@ func TestMachineHealthCheckForControlPlane(t *testing.T) {
{
Type: corev1.NodeReady,
Status: corev1.ConditionUnknown,
-Timeout: metav1.Duration{Duration: 5 * time.Minute},
+Timeout: metav1.Duration{Duration: timeout},
},
{
Type: corev1.NodeReady,
Status: corev1.ConditionFalse,
-Timeout: metav1.Duration{Duration: 5 * time.Minute},
+Timeout: metav1.Duration{Duration: timeout},
},
},
},
}
-got := clusterapi.MachineHealthCheckForControlPlane(tt.clusterSpec)
-tt.Expect(got).To(Equal(want))
}

func TestMachineHealthCheckForWorkers(t *testing.T) {
-tt := newApiBuilerTest(t)
-tt.clusterSpec.Cluster.Spec.WorkerNodeGroupConfigurations = []v1alpha1.WorkerNodeGroupConfiguration{*tt.workerNodeGroupConfig}
+timeouts := []time.Duration{5 * time.Minute, time.Hour, 30 * time.Second}
+for _, timeout := range timeouts {
+tt := newApiBuilerTest(t)
+tt.clusterSpec.Cluster.Spec.WorkerNodeGroupConfigurations = []v1alpha1.WorkerNodeGroupConfiguration{*tt.workerNodeGroupConfig}
+want := expectedMachineHealthCheckForWorkers(timeout)
+got := clusterapi.MachineHealthCheckForWorkers(tt.clusterSpec, timeout)
+tt.Expect(got).To(Equal(want))
+}
}

+func expectedMachineHealthCheckForWorkers(timeout time.Duration) []*clusterv1.MachineHealthCheck {
maxUnhealthy := intstr.Parse("40%")
-want := []*clusterv1.MachineHealthCheck{
+return []*clusterv1.MachineHealthCheck{
{
TypeMeta: metav1.TypeMeta{
APIVersion: "cluster.x-k8s.io/v1beta1",
@@ -80,29 +96,27 @@ func TestMachineHealthCheckForWorkers(t *testing.T) {
{
Type: corev1.NodeReady,
Status: corev1.ConditionUnknown,
-Timeout: metav1.Duration{Duration: 5 * time.Minute},
+Timeout: metav1.Duration{Duration: timeout},
},
{
Type: corev1.NodeReady,
Status: corev1.ConditionFalse,
-Timeout: metav1.Duration{Duration: 5 * time.Minute},
+Timeout: metav1.Duration{Duration: timeout},
},
},
},
},
}

-got := clusterapi.MachineHealthCheckForWorkers(tt.clusterSpec)
-tt.Expect(got).To(Equal(want))
}

func TestMachineHealthCheckObjects(t *testing.T) {
tt := newApiBuilerTest(t)
tt.clusterSpec.Cluster.Spec.WorkerNodeGroupConfigurations = []v1alpha1.WorkerNodeGroupConfiguration{*tt.workerNodeGroupConfig}
+timeout := 5 * time.Minute

-wantWN := clusterapi.MachineHealthCheckForWorkers(tt.clusterSpec)
-wantCP := clusterapi.MachineHealthCheckForControlPlane(tt.clusterSpec)
+wantWN := clusterapi.MachineHealthCheckForWorkers(tt.clusterSpec, timeout)
+wantCP := clusterapi.MachineHealthCheckForControlPlane(tt.clusterSpec, timeout)

-got := clusterapi.MachineHealthCheckObjects(tt.clusterSpec)
+got := clusterapi.MachineHealthCheckObjects(tt.clusterSpec, timeout)
tt.Expect(got).To(Equal([]runtime.Object{wantWN[0], wantCP}))
}
32 changes: 23 additions & 9 deletions pkg/clustermanager/cluster_manager.go
@@ -36,18 +36,23 @@ import (
)

const (
-maxRetries = 30
-defaultBackOffPeriod = 5 * time.Second
-machineBackoff = 1 * time.Second
-defaultMachinesMinWait = 30 * time.Minute
-moveCAPIWait = 15 * time.Minute
-DefaultMaxWaitPerMachine = 10 * time.Minute
-clusterWaitStr = "60m"
+maxRetries = 30
+defaultBackOffPeriod = 5 * time.Second
+machineBackoff = 1 * time.Second
+defaultMachinesMinWait = 30 * time.Minute
+moveCAPIWait = 15 * time.Minute
+// DefaultMaxWaitPerMachine is the default max time the cluster manager will wait per machine.
+DefaultMaxWaitPerMachine = 10 * time.Minute
+clusterWaitStr = "60m"
+// DefaultControlPlaneWait is the default time the cluster manager will wait for the control plane to be ready.
DefaultControlPlaneWait = 60 * time.Minute
deploymentWaitStr = "30m"
controlPlaneInProgressStr = "1m"
etcdInProgressStr = "1m"
-DefaultEtcdWait = 60 * time.Minute
+// DefaultEtcdWait is the default time the cluster manager will wait for etcd to be ready.
+DefaultEtcdWait = 60 * time.Minute
+// DefaultUnhealthyMachineTimeout is the default timeout for an unhealthy machine health check.
+DefaultUnhealthyMachineTimeout = 5 * time.Minute
)

var eksaClusterResourceType = fmt.Sprintf("clusters.%s", v1alpha1.GroupVersion.Group)
@@ -65,6 +70,7 @@ type ClusterManager struct {
awsIamAuth AwsIamAuth
controlPlaneWaitTimeout time.Duration
externalEtcdWaitTimeout time.Duration
+unhealthyMachineTimeout time.Duration
}

type ClusterClient interface {
@@ -145,6 +151,7 @@ func New(clusterClient ClusterClient, networking Networking, writer filewriter.F
awsIamAuth: awsIamAuth,
controlPlaneWaitTimeout: DefaultControlPlaneWait,
externalEtcdWaitTimeout: DefaultEtcdWait,
+unhealthyMachineTimeout: DefaultUnhealthyMachineTimeout,
}

for _, o := range opts {
@@ -184,6 +191,13 @@ func WithMachineMinWait(machineMinWait time.Duration) ClusterManagerOpt {
}
}

+// WithUnhealthyMachineTimeout sets the timeout of an unhealthy machine health check.
+func WithUnhealthyMachineTimeout(timeout time.Duration) ClusterManagerOpt {
+return func(c *ClusterManager) {
+c.unhealthyMachineTimeout = timeout
+}
+}

func WithRetrier(retrier *retrier.Retrier) ClusterManagerOpt {
return func(c *ClusterManager) {
c.clusterClient.Retrier = retrier
@@ -605,7 +619,7 @@ func (c *ClusterManager) InstallStorageClass(ctx context.Context, cluster *types
}

func (c *ClusterManager) InstallMachineHealthChecks(ctx context.Context, clusterSpec *cluster.Spec, workloadCluster *types.Cluster) error {
-mhc, err := templater.ObjectsToYaml(clusterapi.MachineHealthCheckObjects(clusterSpec)...)
+mhc, err := templater.ObjectsToYaml(clusterapi.MachineHealthCheckObjects(clusterSpec, c.unhealthyMachineTimeout)...)
if err != nil {
return err
}
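WithUnhealthyMachineTimeout slots into the functional-options pattern ClusterManager already uses for its other timeouts. A self-contained sketch of that pattern, with illustrative names rather than the repo's real types:

package main

import (
	"fmt"
	"time"
)

// manager stands in for ClusterManager; opt mirrors ClusterManagerOpt.
type manager struct {
	unhealthyMachineTimeout time.Duration
}

type opt func(*manager)

func withUnhealthyMachineTimeout(d time.Duration) opt {
	return func(m *manager) { m.unhealthyMachineTimeout = d }
}

// newManager applies options over defaults, as New does above.
func newManager(opts ...opt) *manager {
	m := &manager{unhealthyMachineTimeout: 5 * time.Minute}
	for _, o := range opts {
		o(m)
	}
	return m
}

func main() {
	m := newManager(withUnhealthyMachineTimeout(30 * time.Minute))
	fmt.Println(m.unhealthyMachineTimeout) // 30m0s
}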
33 changes: 25 additions & 8 deletions pkg/clustermanager/cluster_manager_test.go
@@ -1306,24 +1306,25 @@ func TestClusterManagerCreateEKSAResourcesFailure(t *testing.T) {
tt.Expect(c.CreateEKSAResources(ctx, tt.cluster, tt.clusterSpec, datacenterConfig, machineConfigs)).NotTo(Succeed())
}

-var wantMHC = []byte(`apiVersion: cluster.x-k8s.io/v1beta1
+func expectedMachineHealthCheck(timeout time.Duration) []byte {
+healthCheck := fmt.Sprintf(`apiVersion: cluster.x-k8s.io/v1beta1
kind: MachineHealthCheck
metadata:
creationTimestamp: null
name: fluxTestCluster-worker-1-worker-unhealthy
namespace: eksa-system
spec:
clusterName: fluxTestCluster
-maxUnhealthy: 40%
+maxUnhealthy: 40%%
> [Review thread on the "40%%" line]
> Member: Is this a mistake?
> Contributor (author): No, this is intentional. The '%%' is converted to '%' by Sprintf.
> https://stackoverflow.com/questions/35681595/escape-variables-with-printf
> Member: Oh, so it wasn't validating before?
> (A short standalone illustration of this escaping follows the function below.)
selector:
matchLabels:
cluster.x-k8s.io/deployment-name: fluxTestCluster-worker-1
unhealthyConditions:
- status: Unknown
-timeout: 5m0s
+timeout: %[1]s
type: Ready
- status: "False"
-timeout: 5m0s
+timeout: %[1]s
type: Ready
status:
currentHealthy: 0
@@ -1339,29 +1340,44 @@ metadata:
namespace: eksa-system
spec:
clusterName: fluxTestCluster
-maxUnhealthy: 100%
+maxUnhealthy: 100%%
selector:
matchLabels:
cluster.x-k8s.io/control-plane: ""
unhealthyConditions:
- status: Unknown
-timeout: 5m0s
+timeout: %[1]s
type: Ready
- status: "False"
-timeout: 5m0s
+timeout: %[1]s
type: Ready
status:
currentHealthy: 0
expectedMachines: 0
remediationsAllowed: 0

---
-`)
+`, timeout)
+return []byte(healthCheck)
+}
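Picking up the "%%" review thread above, a runnable illustration of the two format-verb details in expectedMachineHealthCheck: %% emits a literal percent sign, and the indexed verb %[1]s lets a single argument fill several placeholders:

package main

import (
	"fmt"
	"time"
)

func main() {
	timeout := 5 * time.Minute

	// %% escapes to a literal '%'; %[1]s reuses argument one, so one
	// timeout value fills every placeholder in the template.
	fmt.Print(fmt.Sprintf("maxUnhealthy: 40%%\ntimeout: %[1]s\ntimeout: %[1]s\n", timeout))
	// Output:
	// maxUnhealthy: 40%
	// timeout: 5m0s
	// timeout: 5m0s
}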

func TestInstallMachineHealthChecks(t *testing.T) {
ctx := context.Background()
tt := newTest(t)
tt.clusterSpec.Cluster.Spec.WorkerNodeGroupConfigurations[0].Name = "worker-1"
+wantMHC := expectedMachineHealthCheck(clustermanager.DefaultUnhealthyMachineTimeout)
tt.mocks.client.EXPECT().ApplyKubeSpecFromBytes(ctx, tt.cluster, wantMHC)

if err := tt.clusterManager.InstallMachineHealthChecks(ctx, tt.clusterSpec, tt.cluster); err != nil {
t.Errorf("ClusterManager.InstallMachineHealthChecks() error = %v, wantErr nil", err)
}
}
+
+func TestInstallMachineHealthChecksWithTimeoutOverride(t *testing.T) {
+ctx := context.Background()
+tt := newTest(t, clustermanager.WithUnhealthyMachineTimeout(30*time.Minute))
+tt.clusterSpec.Cluster.Spec.WorkerNodeGroupConfigurations[0].Name = "worker-1"
+wantMHC := expectedMachineHealthCheck(30 * time.Minute)
+tt.mocks.client.EXPECT().ApplyKubeSpecFromBytes(ctx, tt.cluster, wantMHC)
+
+if err := tt.clusterManager.InstallMachineHealthChecks(ctx, tt.clusterSpec, tt.cluster); err != nil {
@@ -1374,6 +1390,7 @@ func TestInstallMachineHealthChecksApplyError(t *testing.T) {
tt := newTest(t, clustermanager.WithRetrier(retrier.NewWithMaxRetries(1, 0)))
tt.clusterSpec.Cluster.Spec.WorkerNodeGroupConfigurations[0].Name = "worker-1"
tt.clusterManager.Retrier = retrier.NewWithMaxRetries(2, 1*time.Microsecond)
+wantMHC := expectedMachineHealthCheck(clustermanager.DefaultUnhealthyMachineTimeout)
tt.mocks.client.EXPECT().ApplyKubeSpecFromBytes(ctx, tt.cluster, wantMHC).Return(errors.New("apply error")).MaxTimes(2)

if err := tt.clusterManager.InstallMachineHealthChecks(ctx, tt.clusterSpec, tt.cluster); err == nil {