From f1e9ae5c5ac4bb31e4778530995bd78c7a33a64e Mon Sep 17 00:00:00 2001 From: Michael Shitrit Date: Sun, 9 May 2021 09:24:00 +0300 Subject: [PATCH 1/3] functional changes: PR implementation + Unit Tests Signed-off-by: Michael Shitrit --- ...pi-operator_07_machinehealthcheck.crd.yaml | 10 ++ .../v1beta1/machinehealthcheck_types.go | 11 ++ .../machine/v1beta1/zz_generated.deepcopy.go | 1 + .../machinehealthcheck_controller.go | 15 +- .../machinehealthcheck_controller_test.go | 137 ++++++++++++++++-- 5 files changed, 161 insertions(+), 13 deletions(-) diff --git a/install/0000_30_machine-api-operator_07_machinehealthcheck.crd.yaml b/install/0000_30_machine-api-operator_07_machinehealthcheck.crd.yaml index 1941c75e28..fc0da7cef2 100644 --- a/install/0000_30_machine-api-operator_07_machinehealthcheck.crd.yaml +++ b/install/0000_30_machine-api-operator_07_machinehealthcheck.crd.yaml @@ -55,6 +55,16 @@ spec: spec: description: Specification of machine health check policy properties: + failedNodeStartupTimeout: + default: 48h + description: Failed Machines that are older than this value and are + without a nodeRef or a providerID will be considered to have passed + the time period allocated for a manual fix and will be remediated. + Expects an unsigned duration string of decimal numbers each with + optional fraction and a unit suffix, eg "300ms", "1.5h" or "2h45m". + Valid time units are "ns", "us" (or "µs"), "ms", "s", "m", "h". + pattern: ^([0-9]+(\.[0-9]+)?(ns|us|µs|ms|s|m|h))+$ + type: string maxUnhealthy: anyOf: - type: integer diff --git a/pkg/apis/machine/v1beta1/machinehealthcheck_types.go b/pkg/apis/machine/v1beta1/machinehealthcheck_types.go index 4dd2b7a568..b66fab6d0c 100644 --- a/pkg/apis/machine/v1beta1/machinehealthcheck_types.go +++ b/pkg/apis/machine/v1beta1/machinehealthcheck_types.go @@ -83,6 +83,17 @@ type MachineHealthCheckSpec struct { // +kubebuilder:validation:Type:=string NodeStartupTimeout *metav1.Duration `json:"nodeStartupTimeout,omitempty"` + // Failed Machines that are older than this value and are without a nodeRef or a providerID + // will be considered to have passed the time period allocated for a manual fix and will be remediated. + // Expects an unsigned duration string of decimal numbers each with optional + // fraction and a unit suffix, eg "300ms", "1.5h" or "2h45m". + // Valid time units are "ns", "us" (or "µs"), "ms", "s", "m", "h". + // +optional + // +kubebuilder:default:="48h" + // +kubebuilder:validation:Pattern="^([0-9]+(\\.[0-9]+)?(ns|us|µs|ms|s|m|h))+$" + // +kubebuilder:validation:Type:=string + FailedNodeStartupTimeout metav1.Duration `json:"failedNodeStartupTimeout,omitempty"` + // RemediationTemplate is a reference to a remediation template // provided by an infrastructure provider. // diff --git a/pkg/apis/machine/v1beta1/zz_generated.deepcopy.go b/pkg/apis/machine/v1beta1/zz_generated.deepcopy.go index abbd90b22b..b93edb2f7d 100644 --- a/pkg/apis/machine/v1beta1/zz_generated.deepcopy.go +++ b/pkg/apis/machine/v1beta1/zz_generated.deepcopy.go @@ -204,6 +204,7 @@ func (in *MachineHealthCheckSpec) DeepCopyInto(out *MachineHealthCheckSpec) { *out = new(v1.Duration) **out = **in } + out.FailedNodeStartupTimeout = in.FailedNodeStartupTimeout if in.RemediationTemplate != nil { in, out := &in.RemediationTemplate, &out.RemediationTemplate *out = new(corev1.ObjectReference) diff --git a/pkg/controller/machinehealthcheck/machinehealthcheck_controller.go b/pkg/controller/machinehealthcheck/machinehealthcheck_controller.go index b11ad1ed87..3e496270ed 100644 --- a/pkg/controller/machinehealthcheck/machinehealthcheck_controller.go +++ b/pkg/controller/machinehealthcheck/machinehealthcheck_controller.go @@ -191,7 +191,7 @@ func (r *ReconcileMachineHealthCheck) Reconcile(ctx context.Context, request rec } // health check all targets and reconcile mhc status - currentHealthy, needRemediationTargets, nextCheckTimes, errList := r.healthCheckTargets(targets, nodeStartupTimeout.Duration) + currentHealthy, needRemediationTargets, nextCheckTimes, errList := r.healthCheckTargets(targets, nodeStartupTimeout.Duration, mhc.Spec.FailedNodeStartupTimeout.Duration) healthyCount := len(currentHealthy) mhc.Status.CurrentHealthy = &healthyCount mhc.Status.ExpectedMachines = &totalTargets @@ -448,13 +448,13 @@ func (r *ReconcileMachineHealthCheck) reconcileStatus(baseToPatch client.Patch, // healthCheckTargets health checks a slice of targets // and gives a data to measure the average health -func (r *ReconcileMachineHealthCheck) healthCheckTargets(targets []target, timeoutForMachineToHaveNode time.Duration) ([]target, []target, []time.Duration, []error) { +func (r *ReconcileMachineHealthCheck) healthCheckTargets(targets []target, timeoutForMachineToHaveNode time.Duration, failedNodeStartupTimeout time.Duration) ([]target, []target, []time.Duration, []error) { var errList []error var needRemediationTargets, currentHealthy []target var nextCheckTimes []time.Duration for _, t := range targets { klog.V(3).Infof("Reconciling %s: health checking", t.string()) - needsRemediation, nextCheck, err := t.needsRemediation(timeoutForMachineToHaveNode) + needsRemediation, nextCheck, err := t.needsRemediation(timeoutForMachineToHaveNode, failedNodeStartupTimeout) if err != nil { klog.Errorf("Reconciling %s: error health checking: %v", t.string(), err) errList = append(errList, err) @@ -736,12 +736,19 @@ func (t *target) nodeName() string { return "" } -func (t *target) needsRemediation(timeoutForMachineToHaveNode time.Duration) (bool, time.Duration, error) { +func (t *target) needsRemediation(timeoutForMachineToHaveNode time.Duration, failedNodeStartupTimeout time.Duration) (bool, time.Duration, error) { var nextCheckTimes []time.Duration now := time.Now() // machine has failed if derefStringPointer(t.Machine.Status.Phase) == machinePhaseFailed { + // A machine that fails without getting a NodeRef of ProviderID means that there was a fatal configuration error. + // Typically remediating this machine will not actually resolve the issue. + // Delay remediation for this machine to allow manual intervention to resolve the configuration issue. + if (t.Machine.Status.NodeRef == nil || t.Machine.Spec.ProviderID == nil) && now.Sub(t.Machine.CreationTimestamp.Time) <= failedNodeStartupTimeout { + klog.Warningf("Machine %q is in failed phase, remediation is skipped to allow manual intervention", t.Machine.Name) + return false, time.Duration(0), nil + } klog.V(3).Infof("%s: unhealthy: machine phase is %q", t.string(), machinePhaseFailed) return true, time.Duration(0), nil } diff --git a/pkg/controller/machinehealthcheck/machinehealthcheck_controller_test.go b/pkg/controller/machinehealthcheck/machinehealthcheck_controller_test.go index aaf0ae3d40..714c640011 100644 --- a/pkg/controller/machinehealthcheck/machinehealthcheck_controller_test.go +++ b/pkg/controller/machinehealthcheck/machinehealthcheck_controller_test.go @@ -221,12 +221,26 @@ func TestReconcile(t *testing.T) { machineHealthCheck := maotesting.NewMachineHealthCheck("machineHealthCheck") nodeStartupTimeout := 15 * time.Minute machineHealthCheck.Spec.NodeStartupTimeout = &metav1.Duration{Duration: nodeStartupTimeout} + machineHealthCheck.Spec.FailedNodeStartupTimeout = metav1.Duration{Duration: time.Hour} machineHealthCheckNegativeMaxUnhealthy := maotesting.NewMachineHealthCheck("machineHealthCheckNegativeMaxUnhealthy") negativeOne := intstr.FromInt(-1) machineHealthCheckNegativeMaxUnhealthy.Spec.MaxUnhealthy = &negativeOne machineHealthCheckNegativeMaxUnhealthy.Spec.NodeStartupTimeout = &metav1.Duration{Duration: nodeStartupTimeout} + // Failed Machine with no node + failedVar := machinePhaseFailed + failedMachineWithoutNode := maotesting.NewMachine("failedMachineWithoutNode", "") + failedMachineWithoutNode.Status.Phase = &failedVar + failedMachineWithoutNode.CreationTimestamp = metav1.Now() + + // Failed Machine with node + failedMachineWithNodeAndProviderId := maotesting.NewMachine("failedMachineWithoutNode", "recentlyUnhealthy") + failedMachineWithNodeAndProviderId.Status.Phase = &failedVar + mockProviderId := "mockProviderId" + failedMachineWithNodeAndProviderId.Spec.ProviderID = &mockProviderId + failedMachineWithNodeAndProviderId.CreationTimestamp = metav1.Now() + // remediationExternal nodeUnhealthyForTooLong := maotesting.NewNode("nodeUnhealthyForTooLong", false) nodeUnhealthyForTooLong.Annotations = map[string]string{ @@ -418,6 +432,44 @@ func TestReconcile(t *testing.T) { }, }, }, + { + name: "failed machine with no node", + machine: failedMachineWithoutNode, + node: nodeAlreadyDeleted, + mhc: machineHealthCheck, + expected: expectedReconcile{ + result: reconcile.Result{}, + error: false, + }, + expectedEvents: []string{}, + expectedStatus: &mapiv1beta1.MachineHealthCheckStatus{ + ExpectedMachines: IntPtr(1), + CurrentHealthy: IntPtr(0), + RemediationsAllowed: 0, + Conditions: mapiv1beta1.Conditions{ + remediationAllowedCondition, + }, + }, + }, + { + name: "failed machine with node", + machine: failedMachineWithNodeAndProviderId, + node: nodeRecentlyUnhealthy, + mhc: machineHealthCheck, + expected: expectedReconcile{ + result: reconcile.Result{}, + error: false, + }, + expectedEvents: []string{EventMachineDeleted}, + expectedStatus: &mapiv1beta1.MachineHealthCheckStatus{ + ExpectedMachines: IntPtr(1), + CurrentHealthy: IntPtr(0), + RemediationsAllowed: 0, + Conditions: mapiv1beta1.Conditions{ + remediationAllowedCondition, + }, + }, + }, { name: "machine unhealthy with MHC negative maxUnhealthy", machine: machineUnhealthyForTooLong, @@ -1498,6 +1550,7 @@ func TestGetNodeFromMachine(t *testing.T) { func TestNeedsRemediation(t *testing.T) { knownDate := metav1.Time{Time: time.Date(1985, 06, 03, 0, 0, 0, 0, time.Local)} + providerID := "mockProviderId" machineFailed := machinePhaseFailed testCases := []struct { testCase string @@ -1506,6 +1559,7 @@ func TestNeedsRemediation(t *testing.T) { expectedNeedsRemediation bool expectedNextCheck time.Duration expectedError bool + failedNodeStartupTimeout time.Duration }{ { testCase: "healthy: does not met conditions criteria", @@ -1791,18 +1845,21 @@ func TestNeedsRemediation(t *testing.T) { expectedError: false, }, { - testCase: "unhealthy: machine phase failed", + testCase: "unhealthy: machine phase failed without node", target: &target{ Machine: mapiv1beta1.Machine{ TypeMeta: metav1.TypeMeta{Kind: "Machine"}, ObjectMeta: metav1.ObjectMeta{ - Annotations: make(map[string]string), - Name: "machine", - Namespace: namespace, - Labels: map[string]string{"foo": "bar"}, - OwnerReferences: []metav1.OwnerReference{{Kind: "MachineSet"}}, + Annotations: make(map[string]string), + Name: "machine", + Namespace: namespace, + Labels: map[string]string{"foo": "bar"}, + OwnerReferences: []metav1.OwnerReference{{Kind: "MachineSet"}}, + CreationTimestamp: metav1.Now(), + }, + Spec: mapiv1beta1.MachineSpec{ + ProviderID: &providerID, }, - Spec: mapiv1beta1.MachineSpec{}, Status: mapiv1beta1.MachineStatus{ Phase: &machineFailed, }, @@ -1839,9 +1896,71 @@ func TestNeedsRemediation(t *testing.T) { }, }, timeoutForMachineToHaveNode: defaultNodeStartupTimeout, + expectedNeedsRemediation: false, + expectedNextCheck: time.Duration(0), + expectedError: false, + failedNodeStartupTimeout: time.Minute, + }, + { + testCase: "unhealthy: machine phase failed with node", + target: &target{ + Machine: mapiv1beta1.Machine{ + TypeMeta: metav1.TypeMeta{Kind: "Machine"}, + ObjectMeta: metav1.ObjectMeta{ + Annotations: make(map[string]string), + Name: "machine", + Namespace: namespace, + Labels: map[string]string{"foo": "bar"}, + OwnerReferences: []metav1.OwnerReference{{Kind: "MachineSet"}}, + CreationTimestamp: metav1.Now(), + }, + Spec: mapiv1beta1.MachineSpec{ + ProviderID: &providerID, + }, + Status: mapiv1beta1.MachineStatus{ + Phase: &machineFailed, + NodeRef: &corev1.ObjectReference{ + Name: "nodeUnhealthy", + Namespace: metav1.NamespaceNone, + }, + }, + }, + Node: maotesting.NewNode("nodeUnhealthy", false), + MHC: mapiv1beta1.MachineHealthCheck{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test", + Namespace: namespace, + }, + TypeMeta: metav1.TypeMeta{ + Kind: "MachineHealthCheck", + }, + Spec: mapiv1beta1.MachineHealthCheckSpec{ + Selector: metav1.LabelSelector{ + MatchLabels: map[string]string{ + "foo": "bar", + }, + }, + UnhealthyConditions: []mapiv1beta1.UnhealthyCondition{ + { + Type: "Ready", + Status: "Unknown", + Timeout: metav1.Duration{Duration: 300 * time.Second}, + }, + { + Type: "Ready", + Status: "False", + Timeout: metav1.Duration{Duration: 300 * time.Second}, + }, + }, + }, + Status: mapiv1beta1.MachineHealthCheckStatus{}, + }, + }, + timeoutForMachineToHaveNode: defaultNodeStartupTimeout, expectedNeedsRemediation: true, expectedNextCheck: time.Duration(0), expectedError: false, + failedNodeStartupTimeout: time.Minute, }, { testCase: "healthy: meet conditions criteria but timeout", @@ -1922,7 +2041,7 @@ func TestNeedsRemediation(t *testing.T) { for _, tc := range testCases { t.Run(tc.testCase, func(t *testing.T) { - needsRemediation, nextCheck, err := tc.target.needsRemediation(tc.timeoutForMachineToHaveNode) + needsRemediation, nextCheck, err := tc.target.needsRemediation(tc.timeoutForMachineToHaveNode, tc.failedNodeStartupTimeout) if needsRemediation != tc.expectedNeedsRemediation { t.Errorf("Case: %v. Got: %v, expected: %v", tc.testCase, needsRemediation, tc.expectedNeedsRemediation) } @@ -2603,7 +2722,7 @@ func TestHealthCheckTargets(t *testing.T) { recorder := record.NewFakeRecorder(2) r := newFakeReconcilerWithCustomRecorder(recorder) t.Run(tc.testCase, func(t *testing.T) { - currentHealhty, needRemediationTargets, nextCheckTimes, errList := r.healthCheckTargets(tc.targets, tc.timeoutForMachineToHaveNode) + currentHealhty, needRemediationTargets, nextCheckTimes, errList := r.healthCheckTargets(tc.targets, tc.timeoutForMachineToHaveNode, 0) if len(currentHealhty) != tc.currentHealthy { t.Errorf("Case: %v. Got: %v, expected: %v", tc.testCase, currentHealhty, tc.currentHealthy) } From e3d7784a0ac8aededcf47c107e357f7d895f4484 Mon Sep 17 00:00:00 2001 From: Michael Shitrit Date: Tue, 29 Jun 2021 11:07:57 +0300 Subject: [PATCH 2/3] Removing default value of FailedNodeStartupTimeout Signed-off-by: Michael Shitrit --- .../0000_30_machine-api-operator_07_machinehealthcheck.crd.yaml | 1 - pkg/apis/machine/v1beta1/machinehealthcheck_types.go | 1 - 2 files changed, 2 deletions(-) diff --git a/install/0000_30_machine-api-operator_07_machinehealthcheck.crd.yaml b/install/0000_30_machine-api-operator_07_machinehealthcheck.crd.yaml index fc0da7cef2..f5b1174481 100644 --- a/install/0000_30_machine-api-operator_07_machinehealthcheck.crd.yaml +++ b/install/0000_30_machine-api-operator_07_machinehealthcheck.crd.yaml @@ -56,7 +56,6 @@ spec: description: Specification of machine health check policy properties: failedNodeStartupTimeout: - default: 48h description: Failed Machines that are older than this value and are without a nodeRef or a providerID will be considered to have passed the time period allocated for a manual fix and will be remediated. diff --git a/pkg/apis/machine/v1beta1/machinehealthcheck_types.go b/pkg/apis/machine/v1beta1/machinehealthcheck_types.go index b66fab6d0c..d6e972419b 100644 --- a/pkg/apis/machine/v1beta1/machinehealthcheck_types.go +++ b/pkg/apis/machine/v1beta1/machinehealthcheck_types.go @@ -89,7 +89,6 @@ type MachineHealthCheckSpec struct { // fraction and a unit suffix, eg "300ms", "1.5h" or "2h45m". // Valid time units are "ns", "us" (or "µs"), "ms", "s", "m", "h". // +optional - // +kubebuilder:default:="48h" // +kubebuilder:validation:Pattern="^([0-9]+(\\.[0-9]+)?(ns|us|µs|ms|s|m|h))+$" // +kubebuilder:validation:Type:=string FailedNodeStartupTimeout metav1.Duration `json:"failedNodeStartupTimeout,omitempty"` From 5fdbfdb89cd6972b23460b29ec4f674ab960702d Mon Sep 17 00:00:00 2001 From: Michael Shitrit Date: Fri, 5 Nov 2021 13:02:07 +0200 Subject: [PATCH 3/3] Merge fixes Signed-off-by: Michael Shitrit --- .../machinehealthcheck_controller_test.go | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/pkg/controller/machinehealthcheck/machinehealthcheck_controller_test.go b/pkg/controller/machinehealthcheck/machinehealthcheck_controller_test.go index bb78fac8eb..d864d6eb61 100644 --- a/pkg/controller/machinehealthcheck/machinehealthcheck_controller_test.go +++ b/pkg/controller/machinehealthcheck/machinehealthcheck_controller_test.go @@ -481,11 +481,11 @@ func TestReconcile(t *testing.T) { error: false, }, expectedEvents: []string{}, - expectedStatus: &mapiv1beta1.MachineHealthCheckStatus{ + expectedStatus: &machinev1.MachineHealthCheckStatus{ ExpectedMachines: IntPtr(1), CurrentHealthy: IntPtr(0), RemediationsAllowed: 0, - Conditions: mapiv1beta1.Conditions{ + Conditions: machinev1.Conditions{ remediationAllowedCondition, }, }, @@ -500,11 +500,11 @@ func TestReconcile(t *testing.T) { error: false, }, expectedEvents: []string{EventMachineDeleted}, - expectedStatus: &mapiv1beta1.MachineHealthCheckStatus{ + expectedStatus: &machinev1.MachineHealthCheckStatus{ ExpectedMachines: IntPtr(1), CurrentHealthy: IntPtr(0), RemediationsAllowed: 0, - Conditions: mapiv1beta1.Conditions{ + Conditions: machinev1.Conditions{ remediationAllowedCondition, }, }, @@ -1912,13 +1912,13 @@ func TestNeedsRemediation(t *testing.T) { TypeMeta: metav1.TypeMeta{ Kind: "MachineHealthCheck", }, - Spec: mapiv1beta1.MachineHealthCheckSpec{ + Spec: machinev1.MachineHealthCheckSpec{ Selector: metav1.LabelSelector{ MatchLabels: map[string]string{ "foo": "bar", }, }, - UnhealthyConditions: []mapiv1beta1.UnhealthyCondition{ + UnhealthyConditions: []machinev1.UnhealthyCondition{ { Type: "Ready", Status: "Unknown", @@ -1931,7 +1931,7 @@ func TestNeedsRemediation(t *testing.T) { }, }, }, - Status: mapiv1beta1.MachineHealthCheckStatus{}, + Status: machinev1.MachineHealthCheckStatus{}, }, }, timeoutForMachineToHaveNode: defaultNodeStartupTimeout, @@ -1943,7 +1943,7 @@ func TestNeedsRemediation(t *testing.T) { { testCase: "unhealthy: machine phase failed with node", target: &target{ - Machine: mapiv1beta1.Machine{ + Machine: machinev1.Machine{ TypeMeta: metav1.TypeMeta{Kind: "Machine"}, ObjectMeta: metav1.ObjectMeta{ Annotations: make(map[string]string), @@ -1953,10 +1953,10 @@ func TestNeedsRemediation(t *testing.T) { OwnerReferences: []metav1.OwnerReference{{Kind: "MachineSet"}}, CreationTimestamp: metav1.Now(), }, - Spec: mapiv1beta1.MachineSpec{ + Spec: machinev1.MachineSpec{ ProviderID: &providerID, }, - Status: mapiv1beta1.MachineStatus{ + Status: machinev1.MachineStatus{ Phase: &machineFailed, NodeRef: &corev1.ObjectReference{ Name: "nodeUnhealthy", @@ -1965,7 +1965,7 @@ func TestNeedsRemediation(t *testing.T) { }, }, Node: maotesting.NewNode("nodeUnhealthy", false), - MHC: mapiv1beta1.MachineHealthCheck{ + MHC: machinev1.MachineHealthCheck{ ObjectMeta: metav1.ObjectMeta{ Name: "test", Namespace: namespace,