From fa7fa311a65624b402de12e31300f40c3b69de86 Mon Sep 17 00:00:00 2001
From: Andrey Klimentyev
Date: Wed, 22 Apr 2020 14:24:26 +0300
Subject: [PATCH] Skip drain on NotReady Nodes

Signed-off-by: Andrey Klimentyev
---
 pkg/controller/drain.go        |  24 +++++++-
 pkg/controller/machine.go      |   1 +
 pkg/controller/machine_test.go | 100 +++++++++++++++++++++++++++++++++
 3 files changed, 124 insertions(+), 1 deletion(-)

diff --git a/pkg/controller/drain.go b/pkg/controller/drain.go
index e29e08f8c..d0f6a7aa8 100644
--- a/pkg/controller/drain.go
+++ b/pkg/controller/drain.go
@@ -58,6 +58,7 @@ type DrainOptions struct {
 	PvDetachTimeout              time.Duration
 	DeleteLocalData              bool
 	nodeName                     string
+	nodeNotReadyDuration         time.Duration
 	Out                          io.Writer
 	ErrOut                       io.Writer
 	Driver                       driver.Driver
@@ -123,6 +124,7 @@ func NewDrainOptions(
 	maxEvictRetries int32,
 	pvDetachTimeout time.Duration,
 	nodeName string,
+	nodeNotReadyDuration time.Duration,
 	gracePeriodSeconds int,
 	forceDeletePods bool,
 	ignorePodsWithoutControllers bool,
@@ -134,6 +136,9 @@ func NewDrainOptions(
 	pvcLister corelisters.PersistentVolumeClaimLister,
 	pvLister corelisters.PersistentVolumeLister,
 ) *DrainOptions {
+	if nodeNotReadyDuration == 0 {
+		nodeNotReadyDuration = 5 * time.Minute
+	}
 
 	return &DrainOptions{
 		client:               client,
@@ -146,6 +151,7 @@
 		PvDetachTimeout:      pvDetachTimeout,
 		DeleteLocalData:      deleteLocalData,
 		nodeName:             nodeName,
+		nodeNotReadyDuration: nodeNotReadyDuration,
 		Out:                  out,
 		ErrOut:               errOut,
 		Driver:               driver,
@@ -179,7 +185,23 @@ func (o *DrainOptions) RunDrain() error {
 		return err
 	}
 
-	err := o.deleteOrEvictPodsSimple()
+	node, err := o.client.CoreV1().Nodes().Get(o.nodeName, metav1.GetOptions{})
+	if apierrors.IsNotFound(err) {
+		klog.V(4).Infof("Node %q not found, skipping drain", o.nodeName)
+		return nil
+	} else if err != nil {
+		klog.Errorf("Error getting details for node: %q. Err: %v", o.nodeName, err)
Err: %v", o.nodeName, err) + return err + } + + for _, condition := range node.Status.Conditions { + if condition.Type == corev1.NodeReady && condition.Status != corev1.ConditionTrue && (time.Since(condition.LastTransitionTime.Time) > o.nodeNotReadyDuration) { + klog.Warningf("Skipping drain for NotReady Node %q", o.nodeName) + return nil + } + } + + err = o.deleteOrEvictPodsSimple() return err } diff --git a/pkg/controller/machine.go b/pkg/controller/machine.go index 6dc69cbfb..d108a7f9f 100644 --- a/pkg/controller/machine.go +++ b/pkg/controller/machine.go @@ -620,6 +620,7 @@ func (c *controller) machineDelete(machine *v1alpha1.Machine, driver driver.Driv maxEvictRetries, pvDetachTimeOut, nodeName, + 5*time.Minute, -1, forceDeletePods, true, diff --git a/pkg/controller/machine_test.go b/pkg/controller/machine_test.go index 33ae4a287..410dd00ed 100644 --- a/pkg/controller/machine_test.go +++ b/pkg/controller/machine_test.go @@ -812,6 +812,7 @@ var _ = Describe("machine", func() { machine string fakeProviderID string fakeNodeName string + nodeRecentlyNotReady *bool fakeDriverGetVMs func() (driver.VMs, error) fakeError error forceDeleteLabelPresent bool @@ -910,6 +911,26 @@ var _ = Describe("machine", func() { Expect(err).ToNot(HaveOccurred()) } + if data.action.nodeRecentlyNotReady != nil { + node, nodeErr := controller.targetCoreClient.CoreV1().Nodes().Get(machine.Status.Node, metav1.GetOptions{}) + Expect(nodeErr).To(Not(HaveOccurred())) + clone := node.DeepCopy() + newNodeCondition := corev1.NodeCondition{ + Type: v1.NodeReady, + Status: corev1.ConditionUnknown, + } + + if *data.action.nodeRecentlyNotReady { + newNodeCondition.LastTransitionTime = metav1.Time{Time: time.Now()} + } else { + newNodeCondition.LastTransitionTime = metav1.Time{Time: time.Now().Add(-time.Hour)} + } + + clone.Status.Conditions = []corev1.NodeCondition{newNodeCondition} + _, updateErr := controller.targetCoreClient.CoreV1().Nodes().UpdateStatus(clone) + Expect(updateErr).To(BeNil()) + } + if data.setup.fakeResourceActions != nil { trackers.TargetCore.SetFakeResourceActions(data.setup.fakeResourceActions, math.MaxInt32) } @@ -1287,6 +1308,85 @@ var _ = Describe("machine", func() { machineDeleted: false, }, }), + Entry("Machine force deletion if underlying Node is NotReady for a long time", &data{ + setup: setup{ + secrets: []*corev1.Secret{ + { + ObjectMeta: *newObjectMeta(objMeta, 0), + }, + }, + aws: []*machinev1.AWSMachineClass{ + { + ObjectMeta: *newObjectMeta(objMeta, 0), + Spec: machinev1.AWSMachineClassSpec{ + SecretRef: newSecretReference(objMeta, 0), + }, + }, + }, + machines: newMachines(1, &machinev1.MachineTemplateSpec{ + ObjectMeta: *newObjectMeta(objMeta, 0), + Spec: machinev1.MachineSpec{ + Class: machinev1.ClassSpec{ + Kind: "AWSMachineClass", + Name: "machine-0", + }, + }, + }, nil, nil, nil, nil), + }, + action: action{ + machine: "machine-0", + fakeProviderID: "fakeID-0", + fakeNodeName: "fakeNode-0", + fakeError: nil, + nodeRecentlyNotReady: func() *bool { ret := false; return &ret }(), + }, + expect: expect{ + errOccurred: false, + machineDeleted: true, + }, + }), + Entry("Machine do not force deletion if underlying Node is NotReady for a small period of time, a Machine deletion fails, since kubelet fails to evict Pods", &data{ + setup: setup{ + secrets: []*corev1.Secret{ + { + ObjectMeta: *newObjectMeta(objMeta, 0), + }, + }, + aws: []*machinev1.AWSMachineClass{ + { + ObjectMeta: *newObjectMeta(objMeta, 0), + Spec: machinev1.AWSMachineClassSpec{ + SecretRef: newSecretReference(objMeta, 0), 
+							},
+						},
+					},
+					machines: newMachines(1, &machinev1.MachineTemplateSpec{
+						ObjectMeta: *newObjectMeta(objMeta, 0),
+						Spec: machinev1.MachineSpec{
+							Class: machinev1.ClassSpec{
+								Kind: "AWSMachineClass",
+								Name: "machine-0",
+							},
+						},
+					}, nil, nil, nil, nil),
+					fakeResourceActions: &customfake.ResourceActions{
+						Node: customfake.Actions{
+							Update: "Failed to update nodes",
+						},
+					},
+				},
+				action: action{
+					machine:              "machine-0",
+					fakeProviderID:       "fakeID-0",
+					fakeNodeName:         "fakeNode-0",
+					fakeError:            nil,
+					nodeRecentlyNotReady: func() *bool { ret := true; return &ret }(),
+				},
+				expect: expect{
+					errOccurred:    true,
+					machineDeleted: false,
+				},
+			}),
 		)
 	})
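
For reference, the behavior this patch introduces can be exercised in isolation. The sketch below is illustrative only: the helper shouldSkipDrain and the main harness are not part of the patch, which inlines this logic into (*DrainOptions).RunDrain; only standard Kubernetes API types (corev1, metav1) are assumed.

	package main

	import (
		"fmt"
		"time"

		corev1 "k8s.io/api/core/v1"
		metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	)

	// shouldSkipDrain is a hypothetical helper mirroring the loop the patch
	// adds to RunDrain: skip draining when the NodeReady condition has been
	// something other than True for longer than notReadyDuration.
	func shouldSkipDrain(node *corev1.Node, notReadyDuration time.Duration) bool {
		// Mirrors NewDrainOptions: a zero duration falls back to five minutes.
		if notReadyDuration == 0 {
			notReadyDuration = 5 * time.Minute
		}
		for _, condition := range node.Status.Conditions {
			if condition.Type == corev1.NodeReady &&
				condition.Status != corev1.ConditionTrue &&
				time.Since(condition.LastTransitionTime.Time) > notReadyDuration {
				return true
			}
		}
		return false
	}

	func main() {
		// A Node whose Ready condition flipped away from True an hour ago.
		longNotReady := &corev1.Node{
			Status: corev1.NodeStatus{
				Conditions: []corev1.NodeCondition{{
					Type:               corev1.NodeReady,
					Status:             corev1.ConditionUnknown,
					LastTransitionTime: metav1.Time{Time: time.Now().Add(-time.Hour)},
				}},
			},
		}
		// The same Node, but NotReady only since just now.
		recentlyNotReady := longNotReady.DeepCopy()
		recentlyNotReady.Status.Conditions[0].LastTransitionTime = metav1.Time{Time: time.Now()}

		fmt.Println(shouldSkipDrain(longNotReady, 0))     // true: drain is skipped
		fmt.Println(shouldSkipDrain(recentlyNotReady, 0)) // false: drain still runs
	}

The two calls in main mirror the two new test entries: a Node that has been NotReady for an hour is skipped (letting force deletion proceed), while a Node that flipped to NotReady only moments ago is still drained.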