Skip to content

Commit

Permalink
Merge pull request #450 from flant/skip-drain-if-not-ready
Browse files Browse the repository at this point in the history
Skip drain on NotReady Nodes
  • Loading branch information
hardikdr authored Apr 28, 2020
2 parents 1b1a9fa + fa7fa31 commit 8ec2a7f
Show file tree
Hide file tree
Showing 3 changed files with 124 additions and 1 deletion.
24 changes: 23 additions & 1 deletion pkg/controller/drain.go
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ type DrainOptions struct {
PvDetachTimeout time.Duration
DeleteLocalData bool
nodeName string
nodeNotReadyDuration time.Duration
Out io.Writer
ErrOut io.Writer
Driver driver.Driver
Expand Down Expand Up @@ -123,6 +124,7 @@ func NewDrainOptions(
maxEvictRetries int32,
pvDetachTimeout time.Duration,
nodeName string,
nodeNotReadyDuration time.Duration,
gracePeriodSeconds int,
forceDeletePods bool,
ignorePodsWithoutControllers bool,
Expand All @@ -134,6 +136,9 @@ func NewDrainOptions(
pvcLister corelisters.PersistentVolumeClaimLister,
pvLister corelisters.PersistentVolumeLister,
) *DrainOptions {
if nodeNotReadyDuration == 0 {
nodeNotReadyDuration = 5 * time.Minute
}

return &DrainOptions{
client: client,
Expand All @@ -146,6 +151,7 @@ func NewDrainOptions(
PvDetachTimeout: pvDetachTimeout,
DeleteLocalData: deleteLocalData,
nodeName: nodeName,
nodeNotReadyDuration: nodeNotReadyDuration,
Out: out,
ErrOut: errOut,
Driver: driver,
Expand Down Expand Up @@ -179,7 +185,23 @@ func (o *DrainOptions) RunDrain() error {
return err
}

err := o.deleteOrEvictPodsSimple()
node, err := o.client.CoreV1().Nodes().Get(o.nodeName, metav1.GetOptions{})
if apierrors.IsNotFound(err) {
klog.V(4).Infof("Node %q not found, skipping drain", o.nodeName)
return nil
} else if err != nil {
klog.Errorf("Error getting details for node: %q. Err: %v", o.nodeName, err)
return err
}

for _, condition := range node.Status.Conditions {
if condition.Type == corev1.NodeReady && condition.Status != corev1.ConditionTrue && (time.Since(condition.LastTransitionTime.Time) > o.nodeNotReadyDuration) {
klog.Warningf("Skipping drain for NotReady Node %q", o.nodeName)
return nil
}
}

err = o.deleteOrEvictPodsSimple()
return err
}

Expand Down
1 change: 1 addition & 0 deletions pkg/controller/machine.go
Original file line number Diff line number Diff line change
Expand Up @@ -624,6 +624,7 @@ func (c *controller) machineDelete(machine *v1alpha1.Machine, driver driver.Driv
maxEvictRetries,
pvDetachTimeOut,
nodeName,
5*time.Minute,
-1,
forceDeletePods,
true,
Expand Down
100 changes: 100 additions & 0 deletions pkg/controller/machine_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -812,6 +812,7 @@ var _ = Describe("machine", func() {
machine string
fakeProviderID string
fakeNodeName string
nodeRecentlyNotReady *bool
fakeDriverGetVMs func() (driver.VMs, error)
fakeError error
forceDeleteLabelPresent bool
Expand Down Expand Up @@ -910,6 +911,26 @@ var _ = Describe("machine", func() {
Expect(err).ToNot(HaveOccurred())
}

if data.action.nodeRecentlyNotReady != nil {
node, nodeErr := controller.targetCoreClient.CoreV1().Nodes().Get(machine.Status.Node, metav1.GetOptions{})
Expect(nodeErr).To(Not(HaveOccurred()))
clone := node.DeepCopy()
newNodeCondition := corev1.NodeCondition{
Type: v1.NodeReady,
Status: corev1.ConditionUnknown,
}

if *data.action.nodeRecentlyNotReady {
newNodeCondition.LastTransitionTime = metav1.Time{Time: time.Now()}
} else {
newNodeCondition.LastTransitionTime = metav1.Time{Time: time.Now().Add(-time.Hour)}
}

clone.Status.Conditions = []corev1.NodeCondition{newNodeCondition}
_, updateErr := controller.targetCoreClient.CoreV1().Nodes().UpdateStatus(clone)
Expect(updateErr).To(BeNil())
}

if data.setup.fakeResourceActions != nil {
trackers.TargetCore.SetFakeResourceActions(data.setup.fakeResourceActions, math.MaxInt32)
}
Expand Down Expand Up @@ -1287,6 +1308,85 @@ var _ = Describe("machine", func() {
machineDeleted: false,
},
}),
Entry("Machine force deletion if underlying Node is NotReady for a long time", &data{
setup: setup{
secrets: []*corev1.Secret{
{
ObjectMeta: *newObjectMeta(objMeta, 0),
},
},
aws: []*machinev1.AWSMachineClass{
{
ObjectMeta: *newObjectMeta(objMeta, 0),
Spec: machinev1.AWSMachineClassSpec{
SecretRef: newSecretReference(objMeta, 0),
},
},
},
machines: newMachines(1, &machinev1.MachineTemplateSpec{
ObjectMeta: *newObjectMeta(objMeta, 0),
Spec: machinev1.MachineSpec{
Class: machinev1.ClassSpec{
Kind: "AWSMachineClass",
Name: "machine-0",
},
},
}, nil, nil, nil, nil),
},
action: action{
machine: "machine-0",
fakeProviderID: "fakeID-0",
fakeNodeName: "fakeNode-0",
fakeError: nil,
nodeRecentlyNotReady: func() *bool { ret := false; return &ret }(),
},
expect: expect{
errOccurred: false,
machineDeleted: true,
},
}),
Entry("Machine do not force deletion if underlying Node is NotReady for a small period of time, a Machine deletion fails, since kubelet fails to evict Pods", &data{
setup: setup{
secrets: []*corev1.Secret{
{
ObjectMeta: *newObjectMeta(objMeta, 0),
},
},
aws: []*machinev1.AWSMachineClass{
{
ObjectMeta: *newObjectMeta(objMeta, 0),
Spec: machinev1.AWSMachineClassSpec{
SecretRef: newSecretReference(objMeta, 0),
},
},
},
machines: newMachines(1, &machinev1.MachineTemplateSpec{
ObjectMeta: *newObjectMeta(objMeta, 0),
Spec: machinev1.MachineSpec{
Class: machinev1.ClassSpec{
Kind: "AWSMachineClass",
Name: "machine-0",
},
},
}, nil, nil, nil, nil),
fakeResourceActions: &customfake.ResourceActions{
Node: customfake.Actions{
Update: "Failed to update nodes",
},
},
},
action: action{
machine: "machine-0",
fakeProviderID: "fakeID-0",
fakeNodeName: "fakeNode-0",
fakeError: nil,
nodeRecentlyNotReady: func() *bool { ret := true; return &ret }(),
},
expect: expect{
errOccurred: true,
machineDeleted: false,
},
}),
)
})

Expand Down

0 comments on commit 8ec2a7f

Please sign in to comment.