Skip to content

Commit

Permalink
Bugfix: Drain machines with only a valid NodeName (#480)
Browse files Browse the repository at this point in the history
* Bugfix: Drain machines with only a valid NodeName

* Extend fix to OOT provider

* Fixed unit test

* Made suggested change
  • Loading branch information
prashanth26 authored Jun 30, 2020
1 parent fba8a1b commit 8881b03
Show file tree
Hide file tree
Showing 5 changed files with 186 additions and 12 deletions.
4 changes: 3 additions & 1 deletion pkg/controller/machine.go
Original file line number Diff line number Diff line change
Expand Up @@ -599,7 +599,9 @@ func (c *controller) machineDelete(machine *v1alpha1.Machine, driver driver.Driv
}
}

if machineID != "" {
if machineID != "" && nodeName != "" {
// Begin drain logic only when the nodeName & providerID exist's for the machine

var (
forceDeletePods = false
forceDeleteMachine = false
Expand Down
53 changes: 52 additions & 1 deletion pkg/controller/machine_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -822,6 +822,7 @@ var _ = Describe("machine", func() {
machine *machinev1.Machine
errOccurred bool
machineDeleted bool
nodeDeleted bool
}
type data struct {
setup setup
Expand Down Expand Up @@ -948,11 +949,14 @@ var _ = Describe("machine", func() {

if data.expect.machineDeleted {
Expect(machineErr).To(HaveOccurred())
Expect(nodeErr).To(HaveOccurred())
} else {
Expect(machineErr).ToNot(HaveOccurred())
Expect(machine).ToNot(BeNil())
}

if data.expect.nodeDeleted {
Expect(nodeErr).To(HaveOccurred())
} else {
Expect(nodeErr).ToNot(HaveOccurred())
Expect(node).ToNot(BeNil())
}
Expand Down Expand Up @@ -991,6 +995,7 @@ var _ = Describe("machine", func() {
expect: expect{
errOccurred: false,
machineDeleted: true,
nodeDeleted: true,
},
}),
Entry("Allow proper deletion of the machine object when providerID is missing but actual VM still exists in cloud", &data{
Expand Down Expand Up @@ -1030,6 +1035,7 @@ var _ = Describe("machine", func() {
expect: expect{
errOccurred: false,
machineDeleted: true,
nodeDeleted: true,
},
}),
Entry("Machine deletion when drain fails", &data{
Expand Down Expand Up @@ -1071,6 +1077,7 @@ var _ = Describe("machine", func() {
expect: expect{
errOccurred: true,
machineDeleted: false,
nodeDeleted: false,
},
}),
Entry("Machine force deletion label is present", &data{
Expand Down Expand Up @@ -1108,6 +1115,7 @@ var _ = Describe("machine", func() {
expect: expect{
errOccurred: false,
machineDeleted: true,
nodeDeleted: true,
},
}),
Entry("Machine force deletion label is present and when drain call fails (APIServer call fails)", &data{
Expand Down Expand Up @@ -1150,6 +1158,7 @@ var _ = Describe("machine", func() {
expect: expect{
errOccurred: false,
machineDeleted: true,
nodeDeleted: true,
},
}),
Entry("Machine deletion when timeout occurred", &data{
Expand Down Expand Up @@ -1201,6 +1210,7 @@ var _ = Describe("machine", func() {
expect: expect{
errOccurred: false,
machineDeleted: true,
nodeDeleted: true,
},
}),
Entry("Machine deletion when last drain failed", &data{
Expand Down Expand Up @@ -1251,6 +1261,7 @@ var _ = Describe("machine", func() {
expect: expect{
errOccurred: false,
machineDeleted: true,
nodeDeleted: true,
},
}),
Entry("Machine deletion when last drain failed and current drain call also fails (APIServer call fails)", &data{
Expand Down Expand Up @@ -1306,6 +1317,7 @@ var _ = Describe("machine", func() {
expect: expect{
errOccurred: true,
machineDeleted: false,
nodeDeleted: false,
},
}),
Entry("Machine force deletion if underlying Node is NotReady for a long time", &data{
Expand Down Expand Up @@ -1343,6 +1355,7 @@ var _ = Describe("machine", func() {
expect: expect{
errOccurred: false,
machineDeleted: true,
nodeDeleted: true,
},
}),
Entry("Machine do not force deletion if underlying Node is NotReady for a small period of time, a Machine deletion fails, since kubelet fails to evict Pods", &data{
Expand Down Expand Up @@ -1385,6 +1398,44 @@ var _ = Describe("machine", func() {
expect: expect{
errOccurred: true,
machineDeleted: false,
nodeDeleted: false,
},
}),
Entry("Allow machine object deletion where nodeName doesn't exist", &data{
setup: setup{
secrets: []*corev1.Secret{
{
ObjectMeta: *newObjectMeta(objMeta, 0),
},
},
aws: []*machinev1.AWSMachineClass{
{
ObjectMeta: *newObjectMeta(objMeta, 0),
Spec: machinev1.AWSMachineClassSpec{
SecretRef: newSecretReference(objMeta, 0),
},
},
},
machines: newMachines(1, &machinev1.MachineTemplateSpec{
ObjectMeta: *newObjectMeta(objMeta, 0),
Spec: machinev1.MachineSpec{
Class: machinev1.ClassSpec{
Kind: "AWSMachineClass",
Name: "machine-0",
},
},
}, nil, nil, nil, nil),
},
action: action{
machine: "machine-0",
fakeProviderID: "fakeID-0",
fakeNodeName: "", //NodeName is set to emptyString
fakeError: nil,
},
expect: expect{
errOccurred: false,
machineDeleted: true,
nodeDeleted: false,
},
}),
)
Expand Down
4 changes: 2 additions & 2 deletions pkg/util/provider/machinecontroller/controller_suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,9 +45,9 @@ import (
"k8s.io/utils/pointer"
)

func TestMachineControllerManagerSuite(t *testing.T) {
func TestMachineControllerSuite(t *testing.T) {
RegisterFailHandler(Fail)
RunSpecs(t, "Machine Controller Manager Suite")
RunSpecs(t, "Machine Controller Suite")
}

var (
Expand Down
102 changes: 102 additions & 0 deletions pkg/util/provider/machinecontroller/machine_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1226,6 +1226,108 @@ var _ = Describe("machine", func() {
),
},
}),
Entry("Drain skipping as nodeName is not valid", &data{
setup: setup{
secrets: []*corev1.Secret{
{
ObjectMeta: *newObjectMeta(objMeta, 0),
},
},
machineClasses: []*v1alpha1.MachineClass{
{
ObjectMeta: *newObjectMeta(objMeta, 0),
SecretRef: newSecretReference(objMeta, 0),
},
},
machines: newMachines(
1,
&v1alpha1.MachineTemplateSpec{
ObjectMeta: *newObjectMeta(objMeta, 0),
Spec: v1alpha1.MachineSpec{
Class: v1alpha1.ClassSpec{
Kind: "MachineClass",
Name: "machine-0",
},
ProviderID: "fakeID",
},
},
&v1alpha1.MachineStatus{
Node: "fakeNode",
CurrentStatus: v1alpha1.CurrentStatus{
Phase: v1alpha1.MachineTerminating,
LastUpdateTime: metav1.Now(),
},
LastOperation: v1alpha1.LastOperation{
Description: machineutils.InitiateDrain,
State: v1alpha1.MachineStateProcessing,
Type: v1alpha1.MachineOperationDelete,
LastUpdateTime: metav1.Now(),
},
Conditions: []corev1.NodeCondition{
{
Type: corev1.NodeReady,
Status: corev1.ConditionUnknown,
LastTransitionTime: metav1.NewTime(time.Now().Add(-6 * time.Minute)),
},
},
},
nil,
map[string]string{
machineutils.MachinePriority: "3",
},
map[string]string{
"node": "",
},
true,
),
},
action: action{
machine: "machine-0",
fakeDriver: &driver.FakeDriver{
VMExists: true,
ProviderID: "fakeID-0",
NodeName: "",
Err: nil,
},
},
expect: expect{
err: fmt.Errorf("Skipping drain as nodeName is not a valid one for machine. Initiate VM deletion"),
retry: machineutils.RetryOp,
machine: newMachine(
&v1alpha1.MachineTemplateSpec{
ObjectMeta: *newObjectMeta(objMeta, 0),
Spec: v1alpha1.MachineSpec{
Class: v1alpha1.ClassSpec{
Kind: "MachineClass",
Name: "machine-0",
},
ProviderID: "fakeID",
},
},
&v1alpha1.MachineStatus{
Node: "fakeNode",
CurrentStatus: v1alpha1.CurrentStatus{
Phase: v1alpha1.MachineTerminating,
LastUpdateTime: metav1.Now(),
},
LastOperation: v1alpha1.LastOperation{
Description: fmt.Sprintf("Skipping drain as nodeName is not a valid one for machine. Initiate VM deletion"),
State: v1alpha1.MachineStateProcessing,
Type: v1alpha1.MachineOperationDelete,
LastUpdateTime: metav1.Now(),
},
},
nil,
map[string]string{
machineutils.MachinePriority: "3",
},
map[string]string{
"node": "",
},
true,
),
},
}),
Entry("Drain skipping as machine is NotReady for a long time (5 minutes)", &data{
setup: setup{
secrets: []*corev1.Secret{
Expand Down
35 changes: 27 additions & 8 deletions pkg/util/provider/machinecontroller/machine_util.go
Original file line number Diff line number Diff line change
Expand Up @@ -803,6 +803,16 @@ func (c *controller) getVMStatus(getMachineStatusRequest *driver.GetMachineStatu
return retry, err
}

// isValidNodeName checks if the nodeName is valid
func isValidNodeName(nodeName string) bool {
if nodeName == "" {
// if nodeName is empty
return false
}

return true
}

// drainNode attempts to drain the node backed by the machine object
func (c *controller) drainNode(deleteMachineRequest *driver.DeleteMachineRequest) (machineutils.Retry, error) {
var (
Expand All @@ -826,20 +836,29 @@ func (c *controller) drainNode(deleteMachineRequest *driver.DeleteMachineRequest
nodeNotReadyDuration = 5 * time.Minute
)

for _, condition := range machine.Status.Conditions {
if condition.Type == v1.NodeReady && condition.Status != corev1.ConditionTrue && (time.Since(condition.LastTransitionTime.Time) > nodeNotReadyDuration) {
klog.Warningf("Skipping drain for NotReady machine %q", machine.Name)
err = fmt.Errorf("Skipping drain as machine is NotReady for over 5minutes. %s", machineutils.InitiateVMDeletion)
skipDrain = true
if !isValidNodeName(nodeName) {
klog.Warningf("Skipping drain as nodeName is not a valid one for machine %q", machine.Name)
err = fmt.Errorf("Skipping drain as nodeName is not a valid one for machine. %s", machineutils.InitiateVMDeletion)
description = fmt.Sprintf("Skipping drain as nodeName is not a valid one for machine. %s", machineutils.InitiateVMDeletion)
skipDrain = true
} else {
for _, condition := range machine.Status.Conditions {
if condition.Type == v1.NodeReady {
if condition.Status != corev1.ConditionTrue && (time.Since(condition.LastTransitionTime.Time) > nodeNotReadyDuration) {
klog.Warningf("Skipping drain for NotReady machine %q", machine.Name)
err = fmt.Errorf("Skipping drain as machine is NotReady for over 5minutes. %s", machineutils.InitiateVMDeletion)
description = fmt.Sprintf("Skipping drain as machine is NotReady for over 5minutes. %s", machineutils.InitiateVMDeletion)
skipDrain = true
}
// break once the condition is found
break
}
}
}

if skipDrain {
// If not is not ready for over 5 minutes, skip draining this machine
description = fmt.Sprintf("Skipping drain as machine is NotReady for over 5minutes. %s", machineutils.InitiateVMDeletion)
state = v1alpha1.MachineStateProcessing
phase = v1alpha1.MachineTerminating

} else {
// Timeout value obtained by subtracting last operation with expected time out period
timeOut := metav1.Now().Add(-timeOutDuration).Sub(machine.Status.CurrentStatus.LastUpdateTime.Time)
Expand Down

0 comments on commit 8881b03

Please sign in to comment.