Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Bugfix: Drain machines with only a valid NodeName #480

Merged
merged 4 commits into from
Jun 30, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion pkg/controller/machine.go
Original file line number Diff line number Diff line change
Expand Up @@ -599,7 +599,9 @@ func (c *controller) machineDelete(machine *v1alpha1.Machine, driver driver.Driv
}
}

if machineID != "" {
if machineID != "" && nodeName != "" {
// Begin drain logic only when the nodeName & providerID exist's for the machine
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
// Begin drain logic only when the nodeName & providerID exist's for the machine
// Begin drain logic only when the nodeName & providerID exists for the machine

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we need this change also in OOT?


var (
forceDeletePods = false
forceDeleteMachine = false
Expand Down
53 changes: 52 additions & 1 deletion pkg/controller/machine_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -822,6 +822,7 @@ var _ = Describe("machine", func() {
machine *machinev1.Machine
errOccurred bool
machineDeleted bool
nodeDeleted bool
}
type data struct {
setup setup
Expand Down Expand Up @@ -948,11 +949,14 @@ var _ = Describe("machine", func() {

if data.expect.machineDeleted {
Expect(machineErr).To(HaveOccurred())
Expect(nodeErr).To(HaveOccurred())
} else {
Expect(machineErr).ToNot(HaveOccurred())
Expect(machine).ToNot(BeNil())
}

if data.expect.nodeDeleted {
Expect(nodeErr).To(HaveOccurred())
} else {
Expect(nodeErr).ToNot(HaveOccurred())
Expect(node).ToNot(BeNil())
}
Expand Down Expand Up @@ -991,6 +995,7 @@ var _ = Describe("machine", func() {
expect: expect{
errOccurred: false,
machineDeleted: true,
nodeDeleted: true,
},
}),
Entry("Allow proper deletion of the machine object when providerID is missing but actual VM still exists in cloud", &data{
Expand Down Expand Up @@ -1030,6 +1035,7 @@ var _ = Describe("machine", func() {
expect: expect{
errOccurred: false,
machineDeleted: true,
nodeDeleted: true,
},
}),
Entry("Machine deletion when drain fails", &data{
Expand Down Expand Up @@ -1071,6 +1077,7 @@ var _ = Describe("machine", func() {
expect: expect{
errOccurred: true,
machineDeleted: false,
nodeDeleted: false,
},
}),
Entry("Machine force deletion label is present", &data{
Expand Down Expand Up @@ -1108,6 +1115,7 @@ var _ = Describe("machine", func() {
expect: expect{
errOccurred: false,
machineDeleted: true,
nodeDeleted: true,
},
}),
Entry("Machine force deletion label is present and when drain call fails (APIServer call fails)", &data{
Expand Down Expand Up @@ -1150,6 +1158,7 @@ var _ = Describe("machine", func() {
expect: expect{
errOccurred: false,
machineDeleted: true,
nodeDeleted: true,
},
}),
Entry("Machine deletion when timeout occurred", &data{
Expand Down Expand Up @@ -1201,6 +1210,7 @@ var _ = Describe("machine", func() {
expect: expect{
errOccurred: false,
machineDeleted: true,
nodeDeleted: true,
},
}),
Entry("Machine deletion when last drain failed", &data{
Expand Down Expand Up @@ -1251,6 +1261,7 @@ var _ = Describe("machine", func() {
expect: expect{
errOccurred: false,
machineDeleted: true,
nodeDeleted: true,
},
}),
Entry("Machine deletion when last drain failed and current drain call also fails (APIServer call fails)", &data{
Expand Down Expand Up @@ -1306,6 +1317,7 @@ var _ = Describe("machine", func() {
expect: expect{
errOccurred: true,
machineDeleted: false,
nodeDeleted: false,
},
}),
Entry("Machine force deletion if underlying Node is NotReady for a long time", &data{
Expand Down Expand Up @@ -1343,6 +1355,7 @@ var _ = Describe("machine", func() {
expect: expect{
errOccurred: false,
machineDeleted: true,
nodeDeleted: true,
},
}),
Entry("Machine do not force deletion if underlying Node is NotReady for a small period of time, a Machine deletion fails, since kubelet fails to evict Pods", &data{
Expand Down Expand Up @@ -1385,6 +1398,44 @@ var _ = Describe("machine", func() {
expect: expect{
errOccurred: true,
machineDeleted: false,
nodeDeleted: false,
},
}),
Entry("Allow machine object deletion where nodeName doesn't exist", &data{
setup: setup{
secrets: []*corev1.Secret{
{
ObjectMeta: *newObjectMeta(objMeta, 0),
},
},
aws: []*machinev1.AWSMachineClass{
{
ObjectMeta: *newObjectMeta(objMeta, 0),
Spec: machinev1.AWSMachineClassSpec{
SecretRef: newSecretReference(objMeta, 0),
},
},
},
machines: newMachines(1, &machinev1.MachineTemplateSpec{
ObjectMeta: *newObjectMeta(objMeta, 0),
Spec: machinev1.MachineSpec{
Class: machinev1.ClassSpec{
Kind: "AWSMachineClass",
Name: "machine-0",
},
},
}, nil, nil, nil, nil),
},
action: action{
machine: "machine-0",
fakeProviderID: "fakeID-0",
fakeNodeName: "", //NodeName is set to emptyString
fakeError: nil,
},
expect: expect{
errOccurred: false,
machineDeleted: true,
nodeDeleted: false,
},
}),
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,9 +45,9 @@ import (
"k8s.io/utils/pointer"
)

func TestMachineControllerManagerSuite(t *testing.T) {
func TestMachineControllerSuite(t *testing.T) {
RegisterFailHandler(Fail)
RunSpecs(t, "Machine Controller Manager Suite")
RunSpecs(t, "Machine Controller Suite")
}

var (
Expand Down
102 changes: 102 additions & 0 deletions pkg/util/provider/machinecontroller/machine_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1226,6 +1226,108 @@ var _ = Describe("machine", func() {
),
},
}),
Entry("Drain skipping as nodeName is not valid", &data{
setup: setup{
secrets: []*corev1.Secret{
{
ObjectMeta: *newObjectMeta(objMeta, 0),
},
},
machineClasses: []*v1alpha1.MachineClass{
{
ObjectMeta: *newObjectMeta(objMeta, 0),
SecretRef: newSecretReference(objMeta, 0),
},
},
machines: newMachines(
1,
&v1alpha1.MachineTemplateSpec{
ObjectMeta: *newObjectMeta(objMeta, 0),
Spec: v1alpha1.MachineSpec{
Class: v1alpha1.ClassSpec{
Kind: "MachineClass",
Name: "machine-0",
},
ProviderID: "fakeID",
},
},
&v1alpha1.MachineStatus{
Node: "fakeNode",
CurrentStatus: v1alpha1.CurrentStatus{
Phase: v1alpha1.MachineTerminating,
LastUpdateTime: metav1.Now(),
},
LastOperation: v1alpha1.LastOperation{
Description: machineutils.InitiateDrain,
State: v1alpha1.MachineStateProcessing,
Type: v1alpha1.MachineOperationDelete,
LastUpdateTime: metav1.Now(),
},
Conditions: []corev1.NodeCondition{
{
Type: corev1.NodeReady,
Status: corev1.ConditionUnknown,
LastTransitionTime: metav1.NewTime(time.Now().Add(-6 * time.Minute)),
},
},
},
nil,
map[string]string{
machineutils.MachinePriority: "3",
},
map[string]string{
"node": "",
},
true,
),
},
action: action{
machine: "machine-0",
fakeDriver: &driver.FakeDriver{
VMExists: true,
ProviderID: "fakeID-0",
NodeName: "",
Err: nil,
},
},
expect: expect{
err: fmt.Errorf("Skipping drain as nodeName is not a valid one for machine. Initiate VM deletion"),
retry: machineutils.RetryOp,
machine: newMachine(
&v1alpha1.MachineTemplateSpec{
ObjectMeta: *newObjectMeta(objMeta, 0),
Spec: v1alpha1.MachineSpec{
Class: v1alpha1.ClassSpec{
Kind: "MachineClass",
Name: "machine-0",
},
ProviderID: "fakeID",
},
},
&v1alpha1.MachineStatus{
Node: "fakeNode",
CurrentStatus: v1alpha1.CurrentStatus{
Phase: v1alpha1.MachineTerminating,
LastUpdateTime: metav1.Now(),
},
LastOperation: v1alpha1.LastOperation{
Description: fmt.Sprintf("Skipping drain as nodeName is not a valid one for machine. Initiate VM deletion"),
State: v1alpha1.MachineStateProcessing,
Type: v1alpha1.MachineOperationDelete,
LastUpdateTime: metav1.Now(),
},
},
nil,
map[string]string{
machineutils.MachinePriority: "3",
},
map[string]string{
"node": "",
},
true,
),
},
}),
Entry("Drain skipping as machine is NotReady for a long time (5 minutes)", &data{
setup: setup{
secrets: []*corev1.Secret{
Expand Down
35 changes: 27 additions & 8 deletions pkg/util/provider/machinecontroller/machine_util.go
Original file line number Diff line number Diff line change
Expand Up @@ -803,6 +803,16 @@ func (c *controller) getVMStatus(getMachineStatusRequest *driver.GetMachineStatu
return retry, err
}

// isValidNodeName checks if the nodeName is valid
func isValidNodeName(nodeName string) bool {
if nodeName == "" {
// if nodeName is empty
return false
}

return true
}

// drainNode attempts to drain the node backed by the machine object
func (c *controller) drainNode(deleteMachineRequest *driver.DeleteMachineRequest) (machineutils.Retry, error) {
var (
Expand All @@ -826,20 +836,29 @@ func (c *controller) drainNode(deleteMachineRequest *driver.DeleteMachineRequest
nodeNotReadyDuration = 5 * time.Minute
)

for _, condition := range machine.Status.Conditions {
if condition.Type == v1.NodeReady && condition.Status != corev1.ConditionTrue && (time.Since(condition.LastTransitionTime.Time) > nodeNotReadyDuration) {
klog.Warningf("Skipping drain for NotReady machine %q", machine.Name)
err = fmt.Errorf("Skipping drain as machine is NotReady for over 5minutes. %s", machineutils.InitiateVMDeletion)
skipDrain = true
if !isValidNodeName(nodeName) {
klog.Warningf("Skipping drain as nodeName is not a valid one for machine %q", machine.Name)
err = fmt.Errorf("Skipping drain as nodeName is not a valid one for machine. %s", machineutils.InitiateVMDeletion)
description = fmt.Sprintf("Skipping drain as nodeName is not a valid one for machine. %s", machineutils.InitiateVMDeletion)
skipDrain = true
} else {
for _, condition := range machine.Status.Conditions {
if condition.Type == v1.NodeReady {
if condition.Status != corev1.ConditionTrue && (time.Since(condition.LastTransitionTime.Time) > nodeNotReadyDuration) {
klog.Warningf("Skipping drain for NotReady machine %q", machine.Name)
err = fmt.Errorf("Skipping drain as machine is NotReady for over 5minutes. %s", machineutils.InitiateVMDeletion)
description = fmt.Sprintf("Skipping drain as machine is NotReady for over 5minutes. %s", machineutils.InitiateVMDeletion)
skipDrain = true
}
// break once the condition is found
break
}
}
}

if skipDrain {
// If not is not ready for over 5 minutes, skip draining this machine
description = fmt.Sprintf("Skipping drain as machine is NotReady for over 5minutes. %s", machineutils.InitiateVMDeletion)
state = v1alpha1.MachineStateProcessing
phase = v1alpha1.MachineTerminating

} else {
// Timeout value obtained by subtracting last operation with expected time out period
timeOut := metav1.Now().Add(-timeOutDuration).Sub(machine.Status.CurrentStatus.LastUpdateTime.Time)
Expand Down