Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Introduce delay in re-enqueuing machines during creation--deletion failures. #523

Merged
merged 1 commit into from
Oct 6, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions pkg/apis/machine/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -222,6 +222,9 @@ const (

// MachineFailed means operation failed leading to machine status failure
MachineFailed MachinePhase = "Failed"

// MachineCrashLoopBackOff means creation or deletion of the machine is failing.
hardikdr marked this conversation as resolved.
Show resolved Hide resolved
MachineCrashLoopBackOff MachinePhase = "CrashLoopBackOff"
)

// MachinePhase is a label for the condition of a machines at the current time.
Expand Down
5 changes: 4 additions & 1 deletion pkg/apis/machine/v1alpha1/machine_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -146,9 +146,12 @@ const (

// MachineFailed means operation failed leading to machine status failure
MachineFailed MachinePhase = "Failed"

// MachineCrashLoopBackOff means creation or deletion of the machine is failing.
hardikdr marked this conversation as resolved.
Show resolved Hide resolved
MachineCrashLoopBackOff MachinePhase = "CrashLoopBackOff"
)

// MachinePhase is a label for the condition of a machines at the current time.
// MachineState is a current state of the machine.
type MachineState string

// These are the valid statuses of machines.
Expand Down
13 changes: 7 additions & 6 deletions pkg/controller/controller_utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -767,12 +767,13 @@ func (s ActiveMachines) Less(i, j int) bool {
// the lower the priority, the more likely
// it is to be deleted
m := map[v1alpha1.MachinePhase]int{
v1alpha1.MachineTerminating: 0,
v1alpha1.MachineFailed: 1,
v1alpha1.MachineUnknown: 2,
v1alpha1.MachinePending: 3,
v1alpha1.MachineAvailable: 4,
v1alpha1.MachineRunning: 5,
v1alpha1.MachineTerminating: 0,
v1alpha1.MachineFailed: 1,
v1alpha1.MachineCrashLoopBackOff: 2,
v1alpha1.MachineUnknown: 3,
v1alpha1.MachinePending: 4,
v1alpha1.MachineAvailable: 5,
v1alpha1.MachineRunning: 6,
}

// Case-1: Initially we try to prioritize machine deletion based on
Expand Down
2 changes: 1 addition & 1 deletion pkg/controller/deployment_machineset_util.go
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ func calculateMachineSetStatus(is *v1alpha1.MachineSet, filteredMachines []*v1al
// Count the number of machines that have labels matching the labels of the machine
// template of the machine set, the matching machines may have more
// labels than are in the template. Because the label of machineTemplateSpec is
// a supeiset of the selector of the machine set, so the possible
// a superset of the selector of the machine set, so the possible
// matching machines must be part of the filteredmachines.
fullyLabeledReplicasCount := 0
readyReplicasCount := 0
Expand Down
131 changes: 106 additions & 25 deletions pkg/controller/machine.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,12 +27,14 @@ import (

"k8s.io/klog"

corev1 "k8s.io/api/core/v1"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/selection"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/client-go/tools/cache"

apiequality "k8s.io/apimachinery/pkg/api/equality"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

Expand All @@ -48,6 +50,9 @@ const (
// priority the more likely it is to be deleted first
// Default priority for a machine is set to 3
MachinePriority = "machinepriority.machine.sapcloud.io"

// MachineEnqueueRetryPeriod period after which a machine object is re-enqueued upon creation or deletion failures.
MachineEnqueueRetryPeriod = 2 * time.Minute
)

/*
Expand Down Expand Up @@ -158,7 +163,8 @@ func (c *controller) reconcileClusterMachine(machine *v1alpha1.Machine) error {
// Validate MachineClass
MachineClass, secretRef, err := c.validateMachineClass(&machine.Spec.Class)
if err != nil || secretRef == nil {
return err
c.enqueueMachineAfter(machine, MachineEnqueueRetryPeriod)
return nil
}

driver := driver.NewDriver(machine.Spec.ProviderID, secretRef, machine.Spec.Class.Kind, MachineClass, machine.Name)
Expand Down Expand Up @@ -200,7 +206,8 @@ func (c *controller) reconcileClusterMachine(machine *v1alpha1.Machine) error {
if machine.DeletionTimestamp != nil {
// Processing of delete event
if err := c.machineDelete(machine, driver); err != nil {
return err
c.enqueueMachineAfter(machine, MachineEnqueueRetryPeriod)
return nil
}
} else if machine.Status.CurrentStatus.TimeoutActive {
// Processing machine
Expand All @@ -213,7 +220,8 @@ func (c *controller) reconcileClusterMachine(machine *v1alpha1.Machine) error {
return nil
} else if actualProviderID == "" {
if err := c.machineCreate(machine, driver); err != nil {
return err
c.enqueueMachineAfter(machine, MachineEnqueueRetryPeriod)
return nil
}
} else if actualProviderID != machine.Spec.ProviderID {
if err := c.machineUpdate(machine, actualProviderID); err != nil {
Expand All @@ -231,6 +239,12 @@ func (c *controller) reconcileClusterMachine(machine *v1alpha1.Machine) error {
*/
func (c *controller) addNodeToMachine(obj interface{}) {

node := obj.(*corev1.Node)
if node == nil {
klog.Errorf("Couldn't convert to node from object")
return
}

key, err := cache.DeletionHandlingMetaNamespaceKeyFunc(obj)
if err != nil {
klog.Errorf("Couldn't get key for object %+v: %v", obj, err)
Expand All @@ -245,16 +259,39 @@ func (c *controller) addNodeToMachine(obj interface{}) {
return
}

klog.V(4).Infof("Add machine object backing node %q", machine.Name)
c.enqueueMachine(machine)
if machine.Status.CurrentStatus.Phase != v1alpha1.MachineCrashLoopBackOff && nodeConditionsHaveChanged(machine.Status.Conditions, node.Status.Conditions) {
klog.V(2).Infof("Enqueue machine object %q as backing node's conditions have changed", machine.Name)
c.enqueueMachine(machine)
}
}

func (c *controller) updateNodeToMachine(oldObj, newObj interface{}) {
c.addNodeToMachine(newObj)
}

func (c *controller) deleteNodeToMachine(obj interface{}) {
c.addNodeToMachine(obj)

node := obj.(*corev1.Node)
if node == nil {
klog.Errorf("Couldn't convert to node from object")
return
}

key, err := cache.DeletionHandlingMetaNamespaceKeyFunc(obj)
if err != nil {
klog.Errorf("Couldn't get key for object %+v: %v", obj, err)
return
}

machine, err := c.getMachineFromNode(key)
if err != nil {
klog.Errorf("Couldn't fetch machine %s, Error: %s", key, err)
return
} else if machine == nil {
return
}

c.enqueueMachine(machine)
}

/*
Expand Down Expand Up @@ -411,6 +448,26 @@ func (c *controller) machineCreate(machine *v1alpha1.Machine, driver driver.Driv
// Before actually creating the machine, we should once check and adopt if the virtual machine already exists.
VMList, err := driver.GetVMs("")
if err != nil {

klog.Errorf("Error while listing machine %s: %s", machine.Name, err.Error())
lastOperation := v1alpha1.LastOperation{
Description: "Cloud provider message - " + err.Error(),
State: v1alpha1.MachineStateFailed,
Type: v1alpha1.MachineOperationCreate,
LastUpdateTime: metav1.Now(),
}
currentStatus := v1alpha1.CurrentStatus{
Phase: v1alpha1.MachineCrashLoopBackOff,
TimeoutActive: false,
LastUpdateTime: metav1.Now(),
}
c.updateMachineStatus(machine, lastOperation, currentStatus)

// Delete the bootstrap token
if err := c.deleteBootstrapToken(machine.Name); err != nil {
klog.Warning(err)
}

klog.Errorf("Failed to list VMs before creating machine %q %+v", machine.Name, err)
return err
}
Expand All @@ -433,7 +490,7 @@ func (c *controller) machineCreate(machine *v1alpha1.Machine, driver driver.Driv
LastUpdateTime: metav1.Now(),
}
currentStatus := v1alpha1.CurrentStatus{
Phase: v1alpha1.MachineFailed,
Phase: v1alpha1.MachineCrashLoopBackOff,
prashanth26 marked this conversation as resolved.
Show resolved Hide resolved
TimeoutActive: false,
LastUpdateTime: metav1.Now(),
}
Expand Down Expand Up @@ -692,22 +749,23 @@ func (c *controller) machineDelete(machine *v1alpha1.Machine, driver driver.Driv
klog.V(2).Infof("Machine %q on deletion doesn't have a providerID attached to it. Checking for any VM linked to this machine object.", machine.Name)
// As MachineID is missing, we should check once if actual VM was created but MachineID was not updated on machine-object.
// We list VMs and check if any one them map with the given machine-object.
VMList, err := driver.GetVMs("")
if err != nil {
klog.Errorf("Failed to list VMs while deleting the machine %q %v", machine.Name, err)
return err
}
for providerID, machineName := range VMList {
if machineName == machine.Name {
klog.V(2).Infof("Deleting the VM %s backing the machine-object %s.", providerID, machine.Name)
err = driver.Delete(providerID)
if err != nil {
klog.Errorf("Error deleting the VM %s backing the machine-object %s due to error %v", providerID, machine.Name, err)
// Not returning error so that status on the machine object can be updated in the next step if errored.
} else {
klog.V(2).Infof("VM %s backing the machine-object %s is deleted successfully", providerID, machine.Name)
var VMList map[string]string
VMList, err = driver.GetVMs("")
if err == nil {
for providerID, machineName := range VMList {
if machineName == machine.Name {
klog.V(2).Infof("Deleting the VM %s backing the machine-object %s.", providerID, machine.Name)
err = driver.Delete(providerID)
if err != nil {
klog.Errorf("Error deleting the VM %s backing the machine-object %s due to error %v", providerID, machine.Name, err)
// Not returning error so that status on the machine object can be updated in the next step if errored.
} else {
klog.V(2).Infof("VM %s backing the machine-object %s is deleted successfully", providerID, machine.Name)
}
}
}
} else {
klog.Errorf("Failed to list VMs while deleting the machine %q %v", machine.Name, err)
}
}

Expand All @@ -722,7 +780,7 @@ func (c *controller) machineDelete(machine *v1alpha1.Machine, driver driver.Driv
LastUpdateTime: metav1.Now(),
}
currentStatus := v1alpha1.CurrentStatus{
Phase: v1alpha1.MachineFailed,
Phase: v1alpha1.MachineTerminating,
TimeoutActive: false,
LastUpdateTime: metav1.Now(),
}
Expand Down Expand Up @@ -775,14 +833,18 @@ func (c *controller) updateMachineStatus(
currentStatus v1alpha1.CurrentStatus,
) (*v1alpha1.Machine, error) {
// Get the latest version of the machine so that we can avoid conflicts
clone, err := c.controlMachineClient.Machines(machine.Namespace).Get(machine.Name, metav1.GetOptions{})
latestMachine, err := c.controlMachineClient.Machines(machine.Namespace).Get(machine.Name, metav1.GetOptions{})
if err != nil {
return machine, err
return nil, err
}
clone := latestMachine.DeepCopy()

clone = clone.DeepCopy()
clone.Status.LastOperation = lastOperation
clone.Status.CurrentStatus = currentStatus
if isMachineStatusEqual(clone.Status, machine.Status) {
klog.V(3).Infof("Not updating the status of the machine object %q , as it is already same", clone.Name)
hardikdr marked this conversation as resolved.
Show resolved Hide resolved
return machine, nil
}

clone, err = c.controlMachineClient.Machines(clone.Namespace).UpdateStatus(clone)
if err != nil {
Expand All @@ -793,6 +855,25 @@ func (c *controller) updateMachineStatus(
return clone, nil
}

// isMachineStatusEqual checks if the status of 2 machines is similar or not.
func isMachineStatusEqual(s1, s2 v1alpha1.MachineStatus) bool {
tolerateTimeDiff := 30 * time.Minute
s1Copy, s2Copy := s1.DeepCopy(), s2.DeepCopy()
s1Copy.LastOperation.Description, s2Copy.LastOperation.Description = "", ""

if (s1Copy.LastOperation.LastUpdateTime.Time.Before(time.Now().Add(tolerateTimeDiff * -1))) || (s2Copy.LastOperation.LastUpdateTime.Time.Before(time.Now().Add(tolerateTimeDiff * -1))) {
return false
}
s1Copy.LastOperation.LastUpdateTime, s2Copy.LastOperation.LastUpdateTime = metav1.Time{}, metav1.Time{}

if (s1Copy.CurrentStatus.LastUpdateTime.Time.Before(time.Now().Add(tolerateTimeDiff * -1))) || (s2Copy.CurrentStatus.LastUpdateTime.Time.Before(time.Now().Add(tolerateTimeDiff * -1))) {
return false
}
s1Copy.CurrentStatus.LastUpdateTime, s2Copy.CurrentStatus.LastUpdateTime = metav1.Time{}, metav1.Time{}
hardikdr marked this conversation as resolved.
Show resolved Hide resolved

return apiequality.Semantic.DeepEqual(s1Copy.LastOperation, s2Copy.LastOperation) && apiequality.Semantic.DeepEqual(s1Copy.CurrentStatus, s2Copy.CurrentStatus)
}

func (c *controller) updateMachineConditions(machine *v1alpha1.Machine, conditions []v1.NodeCondition) (*v1alpha1.Machine, error) {

var (
Expand Down
Loading