Skip to content

Commit

Permalink
Introduce constant backoff while enqueuing machine objects on creatio…
Browse files Browse the repository at this point in the history
…n and deletion failures

Co-authored-by: Prashanth <prashanth@sap.com>
  • Loading branch information
hardikdr and prashanth26 committed Oct 5, 2020
1 parent abf38c8 commit b5a6571
Show file tree
Hide file tree
Showing 7 changed files with 250 additions and 36 deletions.
3 changes: 3 additions & 0 deletions pkg/apis/machine/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -222,6 +222,9 @@ const (

// MachineFailed means operation failed leading to machine status failure
MachineFailed MachinePhase = "Failed"

// MachineCrashLoopBackOff means creation or deletion of the machine is failing.
MachineCrashLoopBackOff MachinePhase = "CrashLoopBackOff"
)

// MachinePhase is a label for the condition of a machines at the current time.
Expand Down
5 changes: 4 additions & 1 deletion pkg/apis/machine/v1alpha1/machine_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -146,9 +146,12 @@ const (

// MachineFailed means operation failed leading to machine status failure
MachineFailed MachinePhase = "Failed"

// MachineCrashLoopBackOff means creation or deletion of the machine is failing.
MachineCrashLoopBackOff MachinePhase = "CrashLoopBackOff"
)

// MachinePhase is a label for the condition of a machines at the current time.
// MachineState is a current state of the machine.
type MachineState string

// These are the valid statuses of machines.
Expand Down
13 changes: 7 additions & 6 deletions pkg/controller/controller_utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -767,12 +767,13 @@ func (s ActiveMachines) Less(i, j int) bool {
// the lower the priority, the more likely
// it is to be deleted
m := map[v1alpha1.MachinePhase]int{
v1alpha1.MachineTerminating: 0,
v1alpha1.MachineFailed: 1,
v1alpha1.MachineUnknown: 2,
v1alpha1.MachinePending: 3,
v1alpha1.MachineAvailable: 4,
v1alpha1.MachineRunning: 5,
v1alpha1.MachineTerminating: 0,
v1alpha1.MachineFailed: 1,
v1alpha1.MachineCrashLoopBackOff: 2,
v1alpha1.MachineUnknown: 3,
v1alpha1.MachinePending: 4,
v1alpha1.MachineAvailable: 5,
v1alpha1.MachineRunning: 6,
}

// Case-1: Initially we try to prioritize machine deletion based on
Expand Down
2 changes: 1 addition & 1 deletion pkg/controller/deployment_machineset_util.go
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ func calculateMachineSetStatus(is *v1alpha1.MachineSet, filteredMachines []*v1al
// Count the number of machines that have labels matching the labels of the machine
// template of the machine set, the matching machines may have more
// labels than are in the template. Because the label of machineTemplateSpec is
// a supeiset of the selector of the machine set, so the possible
// a superset of the selector of the machine set, so the possible
// matching machines must be part of the filteredmachines.
fullyLabeledReplicasCount := 0
readyReplicasCount := 0
Expand Down
131 changes: 106 additions & 25 deletions pkg/controller/machine.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,12 +27,14 @@ import (

"k8s.io/klog"

corev1 "k8s.io/api/core/v1"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/selection"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/client-go/tools/cache"

apiequality "k8s.io/apimachinery/pkg/api/equality"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

Expand All @@ -48,6 +50,9 @@ const (
// priority the more likely it is to be deleted first
// Default priority for a machine is set to 3
MachinePriority = "machinepriority.machine.sapcloud.io"

// MachineEnqueueRetryPeriod period after which a machine object is re-enqueued upon creation or deletion failures.
MachineEnqueueRetryPeriod = 2 * time.Minute
)

/*
Expand Down Expand Up @@ -158,7 +163,8 @@ func (c *controller) reconcileClusterMachine(machine *v1alpha1.Machine) error {
// Validate MachineClass
MachineClass, secretRef, err := c.validateMachineClass(&machine.Spec.Class)
if err != nil || secretRef == nil {
return err
c.enqueueMachineAfter(machine, MachineEnqueueRetryPeriod)
return nil
}

driver := driver.NewDriver(machine.Spec.ProviderID, secretRef, machine.Spec.Class.Kind, MachineClass, machine.Name)
Expand Down Expand Up @@ -200,7 +206,8 @@ func (c *controller) reconcileClusterMachine(machine *v1alpha1.Machine) error {
if machine.DeletionTimestamp != nil {
// Processing of delete event
if err := c.machineDelete(machine, driver); err != nil {
return err
c.enqueueMachineAfter(machine, MachineEnqueueRetryPeriod)
return nil
}
} else if machine.Status.CurrentStatus.TimeoutActive {
// Processing machine
Expand All @@ -213,7 +220,8 @@ func (c *controller) reconcileClusterMachine(machine *v1alpha1.Machine) error {
return nil
} else if actualProviderID == "" {
if err := c.machineCreate(machine, driver); err != nil {
return err
c.enqueueMachineAfter(machine, MachineEnqueueRetryPeriod)
return nil
}
} else if actualProviderID != machine.Spec.ProviderID {
if err := c.machineUpdate(machine, actualProviderID); err != nil {
Expand All @@ -231,6 +239,12 @@ func (c *controller) reconcileClusterMachine(machine *v1alpha1.Machine) error {
*/
func (c *controller) addNodeToMachine(obj interface{}) {

node := obj.(*corev1.Node)
if node == nil {
klog.Errorf("Couldn't convert to node from object")
return
}

key, err := cache.DeletionHandlingMetaNamespaceKeyFunc(obj)
if err != nil {
klog.Errorf("Couldn't get key for object %+v: %v", obj, err)
Expand All @@ -245,16 +259,39 @@ func (c *controller) addNodeToMachine(obj interface{}) {
return
}

klog.V(4).Infof("Add machine object backing node %q", machine.Name)
c.enqueueMachine(machine)
if machine.Status.CurrentStatus.Phase != v1alpha1.MachineCrashLoopBackOff && nodeConditionsHaveChanged(machine.Status.Conditions, node.Status.Conditions) {
klog.V(2).Infof("Enqueue machine object %q as backing node's conditions have changed", machine.Name)
c.enqueueMachine(machine)
}
}

func (c *controller) updateNodeToMachine(oldObj, newObj interface{}) {
c.addNodeToMachine(newObj)
}

func (c *controller) deleteNodeToMachine(obj interface{}) {
c.addNodeToMachine(obj)

node := obj.(*corev1.Node)
if node == nil {
klog.Errorf("Couldn't convert to node from object")
return
}

key, err := cache.DeletionHandlingMetaNamespaceKeyFunc(obj)
if err != nil {
klog.Errorf("Couldn't get key for object %+v: %v", obj, err)
return
}

machine, err := c.getMachineFromNode(key)
if err != nil {
klog.Errorf("Couldn't fetch machine %s, Error: %s", key, err)
return
} else if machine == nil {
return
}

c.enqueueMachine(machine)
}

/*
Expand Down Expand Up @@ -411,6 +448,26 @@ func (c *controller) machineCreate(machine *v1alpha1.Machine, driver driver.Driv
// Before actually creating the machine, we should once check and adopt if the virtual machine already exists.
VMList, err := driver.GetVMs("")
if err != nil {

klog.Errorf("Error while listing machine %s: %s", machine.Name, err.Error())
lastOperation := v1alpha1.LastOperation{
Description: "Cloud provider message - " + err.Error(),
State: v1alpha1.MachineStateFailed,
Type: v1alpha1.MachineOperationCreate,
LastUpdateTime: metav1.Now(),
}
currentStatus := v1alpha1.CurrentStatus{
Phase: v1alpha1.MachineCrashLoopBackOff,
TimeoutActive: false,
LastUpdateTime: metav1.Now(),
}
c.updateMachineStatus(machine, lastOperation, currentStatus)

// Delete the bootstrap token
if err := c.deleteBootstrapToken(machine.Name); err != nil {
klog.Warning(err)
}

klog.Errorf("Failed to list VMs before creating machine %q %+v", machine.Name, err)
return err
}
Expand All @@ -433,7 +490,7 @@ func (c *controller) machineCreate(machine *v1alpha1.Machine, driver driver.Driv
LastUpdateTime: metav1.Now(),
}
currentStatus := v1alpha1.CurrentStatus{
Phase: v1alpha1.MachineFailed,
Phase: v1alpha1.MachineCrashLoopBackOff,
TimeoutActive: false,
LastUpdateTime: metav1.Now(),
}
Expand Down Expand Up @@ -692,22 +749,23 @@ func (c *controller) machineDelete(machine *v1alpha1.Machine, driver driver.Driv
klog.V(2).Infof("Machine %q on deletion doesn't have a providerID attached to it. Checking for any VM linked to this machine object.", machine.Name)
// As MachineID is missing, we should check once if actual VM was created but MachineID was not updated on machine-object.
// We list VMs and check if any one them map with the given machine-object.
VMList, err := driver.GetVMs("")
if err != nil {
klog.Errorf("Failed to list VMs while deleting the machine %q %v", machine.Name, err)
return err
}
for providerID, machineName := range VMList {
if machineName == machine.Name {
klog.V(2).Infof("Deleting the VM %s backing the machine-object %s.", providerID, machine.Name)
err = driver.Delete(providerID)
if err != nil {
klog.Errorf("Error deleting the VM %s backing the machine-object %s due to error %v", providerID, machine.Name, err)
// Not returning error so that status on the machine object can be updated in the next step if errored.
} else {
klog.V(2).Infof("VM %s backing the machine-object %s is deleted successfully", providerID, machine.Name)
var VMList map[string]string
VMList, err = driver.GetVMs("")
if err == nil {
for providerID, machineName := range VMList {
if machineName == machine.Name {
klog.V(2).Infof("Deleting the VM %s backing the machine-object %s.", providerID, machine.Name)
err = driver.Delete(providerID)
if err != nil {
klog.Errorf("Error deleting the VM %s backing the machine-object %s due to error %v", providerID, machine.Name, err)
// Not returning error so that status on the machine object can be updated in the next step if errored.
} else {
klog.V(2).Infof("VM %s backing the machine-object %s is deleted successfully", providerID, machine.Name)
}
}
}
} else {
klog.Errorf("Failed to list VMs while deleting the machine %q %v", machine.Name, err)
}
}

Expand All @@ -722,7 +780,7 @@ func (c *controller) machineDelete(machine *v1alpha1.Machine, driver driver.Driv
LastUpdateTime: metav1.Now(),
}
currentStatus := v1alpha1.CurrentStatus{
Phase: v1alpha1.MachineFailed,
Phase: v1alpha1.MachineTerminating,
TimeoutActive: false,
LastUpdateTime: metav1.Now(),
}
Expand Down Expand Up @@ -775,14 +833,18 @@ func (c *controller) updateMachineStatus(
currentStatus v1alpha1.CurrentStatus,
) (*v1alpha1.Machine, error) {
// Get the latest version of the machine so that we can avoid conflicts
clone, err := c.controlMachineClient.Machines(machine.Namespace).Get(machine.Name, metav1.GetOptions{})
latestMachine, err := c.controlMachineClient.Machines(machine.Namespace).Get(machine.Name, metav1.GetOptions{})
if err != nil {
return machine, err
return nil, err
}
clone := latestMachine.DeepCopy()

clone = clone.DeepCopy()
clone.Status.LastOperation = lastOperation
clone.Status.CurrentStatus = currentStatus
if isMachineStatusEqual(clone.Status, machine.Status) {
klog.V(3).Infof("Not updating the status of the machine object %q , as it is already same", clone.Name)
return machine, nil
}

clone, err = c.controlMachineClient.Machines(clone.Namespace).UpdateStatus(clone)
if err != nil {
Expand All @@ -793,6 +855,25 @@ func (c *controller) updateMachineStatus(
return clone, nil
}

// isMachineStatusEqual checks if the status of 2 machines is similar or not.
func isMachineStatusEqual(s1, s2 v1alpha1.MachineStatus) bool {
tolerateTimeDiff := 30 * time.Minute
s1Copy, s2Copy := s1.DeepCopy(), s2.DeepCopy()
s1Copy.LastOperation.Description, s2Copy.LastOperation.Description = "", ""

if (s1Copy.LastOperation.LastUpdateTime.Time.Before(time.Now().Add(tolerateTimeDiff * -1))) || (s2Copy.LastOperation.LastUpdateTime.Time.Before(time.Now().Add(tolerateTimeDiff * -1))) {
return false
}
s1Copy.LastOperation.LastUpdateTime, s2Copy.LastOperation.LastUpdateTime = metav1.Time{}, metav1.Time{}

if (s1Copy.CurrentStatus.LastUpdateTime.Time.Before(time.Now().Add(tolerateTimeDiff * -1))) || (s2Copy.CurrentStatus.LastUpdateTime.Time.Before(time.Now().Add(tolerateTimeDiff * -1))) {
return false
}
s1Copy.CurrentStatus.LastUpdateTime, s2Copy.CurrentStatus.LastUpdateTime = metav1.Time{}, metav1.Time{}

return apiequality.Semantic.DeepEqual(s1Copy.LastOperation, s2Copy.LastOperation) && apiequality.Semantic.DeepEqual(s1Copy.CurrentStatus, s2Copy.CurrentStatus)
}

func (c *controller) updateMachineConditions(machine *v1alpha1.Machine, conditions []v1.NodeCondition) (*v1alpha1.Machine, error) {

var (
Expand Down
Loading

0 comments on commit b5a6571

Please sign in to comment.