Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Inhibit scale-down by autoscaler during roll-outs. #496

Merged
merged 3 commits into from
Sep 1, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions cmd/machine-controller-manager/app/controllermanager.go
Original file line number Diff line number Diff line change
Expand Up @@ -273,6 +273,7 @@ func StartControllers(s *options.MCMServer,
s.NodeConditions,
s.BootstrapTokenAuthExtraGroups,
s.DeleteMigratedMachineClass,
s.AutoscalerScaleDownAnnotationDuringRollout,
)
if err != nil {
return err
Expand Down
3 changes: 3 additions & 0 deletions cmd/machine-controller-manager/app/options/options.go
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ func NewMCMServer() *MCMServer {
KubeAPIBurst: 30,
LeaderElection: leaderelectionconfig.DefaultLeaderElectionConfiguration(),
ControllerStartInterval: metav1.Duration{Duration: 0 * time.Second},
AutoscalerScaleDownAnnotationDuringRollout: true,
SafetyOptions: machineconfig.SafetyOptions{
SafetyUp: 2,
SafetyDown: 1,
Expand Down Expand Up @@ -115,6 +116,8 @@ func (s *MCMServer) AddFlags(fs *pflag.FlagSet) {
fs.StringVar(&s.BootstrapTokenAuthExtraGroups, "bootstrap-token-auth-extra-groups", s.BootstrapTokenAuthExtraGroups, "Comma-separated list of groups to set bootstrap token's \"auth-extra-groups\" field to")
fs.BoolVar(&s.DeleteMigratedMachineClass, "delete-migrated-machine-class", false, "Deletes any (provider specific) machine class that has the machine.sapcloud.io/migrated annotation")

fs.BoolVar(&s.AutoscalerScaleDownAnnotationDuringRollout, "autoscaler-scaldown-annotation-during-rollout", true, "Add cluster autoscaler scale-down disabled annotation during roll-out.")
hardikdr marked this conversation as resolved.
Show resolved Hide resolved

leaderelectionconfig.BindFlags(&s.LeaderElection, fs)
// TODO: DefaultFeatureGate is global and it adds all k8s flags
// utilfeature.DefaultFeatureGate.AddFlag(fs)
Expand Down
59 changes: 31 additions & 28 deletions pkg/controller/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -81,32 +81,34 @@ func NewController(
nodeConditions string,
bootstrapTokenAuthExtraGroups string,
deleteMigratedMachineClass bool,
autoscalerScaleDownAnnotationDuringRollout bool,
) (Controller, error) {
controller := &controller{
namespace: namespace,
controlMachineClient: controlMachineClient,
controlCoreClient: controlCoreClient,
targetCoreClient: targetCoreClient,
recorder: recorder,
expectations: NewUIDTrackingContExpectations(NewContExpectations()),
secretQueue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "secret"),
nodeQueue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "node"),
openStackMachineClassQueue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "openstackmachineclass"),
awsMachineClassQueue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "awsmachineclass"),
azureMachineClassQueue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "azuremachineclass"),
gcpMachineClassQueue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "gcpmachineclass"),
alicloudMachineClassQueue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "alicloudmachineclass"),
packetMachineClassQueue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "packetmachineclass"),
machineQueue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "machine"),
machineSetQueue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "machineset"),
machineDeploymentQueue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "machinedeployment"),
machineSafetyOrphanVMsQueue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "machinesafetyorphanvms"),
machineSafetyOvershootingQueue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "machinesafetyovershooting"),
machineSafetyAPIServerQueue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "machinesafetyapiserver"),
safetyOptions: safetyOptions,
nodeConditions: nodeConditions,
bootstrapTokenAuthExtraGroups: bootstrapTokenAuthExtraGroups,
deleteMigratedMachineClass: deleteMigratedMachineClass,
namespace: namespace,
controlMachineClient: controlMachineClient,
controlCoreClient: controlCoreClient,
targetCoreClient: targetCoreClient,
recorder: recorder,
expectations: NewUIDTrackingContExpectations(NewContExpectations()),
secretQueue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "secret"),
nodeQueue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "node"),
openStackMachineClassQueue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "openstackmachineclass"),
awsMachineClassQueue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "awsmachineclass"),
azureMachineClassQueue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "azuremachineclass"),
gcpMachineClassQueue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "gcpmachineclass"),
alicloudMachineClassQueue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "alicloudmachineclass"),
packetMachineClassQueue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "packetmachineclass"),
machineQueue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "machine"),
machineSetQueue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "machineset"),
machineDeploymentQueue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "machinedeployment"),
machineSafetyOrphanVMsQueue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "machinesafetyorphanvms"),
machineSafetyOvershootingQueue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "machinesafetyovershooting"),
machineSafetyAPIServerQueue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "machinesafetyapiserver"),
safetyOptions: safetyOptions,
nodeConditions: nodeConditions,
bootstrapTokenAuthExtraGroups: bootstrapTokenAuthExtraGroups,
deleteMigratedMachineClass: deleteMigratedMachineClass,
autoscalerScaleDownAnnotationDuringRollout: autoscalerScaleDownAnnotationDuringRollout,
}

controller.internalExternalScheme = runtime.NewScheme()
Expand Down Expand Up @@ -398,10 +400,11 @@ type Controller interface {

// controller is a concrete Controller.
type controller struct {
namespace string
nodeConditions string
bootstrapTokenAuthExtraGroups string
deleteMigratedMachineClass bool
namespace string
nodeConditions string
bootstrapTokenAuthExtraGroups string
deleteMigratedMachineClass bool
autoscalerScaleDownAnnotationDuringRollout bool

controlMachineClient machineapi.MachineV1alpha1Interface
controlCoreClient kubernetes.Interface
Expand Down
128 changes: 128 additions & 0 deletions pkg/controller/controller_utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,10 +33,12 @@ import (
"sync/atomic"
"time"

"k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/api/validation"

"github.com/gardener/machine-controller-manager/pkg/apis/machine/v1alpha1"
machineapi "github.com/gardener/machine-controller-manager/pkg/client/clientset/versioned/typed/machine/v1alpha1"
annotationsutils "github.com/gardener/machine-controller-manager/pkg/util/annotations"
conditionutils "github.com/gardener/machine-controller-manager/pkg/util/conditions"
hashutil "github.com/gardener/machine-controller-manager/pkg/util/hash"
taintutils "github.com/gardener/machine-controller-manager/pkg/util/taints"
Expand Down Expand Up @@ -96,6 +98,13 @@ var Backoff = wait.Backoff{
Jitter: 1.0,
}

// UpdateAnnotationBackoff holds the retry settings used while updating node
// annotations. The annotation helpers in this file pass it to
// clientretry.RetryOnConflict.
var UpdateAnnotationBackoff = wait.Backoff{
	Duration: 100 * time.Millisecond,
	Jitter:   1.0,
	Steps:    5,
}

var (
// KeyFunc is the variable that stores the function that retrieves the object key from an object
KeyFunc = cache.DeletionHandlingMetaNamespaceKeyFunc
Expand Down Expand Up @@ -1111,3 +1120,122 @@ func ComputeHash(template *v1alpha1.MachineTemplateSpec, collisionCount *int32)

return machineTemplateSpecHasher.Sum32()
}

// AddOrUpdateAnnotationOnNode adds the given annotations to the node named
// nodeName, or updates them if they are already present.
//
// A nil annotations map is a no-op. A node that cannot be found is logged as a
// warning and treated as success. The node is written back only when the
// annotation helper reports a change, so an already up-to-date node costs no
// update call. The read-modify-write cycle runs under
// clientretry.RetryOnConflict with UpdateAnnotationBackoff.
//
// Returns the first non-retried error from fetching the node, computing the
// new annotations, or updating the node.
func AddOrUpdateAnnotationOnNode(c clientset.Interface, nodeName string, annotations map[string]string) error {
	if annotations == nil {
		return nil
	}
	firstTry := true
	return clientretry.RetryOnConflict(UpdateAnnotationBackoff, func() error {
		var err error
		var oldNode *v1.Node
		// The first attempt reads the node from the API server cache
		// (ResourceVersion "0"), as it's cheaper. Any retry reads it without a
		// resource version to be sure to have fresh data.
		if firstTry {
			oldNode, err = c.CoreV1().Nodes().Get(nodeName, metav1.GetOptions{ResourceVersion: "0"})
			firstTry = false
		} else {
			oldNode, err = c.CoreV1().Nodes().Get(nodeName, metav1.GetOptions{})
		}
		if errors.IsNotFound(err) {
			klog.Warningf("Node %s not found while updating annotation. Err: %v", nodeName, err)
			return nil
		}
		if err != nil {
			return err
		}

		newNode, updated, err := annotationsutils.AddOrUpdateAnnotation(oldNode, annotations)
		// Surface helper failures instead of silently reporting success.
		if err != nil {
			return err
		}
		if !updated {
			return nil
		}
		return UpdateNodeAnnotations(c, nodeName, oldNode, newNode)
	})
}

// UpdateNodeAnnotations writes newNode's annotations to the API server using
// the nodes Update() method.
//
// The object sent is a deep copy of oldNode with only its Annotations field
// replaced by newNode.Annotations, so no other difference between the two
// nodes is written.
//
// A failed update is returned wrapped with %w, so the underlying API error
// stays reachable through errors.Is / errors.As instead of being flattened to
// text.
// NOTE(review): whether errors.IsConflict (and therefore RetryOnConflict in
// the callers) sees through the wrapper depends on the vendored apimachinery
// version - confirm.
func UpdateNodeAnnotations(c clientset.Interface, nodeName string, oldNode *v1.Node, newNode *v1.Node) error {
	newNodeClone := oldNode.DeepCopy()
	newNodeClone.Annotations = newNode.Annotations

	if _, err := c.CoreV1().Nodes().Update(newNodeClone); err != nil {
		return fmt.Errorf("failed to create or update annotations for node %q: %w", nodeName, err)
	}

	return nil
}

// RemoveAnnotationsOffNode cleans up annotations that were temporarily added
// to the node named nodeName.
//
// A nil annotations map or an empty nodeName is a no-op that issues no API
// calls. A node that cannot be found is logged as a warning and treated as
// success. The node is written back only when the annotation helper reports a
// change, so removing an annotation that is already absent does not fail and
// costs no update call. The read-modify-write cycle runs under
// clientretry.RetryOnConflict with UpdateAnnotationBackoff.
//
// Returns the first non-retried error from fetching the node, computing the
// new annotations, or updating the node.
func RemoveAnnotationsOffNode(c clientset.Interface, nodeName string, annotations map[string]string) error {
	// Short circuit when there is nothing to remove or no node to target, to
	// limit API calls.
	if annotations == nil || nodeName == "" {
		return nil
	}

	firstTry := true
	return clientretry.RetryOnConflict(UpdateAnnotationBackoff, func() error {
		var err error
		var oldNode *v1.Node
		// The first attempt reads the node from the API server cache
		// (ResourceVersion "0"), as it's cheaper. Any retry reads it without a
		// resource version to be sure to have fresh data.
		if firstTry {
			oldNode, err = c.CoreV1().Nodes().Get(nodeName, metav1.GetOptions{ResourceVersion: "0"})
			firstTry = false
		} else {
			oldNode, err = c.CoreV1().Nodes().Get(nodeName, metav1.GetOptions{})
		}
		if errors.IsNotFound(err) {
			klog.Warningf("Node %s not found while removing annotation. Err: %v", nodeName, err)
			return nil
		}
		if err != nil {
			return err
		}

		// Remove the annotations from the node.
		newNode, updated, err := annotationsutils.RemoveAnnotation(oldNode, annotations)
		// Surface helper failures instead of silently reporting success.
		if err != nil {
			return err
		}
		if !updated {
			return nil
		}
		return UpdateNodeAnnotations(c, nodeName, oldNode, newNode)
	})
}

// GetAnnotationsFromNode fetches the node named nodeName and returns its
// annotations map.
//
// An empty nodeName yields (nil, nil) without any API call. A node that cannot
// be found is logged as a warning and also yields (nil, nil). Any other fetch
// error is returned as-is.
func GetAnnotationsFromNode(c clientset.Interface, nodeName string) (map[string]string, error) {
	// Nothing to look up without a node name; skip the API call.
	if len(nodeName) == 0 {
		return nil, nil
	}

	fetched, getErr := c.CoreV1().Nodes().Get(nodeName, metav1.GetOptions{})
	switch {
	case errors.IsNotFound(getErr):
		klog.Warningf("Node %s not found while fetching annotation. Err: %v", nodeName, getErr)
		return nil, nil
	case getErr != nil:
		return nil, getErr
	}

	return fetched.Annotations, nil
}
Loading