From e0246f189b901ef0f318cc09a4b377c1edc6ef40 Mon Sep 17 00:00:00 2001 From: "Beata Lach (Skiba)" Date: Tue, 16 Jul 2024 10:18:44 +0000 Subject: [PATCH] Extract getting nodes to delete for atomic node groups Extract logic for overriding nodes to delete when deleting nodes from a ZeroOrMaxNodeScaling node group. Simplifies the code and removes code duplication. --- cluster-autoscaler/core/static_autoscaler.go | 72 ++++++++++---------- 1 file changed, 37 insertions(+), 35 deletions(-) diff --git a/cluster-autoscaler/core/static_autoscaler.go b/cluster-autoscaler/core/static_autoscaler.go index 5a1245fcf89c..3e2b2c16eb7e 100644 --- a/cluster-autoscaler/core/static_autoscaler.go +++ b/cluster-autoscaler/core/static_autoscaler.go @@ -744,7 +744,7 @@ func (a *StaticAutoscaler) removeOldUnregisteredNodes(allUnregisteredNodes []clu csr *clusterstate.ClusterStateRegistry, currentTime time.Time, logRecorder *utils.LogEventRecorder) (bool, error) { nodeGroups := a.nodeGroupsById() - nodesToBeDeletedByNodeGroupId := make(map[string][]clusterstate.UnregisteredNode) + nodesToDeleteByNodeGroupId := make(map[string][]clusterstate.UnregisteredNode) for _, unregisteredNode := range allUnregisteredNodes { nodeGroup, err := a.CloudProvider.NodeGroupForNode(unregisteredNode.Node) if err != nil { @@ -763,12 +763,12 @@ func (a *StaticAutoscaler) removeOldUnregisteredNodes(allUnregisteredNodes []clu if unregisteredNode.UnregisteredSince.Add(maxNodeProvisionTime).Before(currentTime) { klog.V(0).Infof("Marking unregistered node %v for removal", unregisteredNode.Node.Name) - nodesToBeDeletedByNodeGroupId[nodeGroup.Id()] = append(nodesToBeDeletedByNodeGroupId[nodeGroup.Id()], unregisteredNode) + nodesToDeleteByNodeGroupId[nodeGroup.Id()] = append(nodesToDeleteByNodeGroupId[nodeGroup.Id()], unregisteredNode) } } removedAny := false - for nodeGroupId, unregisteredNodesToDelete := range nodesToBeDeletedByNodeGroupId { + for nodeGroupId, unregisteredNodesToDelete := range nodesToDeleteByNodeGroupId { nodeGroup := nodeGroups[nodeGroupId] klog.V(0).Infof("Removing %v unregistered nodes for node group %v", len(unregisteredNodesToDelete), nodeGroupId) @@ -788,20 +788,13 @@ func (a *StaticAutoscaler) removeOldUnregisteredNodes(allUnregisteredNodes []clu } nodesToDelete := toNodes(unregisteredNodesToDelete) - opts, err := nodeGroup.GetOptions(a.NodeGroupDefaults) - if err != nil && err != cloudprovider.ErrNotImplemented { - klog.Warningf("Failed to get node group options for %s: %s", nodeGroupId, err) + isZeroOrMaxNodesScaling, nodesToDeleteOverride, err := zeroOrMaxNodesToDelete(a.NodeGroupDefaults, nodeGroup) + if err != nil { + klog.Warningf("Failed to remove unregistered nodes from node group %s: %v", nodeGroupId, err) continue } - // If a scale-up of "ZeroOrMaxNodeScaling" node group failed, the cleanup - // should stick to the all-or-nothing principle. Deleting all nodes. - if opts != nil && opts.ZeroOrMaxNodeScaling { - instances, err := nodeGroup.Nodes() - if err != nil { - klog.Warningf("Failed to fill in unregistered nodes from group %s based on ZeroOrMaxNodeScaling option: %s", nodeGroupId, err) - continue - } - nodesToDelete = instancesToFakeNodes(instances) + if isZeroOrMaxNodesScaling { + nodesToDelete = nodesToDeleteOverride } err = nodeGroup.DeleteNodes(nodesToDelete) @@ -836,35 +829,22 @@ func (a *StaticAutoscaler) deleteCreatedNodesWithErrors() bool { // We always schedule deleting of incoming errornous nodes // TODO[lukaszos] Consider adding logic to not retry delete every loop iteration nodeGroups := a.nodeGroupsById() - nodesToBeDeletedByNodeGroupId := a.clusterStateRegistry.GetCreatedNodesWithErrors() + nodesToDeleteByNodeGroupId := a.clusterStateRegistry.GetCreatedNodesWithErrors() deletedAny := false - for nodeGroupId, nodesToBeDeleted := range nodesToBeDeletedByNodeGroupId { + for nodeGroupId, nodesToDelete := range nodesToDeleteByNodeGroupId { var err error - klog.V(1).Infof("Deleting %v from %v node group because of create errors", len(nodesToBeDeleted), nodeGroupId) + klog.V(1).Infof("Deleting %v from %v node group because of create errors", len(nodesToDelete), nodeGroupId) nodeGroup := nodeGroups[nodeGroupId] if nodeGroup == nil { err = fmt.Errorf("node group %s not found", nodeGroupId) - } else { - var opts *config.NodeGroupAutoscalingOptions - opts, err = nodeGroup.GetOptions(a.NodeGroupDefaults) - if err != nil && err != cloudprovider.ErrNotImplemented { - klog.Warningf("Failed to get node group options for %s: %s", nodeGroupId, err) - continue - } - // If a scale-up of "ZeroOrMaxNodeScaling" node group failed, the cleanup - // should stick to the all-or-nothing principle. Deleting all nodes. - if opts != nil && opts.ZeroOrMaxNodeScaling { - instances, err := nodeGroup.Nodes() - if err != nil { - klog.Warningf("Failed to fill in failed nodes from group %s based on ZeroOrMaxNodeScaling option: %s", nodeGroupId, err) - continue - } - nodesToBeDeleted = instancesToFakeNodes(instances) + } else if isZeroOrMaxNodesScaling, nodesToDeleteOverride, err := zeroOrMaxNodesToDelete(a.NodeGroupDefaults, nodeGroup); err == nil { + if isZeroOrMaxNodesScaling { + nodesToDelete = nodesToDeleteOverride } - err = nodeGroup.DeleteNodes(nodesToBeDeleted) + err = nodeGroup.DeleteNodes(nodesToDelete) } if err != nil { @@ -878,6 +858,28 @@ func (a *StaticAutoscaler) deleteCreatedNodesWithErrors() bool { return deletedAny } +// zeroOrMaxNodesToDelete returns false if the node group is not a "ZeroOrMaxNodeScaling" scaling node group. +// Returns true for "ZeroOrMaxNodeScaling" scaling node group, together with an override list of nodes to delete. +// For a "ZeroOrMaxNodeScaling" node group node deletion is atomic and should delete all nodes. +// +// Ideally this would be handled by the node group itself, but with current setup, it would have to be implemented by every cloud provider. +func zeroOrMaxNodesToDelete(defaults config.NodeGroupAutoscalingOptions, nodeGroup cloudprovider.NodeGroup) (bool, []*apiv1.Node, error) { + opts, err := nodeGroup.GetOptions(defaults) + if err != nil && err != cloudprovider.ErrNotImplemented { + return false, []*apiv1.Node{}, fmt.Errorf("Failed to get node group options for %s: %s", nodeGroup.Id(), err) + } + // If a scale-up of "ZeroOrMaxNodeScaling" node group failed, the cleanup + // should stick to the all-or-nothing principle. Deleting all nodes. + if opts != nil && opts.ZeroOrMaxNodeScaling { + instances, err := nodeGroup.Nodes() + if err != nil { + return false, []*apiv1.Node{}, fmt.Errorf("Failed to fill in failed nodes from group %s based on ZeroOrMaxNodeScaling option: %s", nodeGroup.Id(), err) + } + return true, instancesToFakeNodes(instances), nil + } + return false, []*apiv1.Node{}, nil +} + // instancesToNodes returns a list of fake nodes with just names populated, // so that they can be passed as nodes to delete func instancesToFakeNodes(instances []cloudprovider.Instance) []*apiv1.Node {