From 24c0ca2ef95c0c35ed1a00d3946708b19e434a30 Mon Sep 17 00:00:00 2001
From: Amir Alavi
Date: Thu, 13 May 2021 23:16:07 -0400
Subject: [PATCH] Take node's taints into consideration when balancing domains

---
 .../strategies/topologyspreadconstraint.go    |  64 ++++++---
 .../topologyspreadconstraint_test.go          | 130 ++++++++++++++++++
 2 files changed, 176 insertions(+), 18 deletions(-)

diff --git a/pkg/descheduler/strategies/topologyspreadconstraint.go b/pkg/descheduler/strategies/topologyspreadconstraint.go
index fbd1bdb0be..b408ce16d6 100644
--- a/pkg/descheduler/strategies/topologyspreadconstraint.go
+++ b/pkg/descheduler/strategies/topologyspreadconstraint.go
@@ -19,6 +19,7 @@ package strategies
 import (
 	"context"
 	"fmt"
+	utilerrors "k8s.io/apimachinery/pkg/util/errors"
 	"math"
 	"sort"
 
@@ -283,20 +284,11 @@ func balanceDomains(
 			// also (just for tracking), add them to the list of pods in the lower topology
 			aboveToEvict := sortedDomains[j].pods[len(sortedDomains[j].pods)-movePods:]
 			for k := range aboveToEvict {
-				// if the pod has a hard nodeAffinity or nodeSelector that only matches this node,
-				// don't bother evicting it as it will just end up back on the same node
-				// however we still account for it "being evicted" so the algorithm can complete
-				// TODO(@damemi): Since we don't order pods wrt their affinities, we should refactor this to skip the current pod
-				// but still try to get the required # of movePods (instead of just chopping that value off the slice above)
-				isRequiredDuringSchedulingIgnoredDuringExecution := aboveToEvict[k].Spec.Affinity != nil &&
-					aboveToEvict[k].Spec.Affinity.NodeAffinity != nil &&
-					aboveToEvict[k].Spec.Affinity.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution != nil
-
-				if (aboveToEvict[k].Spec.NodeSelector != nil || isRequiredDuringSchedulingIgnoredDuringExecution) &&
-					nodesPodFitsOnBesidesCurrent(aboveToEvict[k], nodeMap) == 0 {
-					klog.V(2).InfoS("Ignoring pod for eviction due to node selector/affinity", "pod", klog.KObj(aboveToEvict[k]))
+				if err := validatePodFitsOnOtherNodes(aboveToEvict[k], nodeMap); err != nil {
+					klog.V(2).InfoS(fmt.Sprintf("ignoring pod for eviction due to: %s", err.Error()), "pod", klog.KObj(aboveToEvict[k]))
 					continue
 				}
+
 				podsForEviction[aboveToEvict[k]] = struct{}{}
 			}
 			sortedDomains[j].pods = sortedDomains[j].pods[:len(sortedDomains[j].pods)-movePods]
@@ -304,18 +296,54 @@ func balanceDomains(
 	}
 }
 
-// nodesPodFitsOnBesidesCurrent counts the number of nodes this pod could fit on based on its affinity
+// validatePodFitsOnOtherNodes performs validation based on scheduling predicates for affinity and toleration.
 // It excludes the current node because, for the sake of domain balancing only, we care about if there is any other
 // place it could theoretically fit.
 // If the pod doesn't fit on its current node, that is a job for RemovePodsViolatingNodeAffinity, and irrelevant to Topology Spreading
-func nodesPodFitsOnBesidesCurrent(pod *v1.Pod, nodeMap map[string]*v1.Node) int {
-	count := 0
+func validatePodFitsOnOtherNodes(pod *v1.Pod, nodeMap map[string]*v1.Node) error {
+	// if the pod has a hard nodeAffinity/nodeSelector/toleration that only matches this node,
+	// don't bother evicting it as it will just end up back on the same node
+	// however we still account for it "being evicted" so the algorithm can complete
+	// TODO(@damemi): Since we don't order pods wrt their affinities, we should refactor this to skip the current pod
+	// but still try to get the required # of movePods (instead of just chopping that value off the slice above)
+	isRequiredDuringSchedulingIgnoredDuringExecution := pod.Spec.Affinity != nil &&
+		pod.Spec.Affinity.NodeAffinity != nil &&
+		pod.Spec.Affinity.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution != nil
+
+	hardTaintsFilter := func(taint *v1.Taint) bool {
+		return taint.Effect == v1.TaintEffectNoSchedule || taint.Effect == v1.TaintEffectNoExecute
+	}
+
+	var eligibleNodesCount, ineligibleAffinityNodesCount, ineligibleTaintedNodesCount int
 	for _, node := range nodeMap {
-		if nodeutil.PodFitsCurrentNode(pod, node) && node != nodeMap[pod.Spec.NodeName] {
-			count++
+		if node == nodeMap[pod.Spec.NodeName] {
+			continue
+		}
+		if pod.Spec.NodeSelector != nil || isRequiredDuringSchedulingIgnoredDuringExecution {
+			if !nodeutil.PodFitsCurrentNode(pod, node) {
+				ineligibleAffinityNodesCount++
+				continue
+			}
+		}
+		if !utils.TolerationsTolerateTaintsWithFilter(pod.Spec.Tolerations, node.Spec.Taints, hardTaintsFilter) {
+			ineligibleTaintedNodesCount++
+			continue
+		}
+
+		eligibleNodesCount++
+	}
+
+	if eligibleNodesCount == 0 {
+		var errs []error
+		if ineligibleAffinityNodesCount > 0 {
+			errs = append(errs, fmt.Errorf("%d nodes with ineligible selector/affinity", ineligibleAffinityNodesCount))
+		}
+		if ineligibleTaintedNodesCount > 0 {
+			errs = append(errs, fmt.Errorf("%d nodes with taints that are not tolerated", ineligibleTaintedNodesCount))
 		}
+		return utilerrors.NewAggregate(errs)
 	}
-	return count
+	return nil
 }
 
 // sortDomains sorts and splits the list of topology domains based on their size
diff --git a/pkg/descheduler/strategies/topologyspreadconstraint_test.go b/pkg/descheduler/strategies/topologyspreadconstraint_test.go
index 6b28a26929..5eb256c701 100644
--- a/pkg/descheduler/strategies/topologyspreadconstraint_test.go
+++ b/pkg/descheduler/strategies/topologyspreadconstraint_test.go
@@ -574,6 +574,134 @@ func TestTopologySpreadConstraint(t *testing.T) {
 			strategy:             api.DeschedulerStrategy{Params: &api.StrategyParameters{IncludeSoftConstraints: true}},
 			namespaces:           []string{"ns1"},
 		},
+		{
+			name: "2 domains, sizes [2,0], maxSkew=1, move 1 pod since pod tolerates the node with taint",
+			nodes: []*v1.Node{
+				test.BuildTestNode("n1", 2000, 3000, 10, func(n *v1.Node) { n.Labels["zone"] = "zoneA" }),
+				test.BuildTestNode("n2", 2000, 3000, 10, func(n *v1.Node) {
+					n.Labels["zone"] = "zoneB"
+					n.Spec.Taints = []v1.Taint{
+						{
+							Key:    "taint-test",
+							Value:  "test",
+							Effect: v1.TaintEffectNoSchedule,
+						},
+					}
+				}),
+			},
+			pods: createTestPods([]testPodList{
+				{
+					count:  1,
+					node:   "n1",
+					labels: map[string]string{"foo": "bar"},
+					constraints: []v1.TopologySpreadConstraint{
+						{
+							MaxSkew:           1,
+							TopologyKey:       "zone",
+							WhenUnsatisfiable: v1.DoNotSchedule,
+							LabelSelector:     &metav1.LabelSelector{MatchLabels: map[string]string{"foo": "bar"}},
+						},
+					},
+					tolerations: []v1.Toleration{
+						{
+							Key:      "taint-test",
+							Value:    "test",
+							Operator: v1.TolerationOpEqual,
+							Effect:   v1.TaintEffectNoSchedule,
+						},
+					},
+				},
+				{
+					count:        1,
+					node:         "n1",
+					labels:       map[string]string{"foo": "bar"},
+					nodeSelector: map[string]string{"zone": "zoneA"},
+				},
+			}),
+			expectedEvictedCount: 1,
+			strategy:             api.DeschedulerStrategy{},
+			namespaces:           []string{"ns1"},
+		},
+		{
+			name: "2 domains, sizes [2,0], maxSkew=1, move 0 pods since pod does not tolerate the tainted node",
+			nodes: []*v1.Node{
+				test.BuildTestNode("n1", 2000, 3000, 10, func(n *v1.Node) { n.Labels["zone"] = "zoneA" }),
+				test.BuildTestNode("n2", 2000, 3000, 10, func(n *v1.Node) {
+					n.Labels["zone"] = "zoneB"
+					n.Spec.Taints = []v1.Taint{
+						{
+							Key:    "taint-test",
+							Value:  "test",
+							Effect: v1.TaintEffectNoSchedule,
+						},
+					}
+				}),
+			},
+			pods: createTestPods([]testPodList{
+				{
+					count:  1,
+					node:   "n1",
+					labels: map[string]string{"foo": "bar"},
+					constraints: []v1.TopologySpreadConstraint{
+						{
+							MaxSkew:           1,
+							TopologyKey:       "zone",
+							WhenUnsatisfiable: v1.DoNotSchedule,
+							LabelSelector:     &metav1.LabelSelector{MatchLabels: map[string]string{"foo": "bar"}},
+						},
+					},
+				},
+				{
+					count:        1,
+					node:         "n1",
+					labels:       map[string]string{"foo": "bar"},
+					nodeSelector: map[string]string{"zone": "zoneA"},
+				},
+			}),
+			expectedEvictedCount: 0,
+			strategy:             api.DeschedulerStrategy{},
+			namespaces:           []string{"ns1"},
+		},
+		{
+			name: "2 domains, sizes [2,0], maxSkew=1, move 1 pod for node with PreferNoSchedule Taint",
+			nodes: []*v1.Node{
+				test.BuildTestNode("n1", 2000, 3000, 10, func(n *v1.Node) { n.Labels["zone"] = "zoneA" }),
+				test.BuildTestNode("n2", 2000, 3000, 10, func(n *v1.Node) {
+					n.Labels["zone"] = "zoneB"
+					n.Spec.Taints = []v1.Taint{
+						{
+							Key:    "taint-test",
+							Value:  "test",
+							Effect: v1.TaintEffectPreferNoSchedule,
+						},
+					}
+				}),
+			},
+			pods: createTestPods([]testPodList{
+				{
+					count:  1,
+					node:   "n1",
+					labels: map[string]string{"foo": "bar"},
+					constraints: []v1.TopologySpreadConstraint{
+						{
+							MaxSkew:           1,
+							TopologyKey:       "zone",
+							WhenUnsatisfiable: v1.DoNotSchedule,
+							LabelSelector:     &metav1.LabelSelector{MatchLabels: map[string]string{"foo": "bar"}},
+						},
+					},
+				},
+				{
+					count:        1,
+					node:         "n1",
+					labels:       map[string]string{"foo": "bar"},
+					nodeSelector: map[string]string{"zone": "zoneA"},
+				},
+			}),
+			expectedEvictedCount: 1,
+			strategy:             api.DeschedulerStrategy{},
+			namespaces:           []string{"ns1"},
+		},
 	}
 
 	for _, tc := range testCases {
@@ -617,6 +745,7 @@ type testPodList struct {
 	nodeSelector map[string]string
 	nodeAffinity *v1.Affinity
 	noOwners     bool
+	tolerations  []v1.Toleration
 }
 
 func createTestPods(testPods []testPodList) []*v1.Pod {
@@ -636,6 +765,7 @@ func createTestPods(testPods []testPodList) []*v1.Pod {
 			p.Spec.TopologySpreadConstraints = tp.constraints
 			p.Spec.NodeSelector = tp.nodeSelector
 			p.Spec.Affinity = tp.nodeAffinity
+			p.Spec.Tolerations = tp.tolerations
 		}))
 		podNum++
 	}
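
Not part of the patch: a minimal, standalone Go sketch of the hard-taint check that the new validatePodFitsOnOtherNodes relies on, for reference alongside the three new test cases. The helpers hardTaint and toleratesAllHardTaints below are illustrative stand-ins for the patch's hardTaintsFilter and utils.TolerationsTolerateTaintsWithFilter; the only external API used is the ToleratesTaint method on v1.Toleration from k8s.io/api/core/v1.

package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
)

// hardTaint mirrors hardTaintsFilter in the patch: only NoSchedule and
// NoExecute taints can make a node ineligible; PreferNoSchedule is soft
// and therefore ignored.
func hardTaint(taint *v1.Taint) bool {
	return taint.Effect == v1.TaintEffectNoSchedule || taint.Effect == v1.TaintEffectNoExecute
}

// toleratesAllHardTaints is an illustrative stand-in for
// utils.TolerationsTolerateTaintsWithFilter: every hard taint on the node
// must be tolerated by at least one of the pod's tolerations.
func toleratesAllHardTaints(tolerations []v1.Toleration, taints []v1.Taint) bool {
	for i := range taints {
		if !hardTaint(&taints[i]) {
			continue
		}
		tolerated := false
		for j := range tolerations {
			if tolerations[j].ToleratesTaint(&taints[i]) {
				tolerated = true
				break
			}
		}
		if !tolerated {
			return false
		}
	}
	return true
}

func main() {
	noSchedule := []v1.Taint{{Key: "taint-test", Value: "test", Effect: v1.TaintEffectNoSchedule}}
	preferNoSchedule := []v1.Taint{{Key: "taint-test", Value: "test", Effect: v1.TaintEffectPreferNoSchedule}}
	toleration := []v1.Toleration{{Key: "taint-test", Operator: v1.TolerationOpEqual, Value: "test", Effect: v1.TaintEffectNoSchedule}}

	fmt.Println(toleratesAllHardTaints(toleration, noSchedule)) // true: hard taint is tolerated, node stays eligible
	fmt.Println(toleratesAllHardTaints(nil, noSchedule))        // false: hard taint is not tolerated, node is ineligible
	fmt.Println(toleratesAllHardTaints(nil, preferNoSchedule))  // true: soft taint is ignored by the filter
}

The three calls in main correspond to the three new test cases: a tolerated NoSchedule taint keeps the other node eligible (1 eviction expected), an untolerated NoSchedule taint leaves no eligible node (0 evictions), and a PreferNoSchedule taint never blocks eligibility (1 eviction).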