Merge pull request #567 from a7i/topology-taint-toleration
RemovePodsViolatingTopologySpreadConstraint: Take node's taints into consideration when balancing domains
k8s-ci-robot committed May 14, 2021
2 parents 9b26abd + 24c0ca2 commit a1709e9
Showing 2 changed files with 176 additions and 18 deletions.
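At a high level, the change is that when balanceDomains picks pods to evict from an over-populated topology domain, a pod is only treated as movable if at least one other node both satisfies its hard nodeSelector/nodeAffinity and tolerates that node's hard taints (NoSchedule/NoExecute; PreferNoSchedule is ignored). The sketch below illustrates just the taint half of that decision with plain k8s.io/api/core/v1 types; the helper names and the main function are illustrative, not code from this commit, and the real strategy re-checks nodeSelector/required node affinity via nodeutil.PodFitsCurrentNode when the pod sets them.

```go
package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
)

// podToleratesHardTaints reports whether the pod tolerates every NoSchedule or
// NoExecute taint on the node; PreferNoSchedule taints are treated as soft and
// skipped, matching the hardTaintsFilter in the diff below. (Illustrative helper.)
func podToleratesHardTaints(pod *v1.Pod, node *v1.Node) bool {
	for i := range node.Spec.Taints {
		taint := &node.Spec.Taints[i]
		if taint.Effect != v1.TaintEffectNoSchedule && taint.Effect != v1.TaintEffectNoExecute {
			continue // soft taint, does not block scheduling
		}
		tolerated := false
		for j := range pod.Spec.Tolerations {
			if pod.Spec.Tolerations[j].ToleratesTaint(taint) {
				tolerated = true
				break
			}
		}
		if !tolerated {
			return false
		}
	}
	return true
}

// otherEligibleNodes counts the nodes, excluding the one the pod is already on,
// whose hard taints the pod tolerates. If this is zero, evicting the pod for
// topology spreading would be pointless. (Illustrative helper.)
func otherEligibleNodes(pod *v1.Pod, nodes []*v1.Node) int {
	count := 0
	for _, node := range nodes {
		if node.Name == pod.Spec.NodeName {
			continue
		}
		if podToleratesHardTaints(pod, node) {
			count++
		}
	}
	return count
}

func main() {
	current := &v1.Node{}
	current.Name = "n1"

	tainted := &v1.Node{}
	tainted.Name = "n2"
	tainted.Spec.Taints = []v1.Taint{{Key: "taint-test", Value: "test", Effect: v1.TaintEffectNoSchedule}}

	pod := &v1.Pod{}
	pod.Spec.NodeName = "n1"

	// Without a toleration, the only other node (n2) is untolerated: skip eviction.
	fmt.Println(otherEligibleNodes(pod, []*v1.Node{current, tainted})) // 0

	// With a matching toleration, n2 becomes a viable target: the pod can be moved.
	pod.Spec.Tolerations = []v1.Toleration{{Key: "taint-test", Operator: v1.TolerationOpEqual, Value: "test", Effect: v1.TaintEffectNoSchedule}}
	fmt.Println(otherEligibleNodes(pod, []*v1.Node{current, tainted})) // 1
}
```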
64 changes: 46 additions & 18 deletions pkg/descheduler/strategies/topologyspreadconstraint.go
@@ -19,6 +19,7 @@ package strategies
import (
"context"
"fmt"
utilerrors "k8s.io/apimachinery/pkg/util/errors"
"math"
"sort"

@@ -283,39 +284,66 @@ func balanceDomains(
// also (just for tracking), add them to the list of pods in the lower topology
aboveToEvict := sortedDomains[j].pods[len(sortedDomains[j].pods)-movePods:]
for k := range aboveToEvict {
- // if the pod has a hard nodeAffinity or nodeSelector that only matches this node,
- // don't bother evicting it as it will just end up back on the same node
- // however we still account for it "being evicted" so the algorithm can complete
- // TODO(@damemi): Since we don't order pods wrt their affinities, we should refactor this to skip the current pod
- // but still try to get the required # of movePods (instead of just chopping that value off the slice above)
- isRequiredDuringSchedulingIgnoredDuringExecution := aboveToEvict[k].Spec.Affinity != nil &&
- aboveToEvict[k].Spec.Affinity.NodeAffinity != nil &&
- aboveToEvict[k].Spec.Affinity.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution != nil
-
- if (aboveToEvict[k].Spec.NodeSelector != nil || isRequiredDuringSchedulingIgnoredDuringExecution) &&
- nodesPodFitsOnBesidesCurrent(aboveToEvict[k], nodeMap) == 0 {
- klog.V(2).InfoS("Ignoring pod for eviction due to node selector/affinity", "pod", klog.KObj(aboveToEvict[k]))
+ if err := validatePodFitsOnOtherNodes(aboveToEvict[k], nodeMap); err != nil {
+ klog.V(2).InfoS(fmt.Sprintf("ignoring pod for eviction due to: %s", err.Error()), "pod", klog.KObj(aboveToEvict[k]))
continue
}

podsForEviction[aboveToEvict[k]] = struct{}{}
}
sortedDomains[j].pods = sortedDomains[j].pods[:len(sortedDomains[j].pods)-movePods]
sortedDomains[i].pods = append(sortedDomains[i].pods, aboveToEvict...)
}
}

- // nodesPodFitsOnBesidesCurrent counts the number of nodes this pod could fit on based on its affinity
+ // validatePodFitsOnOtherNodes performs validation based on scheduling predicates for affinity and toleration.
// It excludes the current node because, for the sake of domain balancing only, we care about if there is any other
// place it could theoretically fit.
// If the pod doesn't fit on its current node, that is a job for RemovePodsViolatingNodeAffinity, and irrelevant to Topology Spreading
- func nodesPodFitsOnBesidesCurrent(pod *v1.Pod, nodeMap map[string]*v1.Node) int {
- count := 0
+ func validatePodFitsOnOtherNodes(pod *v1.Pod, nodeMap map[string]*v1.Node) error {
+ // if the pod has a hard nodeAffinity/nodeSelector/toleration that only matches this node,
+ // don't bother evicting it as it will just end up back on the same node
+ // however we still account for it "being evicted" so the algorithm can complete
+ // TODO(@damemi): Since we don't order pods wrt their affinities, we should refactor this to skip the current pod
+ // but still try to get the required # of movePods (instead of just chopping that value off the slice above)
+ isRequiredDuringSchedulingIgnoredDuringExecution := pod.Spec.Affinity != nil &&
+ pod.Spec.Affinity.NodeAffinity != nil &&
+ pod.Spec.Affinity.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution != nil
+
+ hardTaintsFilter := func(taint *v1.Taint) bool {
+ return taint.Effect == v1.TaintEffectNoSchedule || taint.Effect == v1.TaintEffectNoExecute
+ }
+
+ var eligibleNodesCount, ineligibleAffinityNodesCount, ineligibleTaintedNodesCount int
for _, node := range nodeMap {
- if nodeutil.PodFitsCurrentNode(pod, node) && node != nodeMap[pod.Spec.NodeName] {
- count++
+ if node == nodeMap[pod.Spec.NodeName] {
+ continue
}
+ if pod.Spec.NodeSelector != nil || isRequiredDuringSchedulingIgnoredDuringExecution {
+ if !nodeutil.PodFitsCurrentNode(pod, node) {
+ ineligibleAffinityNodesCount++
+ continue
+ }
+ }
+ if !utils.TolerationsTolerateTaintsWithFilter(pod.Spec.Tolerations, node.Spec.Taints, hardTaintsFilter) {
+ ineligibleTaintedNodesCount++
+ continue
+ }
+
+ eligibleNodesCount++
}

+ if eligibleNodesCount == 0 {
+ var errs []error
+ if ineligibleAffinityNodesCount > 0 {
+ errs = append(errs, fmt.Errorf("%d nodes with ineligible selector/affinity", ineligibleAffinityNodesCount))
+ }
+ if ineligibleTaintedNodesCount > 0 {
+ errs = append(errs, fmt.Errorf("%d nodes with taints that are not tolerated", ineligibleTaintedNodesCount))
+ }
+ return utilerrors.NewAggregate(errs)
+ }
- return count
+ return nil
}

// sortDomains sorts and splits the list of topology domains based on their size
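When no other node survives the affinity and taint checks, the message logged at verbosity 2 in balanceDomains is the aggregate error assembled above. A small sketch of how utilerrors.NewAggregate folds the per-reason counts into a single message; the counts here are made up for illustration and the exact output format is only indicative.

```go
package main

import (
	"fmt"

	utilerrors "k8s.io/apimachinery/pkg/util/errors"
)

func main() {
	// Hypothetical counts for a pod that fits nowhere else:
	// two nodes rejected by selector/affinity, one by an untolerated taint.
	errs := []error{
		fmt.Errorf("%d nodes with ineligible selector/affinity", 2),
		fmt.Errorf("%d nodes with taints that are not tolerated", 1),
	}

	// NewAggregate joins the individual errors into one error value.
	err := utilerrors.NewAggregate(errs)

	// Prints something like:
	// [2 nodes with ineligible selector/affinity, 1 nodes with taints that are not tolerated]
	fmt.Println(err)
}
```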
130 changes: 130 additions & 0 deletions pkg/descheduler/strategies/topologyspreadconstraint_test.go
@@ -574,6 +574,134 @@ func TestTopologySpreadConstraint(t *testing.T) {
strategy: api.DeschedulerStrategy{Params: &api.StrategyParameters{IncludeSoftConstraints: true}},
namespaces: []string{"ns1"},
},
{
name: "2 domains, sizes [2,0], maxSkew=1, move 1 pod since pod tolerates the node with taint",
nodes: []*v1.Node{
test.BuildTestNode("n1", 2000, 3000, 10, func(n *v1.Node) { n.Labels["zone"] = "zoneA" }),
test.BuildTestNode("n2", 2000, 3000, 10, func(n *v1.Node) {
n.Labels["zone"] = "zoneB"
n.Spec.Taints = []v1.Taint{
{
Key: "taint-test",
Value: "test",
Effect: v1.TaintEffectNoSchedule,
},
}
}),
},
pods: createTestPods([]testPodList{
{
count: 1,
node: "n1",
labels: map[string]string{"foo": "bar"},
constraints: []v1.TopologySpreadConstraint{
{
MaxSkew: 1,
TopologyKey: "zone",
WhenUnsatisfiable: v1.DoNotSchedule,
LabelSelector: &metav1.LabelSelector{MatchLabels: map[string]string{"foo": "bar"}},
},
},
tolerations: []v1.Toleration{
{
Key: "taint-test",
Value: "test",
Operator: v1.TolerationOpEqual,
Effect: v1.TaintEffectNoSchedule,
},
},
},
{
count: 1,
node: "n1",
labels: map[string]string{"foo": "bar"},
nodeSelector: map[string]string{"zone": "zoneA"},
},
}),
expectedEvictedCount: 1,
strategy: api.DeschedulerStrategy{},
namespaces: []string{"ns1"},
},
{
name: "2 domains, sizes [2,0], maxSkew=1, move 0 pods since pod does not tolerate the tainted node",
nodes: []*v1.Node{
test.BuildTestNode("n1", 2000, 3000, 10, func(n *v1.Node) { n.Labels["zone"] = "zoneA" }),
test.BuildTestNode("n2", 2000, 3000, 10, func(n *v1.Node) {
n.Labels["zone"] = "zoneB"
n.Spec.Taints = []v1.Taint{
{
Key: "taint-test",
Value: "test",
Effect: v1.TaintEffectNoSchedule,
},
}
}),
},
pods: createTestPods([]testPodList{
{
count: 1,
node: "n1",
labels: map[string]string{"foo": "bar"},
constraints: []v1.TopologySpreadConstraint{
{
MaxSkew: 1,
TopologyKey: "zone",
WhenUnsatisfiable: v1.DoNotSchedule,
LabelSelector: &metav1.LabelSelector{MatchLabels: map[string]string{"foo": "bar"}},
},
},
},
{
count: 1,
node: "n1",
labels: map[string]string{"foo": "bar"},
nodeSelector: map[string]string{"zone": "zoneA"},
},
}),
expectedEvictedCount: 0,
strategy: api.DeschedulerStrategy{},
namespaces: []string{"ns1"},
},
{
name: "2 domains, sizes [2,0], maxSkew=1, move 1 pod for node with PreferNoSchedule Taint",
nodes: []*v1.Node{
test.BuildTestNode("n1", 2000, 3000, 10, func(n *v1.Node) { n.Labels["zone"] = "zoneA" }),
test.BuildTestNode("n2", 2000, 3000, 10, func(n *v1.Node) {
n.Labels["zone"] = "zoneB"
n.Spec.Taints = []v1.Taint{
{
Key: "taint-test",
Value: "test",
Effect: v1.TaintEffectPreferNoSchedule,
},
}
}),
},
pods: createTestPods([]testPodList{
{
count: 1,
node: "n1",
labels: map[string]string{"foo": "bar"},
constraints: []v1.TopologySpreadConstraint{
{
MaxSkew: 1,
TopologyKey: "zone",
WhenUnsatisfiable: v1.DoNotSchedule,
LabelSelector: &metav1.LabelSelector{MatchLabels: map[string]string{"foo": "bar"}},
},
},
},
{
count: 1,
node: "n1",
labels: map[string]string{"foo": "bar"},
nodeSelector: map[string]string{"zone": "zoneA"},
},
}),
expectedEvictedCount: 1,
strategy: api.DeschedulerStrategy{},
namespaces: []string{"ns1"},
},
}

for _, tc := range testCases {
@@ -617,6 +745,7 @@ type testPodList struct {
nodeSelector map[string]string
nodeAffinity *v1.Affinity
noOwners bool
tolerations []v1.Toleration
}

func createTestPods(testPods []testPodList) []*v1.Pod {
@@ -636,6 +765,7 @@ func createTestPods(testPods []testPodList) []*v1.Pod {
p.Spec.TopologySpreadConstraints = tp.constraints
p.Spec.NodeSelector = tp.nodeSelector
p.Spec.Affinity = tp.nodeAffinity
p.Spec.Tolerations = tp.tolerations
}))
podNum++
}
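The last new test case above is the subtle one: n2 carries only a PreferNoSchedule taint, which the strategy's hard-taint filter (NoSchedule/NoExecute only) deliberately ignores, so n2 still counts as an eligible target and one eviction is expected even though the pod has no toleration. A tiny sketch of that filter decision, reusing the taint values from the tests; the local hardTaint closure mirrors hardTaintsFilter in the strategy and is illustrative, not the commit's code.

```go
package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
)

func main() {
	// Same shape as hardTaintsFilter in topologyspreadconstraint.go:
	// only NoSchedule and NoExecute taints can make a node ineligible.
	hardTaint := func(t v1.Taint) bool {
		return t.Effect == v1.TaintEffectNoSchedule || t.Effect == v1.TaintEffectNoExecute
	}

	noSchedule := v1.Taint{Key: "taint-test", Value: "test", Effect: v1.TaintEffectNoSchedule}
	preferNoSchedule := v1.Taint{Key: "taint-test", Value: "test", Effect: v1.TaintEffectPreferNoSchedule}

	fmt.Println(hardTaint(noSchedule))       // true: must be tolerated (first two cases)
	fmt.Println(hardTaint(preferNoSchedule)) // false: ignored, node stays eligible (third case)
}
```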
