Skip to content

Commit

Permalink
Check whether pod matches the inter-pod anti-affinity of another Pod …
Browse files Browse the repository at this point in the history
…in a given Node in `NodeFit()` (#1356)

* Check if Pod matches inter-pod anti-affinity of other pod on node as part of NodeFit()

* Add unit tests for checking inter-pod anti-affinity match in NodeFit()
* Export setPodAntiAffinity() helper func to test utils

* Add docs for inter-pod anti-affinity in README

* Refactor logic for inter-pod anti-affinity to use in multiple pkgs
* Move logic for finding match between pods with antiaffinity out of framework to reuse in other pkgs
* Move interpod antiaffinity funcs to pkg/utils/predicates.go

* Add unit tests for inter-pod anti-affinity check
* Test logic in GroupByNodeName
* Test NodeFit() case where pods matches inter-pod anti-affinity
* Test for inter-pod anti-affinity pods  match terms, have label selector

* NodeFit inter-pod anti-affinity check returns early if affinity spec not set
  • Loading branch information
nikimanoledaki authored Mar 13, 2024
1 parent dc2cf72 commit 749e81c
Show file tree
Hide file tree
Showing 12 changed files with 328 additions and 132 deletions.
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -881,6 +881,7 @@ profiles:
- `nodeAffinity` on the pod
- Resource `requests` made by the pod and the resources available on other nodes
- Whether any of the other nodes are marked as `unschedulable`
- Any `podAntiAffinity` between the pod and the pods on the other nodes

E.g.

Expand All @@ -902,7 +903,7 @@ profiles:
- "PodLifeTime"
```

Note that node fit filtering references the current pod spec, and not that of it's owner.
Note that node fit filtering references the current pod spec, and not that of its owner.
Thus, if the pod is owned by a ReplicationController (and that ReplicationController was modified recently),
the pod may be running with an outdated spec, which the descheduler will reference when determining node fit.
This is expected behavior as the descheduler is a "best-effort" mechanism.
Expand Down
26 changes: 26 additions & 0 deletions pkg/descheduler/node/node.go
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,13 @@ func NodeFit(nodeIndexer podutil.GetPodsAssignedToNodeFunc, pod *v1.Pod, node *v
errors = append(errors, fmt.Errorf("node is not schedulable"))
}

// Check if pod matches inter-pod anti-affinity rule of pod on node
if match, err := podMatchesInterPodAntiAffinity(nodeIndexer, pod, node); err != nil {
errors = append(errors, err)
} else if match {
errors = append(errors, fmt.Errorf("pod matches inter-pod anti-affinity rule of other pod on node"))
}

return errors
}

Expand Down Expand Up @@ -323,3 +330,22 @@ func PodMatchNodeSelector(pod *v1.Pod, node *v1.Node) bool {
}
return matches
}

// podMatchesInterPodAntiAffinity checks if the pod matches the anti-affinity rule
// of another pod that is already on the given node.
// If a match is found, it returns true.
func podMatchesInterPodAntiAffinity(nodeIndexer podutil.GetPodsAssignedToNodeFunc, pod *v1.Pod, node *v1.Node) (bool, error) {
if pod.Spec.Affinity == nil || pod.Spec.Affinity.PodAntiAffinity == nil {
return false, nil
}

podsOnNode, err := podutil.ListPodsOnANode(node.Name, nodeIndexer, nil)
if err != nil {
return false, fmt.Errorf("error listing all pods: %v", err)
}

podsInANamespace := podutil.GroupByNamespace(podsOnNode)
nodeMap := utils.CreateNodeMap([]*v1.Node{node})

return utils.CheckPodsWithAntiAffinityExist(pod, podsInANamespace, nodeMap), nil
}
25 changes: 23 additions & 2 deletions pkg/descheduler/node/node_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -754,7 +754,11 @@ func TestPodFitsAnyOtherNode(t *testing.T) {
}

func TestNodeFit(t *testing.T) {
node := test.BuildTestNode("node", 64000, 128*1000*1000*1000, 2, nil)
node := test.BuildTestNode("node", 64000, 128*1000*1000*1000, 2, func(node *v1.Node) {
node.ObjectMeta.Labels = map[string]string{
"region": "main-region",
}
})
tests := []struct {
description string
pod *v1.Pod
Expand All @@ -781,6 +785,22 @@ func TestNodeFit(t *testing.T) {
},
err: errors.New("insufficient pods"),
},
{
description: "matches inter-pod anti-affinity rule of pod on node",
pod: test.PodWithPodAntiAffinity(test.BuildTestPod("p1", 1000, 1000, node.Name, nil), "foo", "bar"),
node: node,
podsOnNode: []*v1.Pod{
test.PodWithPodAntiAffinity(test.BuildTestPod("p2", 1000, 1000, node.Name, nil), "foo", "bar"),
},
err: errors.New("pod matches inter-pod anti-affinity rule of other pod on node"),
},
{
description: "pod fits on node",
pod: test.BuildTestPod("p1", 1000, 1000, "", func(pod *v1.Pod) {}),
node: node,
podsOnNode: []*v1.Pod{},
err: nil,
},
}

for _, tc := range tests {
Expand All @@ -804,7 +824,8 @@ func TestNodeFit(t *testing.T) {

sharedInformerFactory.Start(ctx.Done())
sharedInformerFactory.WaitForCacheSync(ctx.Done())
if errs := NodeFit(getPodsAssignedToNode, tc.pod, tc.node); (len(errs) == 0 && tc.err != nil) || errs[0].Error() != tc.err.Error() {
errs := NodeFit(getPodsAssignedToNode, tc.pod, tc.node)
if (len(errs) == 0 && tc.err != nil) || (len(errs) > 0 && errs[0].Error() != tc.err.Error()) {
t.Errorf("Test %#v failed, got %v, expect %v", tc.description, errs, tc.err)
}
})
Expand Down
9 changes: 9 additions & 0 deletions pkg/descheduler/pod/pods.go
Original file line number Diff line number Diff line change
Expand Up @@ -257,3 +257,12 @@ func SortPodsBasedOnAge(pods []*v1.Pod) {
return pods[i].CreationTimestamp.Before(&pods[j].CreationTimestamp)
})
}

func GroupByNodeName(pods []*v1.Pod) map[string][]*v1.Pod {
m := make(map[string][]*v1.Pod)
for i := 0; i < len(pods); i++ {
pod := pods[i]
m[pod.Spec.NodeName] = append(m[pod.Spec.NodeName], pod)
}
return m
}
64 changes: 64 additions & 0 deletions pkg/descheduler/pod/pods_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -176,3 +176,67 @@ func TestSortPodsBasedOnAge(t *testing.T) {
}
}
}

func TestGroupByNodeName(t *testing.T) {
tests := []struct {
name string
pods []*v1.Pod
expMap map[string][]*v1.Pod
}{
{
name: "list of pods is empty",
pods: []*v1.Pod{},
expMap: map[string][]*v1.Pod{},
},
{
name: "pods are on same node",
pods: []*v1.Pod{
{Spec: v1.PodSpec{
NodeName: "node1",
}},
{Spec: v1.PodSpec{
NodeName: "node1",
}},
},
expMap: map[string][]*v1.Pod{"node1": {
{Spec: v1.PodSpec{
NodeName: "node1",
}},
{Spec: v1.PodSpec{
NodeName: "node1",
}},
}},
},
{
name: "pods are on different nodes",
pods: []*v1.Pod{
{Spec: v1.PodSpec{
NodeName: "node1",
}},
{Spec: v1.PodSpec{
NodeName: "node2",
}},
},
expMap: map[string][]*v1.Pod{
"node1": {
{Spec: v1.PodSpec{
NodeName: "node1",
}},
},
"node2": {
{Spec: v1.PodSpec{
NodeName: "node2",
}},
},
},
},
}
for _, test := range tests {
t.Run(test.name, func(t *testing.T) {
resultMap := GroupByNodeName(test.pods)
if !reflect.DeepEqual(resultMap, test.expMap) {
t.Errorf("Expected %v node map, got %v", test.expMap, resultMap)
}
})
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@ import (
"sigs.k8s.io/descheduler/pkg/utils"

v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/klog/v2"
)

Expand Down Expand Up @@ -86,8 +85,8 @@ func (d *RemovePodsViolatingInterPodAntiAffinity) Deschedule(ctx context.Context
}

podsInANamespace := podutil.GroupByNamespace(pods)
podsOnANode := groupByNodeName(pods)
nodeMap := createNodeMap(nodes)
podsOnANode := podutil.GroupByNodeName(pods)
nodeMap := utils.CreateNodeMap(nodes)

loop:
for _, node := range nodes {
Expand All @@ -97,15 +96,17 @@ loop:
podutil.SortPodsBasedOnPriorityLowToHigh(pods)
totalPods := len(pods)
for i := 0; i < totalPods; i++ {
if checkPodsWithAntiAffinityExist(pods[i], podsInANamespace, nodeMap) && d.handle.Evictor().Filter(pods[i]) && d.handle.Evictor().PreEvictionFilter(pods[i]) {
if d.handle.Evictor().Evict(ctx, pods[i], evictions.EvictOptions{StrategyName: PluginName}) {
// Since the current pod is evicted all other pods which have anti-affinity with this
// pod need not be evicted.
// Update allPods.
podsInANamespace = removePodFromNamespaceMap(pods[i], podsInANamespace)
pods = append(pods[:i], pods[i+1:]...)
i--
totalPods--
if utils.CheckPodsWithAntiAffinityExist(pods[i], podsInANamespace, nodeMap) {
if d.handle.Evictor().Filter(pods[i]) && d.handle.Evictor().PreEvictionFilter(pods[i]) {
if d.handle.Evictor().Evict(ctx, pods[i], evictions.EvictOptions{StrategyName: PluginName}) {
// Since the current pod is evicted all other pods which have anti-affinity with this
// pod need not be evicted.
// Update allPods.
podsInANamespace = removePodFromNamespaceMap(pods[i], podsInANamespace)
pods = append(pods[:i], pods[i+1:]...)
i--
totalPods--
}
}
}
if d.handle.Evictor().NodeLimitExceeded(node) {
Expand All @@ -130,87 +131,3 @@ func removePodFromNamespaceMap(podToRemove *v1.Pod, podMap map[string][]*v1.Pod)
}
return podMap
}

func groupByNodeName(pods []*v1.Pod) map[string][]*v1.Pod {
m := make(map[string][]*v1.Pod)
for i := 0; i < len(pods); i++ {
pod := pods[i]
m[pod.Spec.NodeName] = append(m[pod.Spec.NodeName], pod)
}
return m
}

func createNodeMap(nodes []*v1.Node) map[string]*v1.Node {
m := make(map[string]*v1.Node, len(nodes))
for _, node := range nodes {
m[node.GetName()] = node
}
return m
}

// checkPodsWithAntiAffinityExist checks if there are other pods on the node that the current pod cannot tolerate.
func checkPodsWithAntiAffinityExist(pod *v1.Pod, pods map[string][]*v1.Pod, nodeMap map[string]*v1.Node) bool {
affinity := pod.Spec.Affinity
if affinity != nil && affinity.PodAntiAffinity != nil {
for _, term := range getPodAntiAffinityTerms(affinity.PodAntiAffinity) {
namespaces := utils.GetNamespacesFromPodAffinityTerm(pod, &term)
selector, err := metav1.LabelSelectorAsSelector(term.LabelSelector)
if err != nil {
klog.ErrorS(err, "Unable to convert LabelSelector into Selector")
return false
}
for namespace := range namespaces {
for _, existingPod := range pods[namespace] {
if existingPod.Name != pod.Name && utils.PodMatchesTermsNamespaceAndSelector(existingPod, namespaces, selector) {
node, ok := nodeMap[pod.Spec.NodeName]
if !ok {
continue
}
nodeHavingExistingPod, ok := nodeMap[existingPod.Spec.NodeName]
if !ok {
continue
}
if hasSameLabelValue(node, nodeHavingExistingPod, term.TopologyKey) {
klog.V(1).InfoS("Found Pods violating PodAntiAffinity", "pod to evicted", klog.KObj(pod))
return true
}
}
}
}
}
}
return false
}

func hasSameLabelValue(node1, node2 *v1.Node, key string) bool {
if node1.Name == node2.Name {
return true
}
node1Labels := node1.Labels
if node1Labels == nil {
return false
}
node2Labels := node2.Labels
if node2Labels == nil {
return false
}
value1, ok := node1Labels[key]
if !ok {
return false
}
value2, ok := node2Labels[key]
if !ok {
return false
}
return value1 == value2
}

// getPodAntiAffinityTerms gets the antiaffinity terms for the given pod.
func getPodAntiAffinityTerms(podAntiAffinity *v1.PodAntiAffinity) (terms []v1.PodAffinityTerm) {
if podAntiAffinity != nil {
if len(podAntiAffinity.RequiredDuringSchedulingIgnoredDuringExecution) != 0 {
terms = podAntiAffinity.RequiredDuringSchedulingIgnoredDuringExecution
}
}
return terms
}
Original file line number Diff line number Diff line change
Expand Up @@ -96,14 +96,14 @@ func TestPodAntiAffinity(t *testing.T) {
test.SetNormalOwnerRef(p11)

// set pod anti affinity
setPodAntiAffinity(p1, "foo", "bar")
setPodAntiAffinity(p3, "foo", "bar")
setPodAntiAffinity(p4, "foo", "bar")
setPodAntiAffinity(p5, "foo1", "bar1")
setPodAntiAffinity(p6, "foo1", "bar1")
setPodAntiAffinity(p7, "foo", "bar")
setPodAntiAffinity(p9, "foo", "bar")
setPodAntiAffinity(p10, "foo", "bar")
test.SetPodAntiAffinity(p1, "foo", "bar")
test.SetPodAntiAffinity(p3, "foo", "bar")
test.SetPodAntiAffinity(p4, "foo", "bar")
test.SetPodAntiAffinity(p5, "foo1", "bar1")
test.SetPodAntiAffinity(p6, "foo1", "bar1")
test.SetPodAntiAffinity(p7, "foo", "bar")
test.SetPodAntiAffinity(p9, "foo", "bar")
test.SetPodAntiAffinity(p10, "foo", "bar")

// set pod priority
test.SetPodPriority(p5, 100)
Expand Down Expand Up @@ -284,24 +284,3 @@ func TestPodAntiAffinity(t *testing.T) {
})
}
}

func setPodAntiAffinity(inputPod *v1.Pod, labelKey, labelValue string) {
inputPod.Spec.Affinity = &v1.Affinity{
PodAntiAffinity: &v1.PodAntiAffinity{
RequiredDuringSchedulingIgnoredDuringExecution: []v1.PodAffinityTerm{
{
LabelSelector: &metav1.LabelSelector{
MatchExpressions: []metav1.LabelSelectorRequirement{
{
Key: labelKey,
Operator: metav1.LabelSelectorOpIn,
Values: []string{labelValue},
},
},
},
TopologyKey: "region",
},
},
},
}
}
Loading

0 comments on commit 749e81c

Please sign in to comment.