From cd4ab57a096e6af70f3f8be0f2ee3bb02e73bc18 Mon Sep 17 00:00:00 2001 From: Jordi Pique Date: Sun, 30 Jul 2023 13:07:21 +0100 Subject: [PATCH] feat: Implement preferredDuringSchedulingIgnoredDuringExecution for RemovePodsViolatingNodeAffinity Now, the descheduler can detect and evict pods that are not optimally allocated according to the "preferred..." node affinity. It only evicts a pod if it can be scheduled on a node that scores higher in terms of preferred node affinity than the current one. This can be activated by enabling the RemovePodsViolatingNodeAffinity plugin and passing "preferredDuringSchedulingIgnoredDuringExecution" in the args. For example, imagine we have a pod that prefers nodes with label "key1: value1" with a weight of 10. If this pod is scheduled on a node that doesn't have "key1: value1" as a label but there's another node that has this label and where this pod can potentially run, then the descheduler will evict the pod. Another effect of this commit is that the RemovePodsViolatingNodeAffinity plugin will no longer remove pods that don't fit on their current node for reasons other than violating the node affinity. Before this change, enabling this plugin could cause the eviction of pods that were running on tainted nodes without the necessary tolerations. This commit also fixes the wording of some tests in node_affinity_test.go, as well as some test parameters and expectations that were wrong. --- README.md | 13 +- pkg/descheduler/node/node.go | 34 +++ pkg/descheduler/node/node_test.go | 72 +++++++ .../node_affinity.go | 80 ++++--- .../node_affinity_test.go | 202 +++++++++++++++--- pkg/utils/predicates.go | 26 +++ pkg/utils/predicates_test.go | 103 +++++++++ 7 files changed, 473 insertions(+), 57 deletions(-) diff --git a/README.md b/README.md index c755604b3d..2f86dc8c77 100644 --- a/README.md +++ b/README.md @@ -436,7 +436,10 @@ profiles: This strategy makes sure all pods violating [node affinity](https://kubernetes.io/docs/concepts/configuration/assign-pod-node/#node-affinity) are eventually removed from nodes. Node affinity rules allow a pod to specify -`requiredDuringSchedulingIgnoredDuringExecution` type, which tells the scheduler +`requiredDuringSchedulingIgnoredDuringExecution` and/or +`preferredDuringSchedulingIgnoredDuringExecution`. + +The `requiredDuringSchedulingIgnoredDuringExecution` type tells the scheduler to respect node affinity when scheduling the pod but kubelet to ignore in case node changes over time and no longer respects the affinity. When enabled, the strategy serves as a temporary implementation @@ -449,6 +452,14 @@ of scheduling. Over time nodeA stops to satisfy the rule. When the strategy gets executed and there is another node available that satisfies the node affinity rule, podA gets evicted from nodeA. +The `preferredDuringSchedulingIgnoredDuringExecution` type tells the scheduler +to respect node affinity when scheduling if that's possible. If not, the pod +gets scheduled anyway. It may happen that, over time, the state of the cluster +changes and now the pod can be scheduled on a node that actually fits its +preferred node affinity. When enabled, the strategy serves as a temporary +implementation of `preferredDuringSchedulingPreferredDuringExecution`, so the +pod will be evicted if it can be scheduled on a "better" node.
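As an illustration of the new behaviour, here is a minimal Go sketch of the check this patch introduces, reusing the pod from the example above (it prefers nodes labelled `key1: value1` with weight 10) together with the two helper functions added to `pkg/descheduler/node` by this patch; the module import path and the `main` scaffolding are assumptions made only to keep the example runnable. The pod becomes an eviction candidate only when some other node scores strictly higher than the node it currently runs on; in the plugin itself this check is additionally combined with the evictor filter and a `PodFitsAnyNode` check.

```go
package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

	// Helpers added by this patch; the module path is an assumption here.
	nodeutil "sigs.k8s.io/descheduler/pkg/descheduler/node"
)

func main() {
	// Pod that prefers nodes labelled key1=value1 with a weight of 10.
	pod := &v1.Pod{
		ObjectMeta: metav1.ObjectMeta{Name: "podWithNodeAffinity"},
		Spec: v1.PodSpec{
			Affinity: &v1.Affinity{
				NodeAffinity: &v1.NodeAffinity{
					PreferredDuringSchedulingIgnoredDuringExecution: []v1.PreferredSchedulingTerm{{
						Weight: 10,
						Preference: v1.NodeSelectorTerm{
							MatchExpressions: []v1.NodeSelectorRequirement{{
								Key:      "key1",
								Operator: v1.NodeSelectorOpIn,
								Values:   []string{"value1"},
							}},
						},
					}},
				},
			},
		},
	}

	// The node the pod runs on has no matching label; another node does.
	current := &v1.Node{ObjectMeta: metav1.ObjectMeta{Name: "nodeA"}}
	better := &v1.Node{ObjectMeta: metav1.ObjectMeta{
		Name:   "nodeB",
		Labels: map[string]string{"key1": "value1"},
	}}

	// Core condition used by the preferred... branch of the plugin:
	// evict only if some node scores higher than the current one.
	if nodeutil.BestPodNodeAffinityWeight(pod, []*v1.Node{current, better}) >
		nodeutil.PodNodeAffinityWeight(pod, current) {
		fmt.Println("pod is a candidate for eviction")
	}
}
```

Under these assumptions the sketch prints the eviction message, because nodeB scores 10 while nodeA scores 0.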
+ **Parameters:** |Name|Type| diff --git a/pkg/descheduler/node/node.go b/pkg/descheduler/node/node.go index 4b8a472ac9..8bdc7a69ee 100644 --- a/pkg/descheduler/node/node.go +++ b/pkg/descheduler/node/node.go @@ -289,3 +289,37 @@ func IsBasicResource(name v1.ResourceName) bool { return false } } + +// Returns the weight that the pod gives to a node by analyzing the +// soft node affinity of that pod +// (nodeAffinity.preferredDuringSchedulingIgnoredDuringExecution) +func PodNodeAffinityWeight(pod *v1.Pod, node *v1.Node) int32 { + totalWeight, err := utils.PodNodeAffinityWeight(pod, node) + if err != nil { + return 0 + } + return totalWeight +} + +// Returns the best weight (maximum one) that the pod gives to the +// best node by analyzing the soft node affinity of that pod +// (nodeAffinity.preferredDuringSchedulingIgnoredDuringExecution) +func BestPodNodeAffinityWeight(pod *v1.Pod, nodes []*v1.Node) int32 { + var bestWeight int32 = 0 + for _, node := range nodes { + weight := PodNodeAffinityWeight(pod, node) + if weight > bestWeight { + bestWeight = weight + } + } + return bestWeight +} + +// PodMatchNodeSelector checks if a pod node selector matches the node label. +func PodMatchNodeSelector(pod *v1.Pod, node *v1.Node) bool { + matches, err := utils.PodMatchNodeSelector(pod, node) + if err != nil { + return false + } + return matches +} diff --git a/pkg/descheduler/node/node_test.go b/pkg/descheduler/node/node_test.go index 18d4ec284a..7ef7356550 100644 --- a/pkg/descheduler/node/node_test.go +++ b/pkg/descheduler/node/node_test.go @@ -811,6 +811,78 @@ func TestNodeFit(t *testing.T) { } } +func TestBestPodNodeAffinityWeight(t *testing.T) { + defaultPod := test.BuildTestPod("p1", 0, 0, "node1", func(p *v1.Pod) { + p.Spec.Affinity = &v1.Affinity{ + NodeAffinity: &v1.NodeAffinity{ + PreferredDuringSchedulingIgnoredDuringExecution: []v1.PreferredSchedulingTerm{ + { + Weight: 10, + Preference: v1.NodeSelectorTerm{ + MatchExpressions: []v1.NodeSelectorRequirement{ + { + Key: "key1", + Operator: "In", + Values: []string{"value1"}, + }, + }, + }, + }, + }, + }, + } + }) + tests := []struct { + description string + pod *v1.Pod + nodes []*v1.Node + expectedWeight int32 + }{ + { + description: "No node matches the preferred affinity", + pod: defaultPod, + nodes: []*v1.Node{ + test.BuildTestNode("node2", 64000, 128*1000*1000*1000, 200, func(node *v1.Node) { + node.ObjectMeta.Labels = map[string]string{ + "key2": "value2", + } + }), + test.BuildTestNode("node3", 64000, 128*1000*1000*1000, 200, func(node *v1.Node) { + node.ObjectMeta.Labels = map[string]string{ + "key3": "value3", + } + }), + }, + expectedWeight: 0, + }, + { + description: "A single node matches the preferred affinity", + pod: defaultPod, + nodes: []*v1.Node{ + test.BuildTestNode("node1", 64000, 128*1000*1000*1000, 200, func(node *v1.Node) { + node.ObjectMeta.Labels = map[string]string{ + "key1": "value1", + } + }), + test.BuildTestNode("node2", 64000, 128*1000*1000*1000, 200, func(node *v1.Node) { + node.ObjectMeta.Labels = map[string]string{ + "key2": "value2", + } + }), + }, + expectedWeight: 10, + }, + } + for _, tc := range tests { + t.Run(tc.description, func(t *testing.T) { + bestWeight := BestPodNodeAffinityWeight(tc.pod, tc.nodes) + if bestWeight != tc.expectedWeight { + t.Errorf("Test %#v failed", tc.description) + } + }) + } +} + // createResourceList builds a small resource list of core resources func createResourceList(cpu, memory, ephemeralStorage int64) v1.ResourceList { resourceList := 
make(map[v1.ResourceName]resource.Quantity) diff --git a/pkg/framework/plugins/removepodsviolatingnodeaffinity/node_affinity.go b/pkg/framework/plugins/removepodsviolatingnodeaffinity/node_affinity.go index f2fadbb394..3538ea5a23 100644 --- a/pkg/framework/plugins/removepodsviolatingnodeaffinity/node_affinity.go +++ b/pkg/framework/plugins/removepodsviolatingnodeaffinity/node_affinity.go @@ -78,40 +78,66 @@ func (d *RemovePodsViolatingNodeAffinity) Name() string { func (d *RemovePodsViolatingNodeAffinity) Deschedule(ctx context.Context, nodes []*v1.Node) *frameworktypes.Status { for _, nodeAffinity := range d.args.NodeAffinityType { klog.V(2).InfoS("Executing for nodeAffinityType", "nodeAffinity", nodeAffinity) + var err *frameworktypes.Status = nil + // The pods that we'll evict must be evictable. For example, the number of available replicas + // must be greater than the PDB's minAvailable. + // The pods must be able to get scheduled on a different node. Otherwise, it doesn't make much + // sense to evict them. switch nodeAffinity { case "requiredDuringSchedulingIgnoredDuringExecution": - for _, node := range nodes { - klog.V(2).InfoS("Processing node", "node", klog.KObj(node)) - - pods, err := podutil.ListPodsOnANode( - node.Name, - d.handle.GetPodsAssignedToNodeFunc(), - podutil.WrapFilterFuncs(d.podFilter, func(pod *v1.Pod) bool { - return d.handle.Evictor().Filter(pod) && - !nodeutil.PodFitsCurrentNode(d.handle.GetPodsAssignedToNodeFunc(), pod, node) && - nodeutil.PodFitsAnyNode(d.handle.GetPodsAssignedToNodeFunc(), pod, nodes) - }), - ) - if err != nil { - return &frameworktypes.Status{ - Err: fmt.Errorf("error listing pods on a node: %v", err), - } - } - - for _, pod := range pods { - if pod.Spec.Affinity != nil && pod.Spec.Affinity.NodeAffinity != nil && pod.Spec.Affinity.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution != nil { - klog.V(1).InfoS("Evicting pod", "pod", klog.KObj(pod)) - d.handle.Evictor().Evict(ctx, pod, evictions.EvictOptions{}) - if d.handle.Evictor().NodeLimitExceeded(node) { - break - } - } - } + // In this specific case, the pod must also violate the nodeSelector to be evicted + filterFunc := func(pod *v1.Pod, node *v1.Node, nodes []*v1.Node) bool { + return d.handle.Evictor().Filter(pod) && + nodeutil.PodFitsAnyNode(d.handle.GetPodsAssignedToNodeFunc(), pod, nodes) && + !nodeutil.PodMatchNodeSelector(pod, node) } + err = d.processNodes(ctx, nodes, filterFunc) + case "preferredDuringSchedulingIgnoredDuringExecution": + // In this specific case, the pod must have a better fit on another node than + // in the current one based on the preferred node affinity + filterFunc := func(pod *v1.Pod, node *v1.Node, nodes []*v1.Node) bool { + return d.handle.Evictor().Filter(pod) && + nodeutil.PodFitsAnyNode(d.handle.GetPodsAssignedToNodeFunc(), pod, nodes) && + (nodeutil.BestPodNodeAffinityWeight(pod, nodes) > nodeutil.PodNodeAffinityWeight(pod, node)) + } + err = d.processNodes(ctx, nodes, filterFunc) default: klog.ErrorS(nil, "Invalid nodeAffinityType", "nodeAffinity", nodeAffinity) } + + if err != nil { + return err + } + } + return nil +} + +func (d *RemovePodsViolatingNodeAffinity) processNodes(ctx context.Context, nodes []*v1.Node, filterFunc func(*v1.Pod, *v1.Node, []*v1.Node) bool) *frameworktypes.Status { + for _, node := range nodes { + klog.V(2).InfoS("Processing node", "node", klog.KObj(node)) + + // Potentially evictable pods + pods, err := podutil.ListPodsOnANode( + node.Name, + d.handle.GetPodsAssignedToNodeFunc(), + podutil.WrapFilterFuncs(d.podFilter,
func(pod *v1.Pod) bool { + return filterFunc(pod, node, nodes) + }), + ) + if err != nil { + return &frameworktypes.Status{ + Err: fmt.Errorf("error listing pods on a node: %v", err), + } + } + + for _, pod := range pods { + klog.V(1).InfoS("Evicting pod", "pod", klog.KObj(pod)) + d.handle.Evictor().Evict(ctx, pod, evictions.EvictOptions{}) + if d.handle.Evictor().NodeLimitExceeded(node) { + break + } + } } return nil } diff --git a/pkg/framework/plugins/removepodsviolatingnodeaffinity/node_affinity_test.go b/pkg/framework/plugins/removepodsviolatingnodeaffinity/node_affinity_test.go index d2e2c9f4ac..9cf0cdd2a2 100644 --- a/pkg/framework/plugins/removepodsviolatingnodeaffinity/node_affinity_test.go +++ b/pkg/framework/plugins/removepodsviolatingnodeaffinity/node_affinity_test.go @@ -48,26 +48,49 @@ func TestRemovePodsViolatingNodeAffinity(t *testing.T) { unschedulableNodeWithLabels.Labels[nodeLabelKey] = nodeLabelValue unschedulableNodeWithLabels.Spec.Unschedulable = true - addPodsToNode := func(node *v1.Node, deletionTimestamp *metav1.Time) []*v1.Pod { + addPodsToNode := func(node *v1.Node, deletionTimestamp *metav1.Time, affinityType string) []*v1.Pod { podWithNodeAffinity := test.BuildTestPod("podWithNodeAffinity", 100, 0, node.Name, nil) podWithNodeAffinity.Spec.Affinity = &v1.Affinity{ - NodeAffinity: &v1.NodeAffinity{ - RequiredDuringSchedulingIgnoredDuringExecution: &v1.NodeSelector{ - NodeSelectorTerms: []v1.NodeSelectorTerm{ - { - MatchExpressions: []v1.NodeSelectorRequirement{ - { - Key: nodeLabelKey, - Operator: "In", - Values: []string{ - nodeLabelValue, - }, + NodeAffinity: &v1.NodeAffinity{}, + } + + switch affinityType { + case "requiredDuringSchedulingIgnoredDuringExecution": + podWithNodeAffinity.Spec.Affinity.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution = &v1.NodeSelector{ + NodeSelectorTerms: []v1.NodeSelectorTerm{ + { + MatchExpressions: []v1.NodeSelectorRequirement{ + { + Key: nodeLabelKey, + Operator: "In", + Values: []string{ + nodeLabelValue, }, }, }, }, }, - }, + } + case "preferredDuringSchedulingIgnoredDuringExecution": + podWithNodeAffinity.Spec.Affinity.NodeAffinity.PreferredDuringSchedulingIgnoredDuringExecution = []v1.PreferredSchedulingTerm{ + { + Weight: 10, + Preference: v1.NodeSelectorTerm{ + MatchExpressions: []v1.NodeSelectorRequirement{ + { + Key: nodeLabelKey, + Operator: "In", + Values: []string{ + nodeLabelValue, + }, + }, + }, + }, + }, + } + case "requiredDuringSchedulingRequiredDuringExecution": + default: + t.Fatalf("Invalid affinity type %s", affinityType) } pod1 := test.BuildTestPod("pod1", 100, 0, node.Name, nil) @@ -77,6 +100,7 @@ func TestRemovePodsViolatingNodeAffinity(t *testing.T) { pod1.ObjectMeta.OwnerReferences = test.GetNormalPodOwnerRefList() pod2.ObjectMeta.OwnerReferences = test.GetNormalPodOwnerRefList() + podWithNodeAffinity.DeletionTimestamp = deletionTimestamp pod1.DeletionTimestamp = deletionTimestamp pod2.DeletionTimestamp = deletionTimestamp @@ -87,6 +111,7 @@ func TestRemovePodsViolatingNodeAffinity(t *testing.T) { } } + var uint0 uint = 0 var uint1 uint = 1 tests := []struct { description string @@ -104,16 +129,25 @@ func TestRemovePodsViolatingNodeAffinity(t *testing.T) { NodeAffinityType: []string{"requiredDuringSchedulingRequiredDuringExecution"}, }, expectedEvictedPodCount: 0, - pods: addPodsToNode(nodeWithoutLabels, nil), + pods: addPodsToNode(nodeWithoutLabels, nil, "requiredDuringSchedulingRequiredDuringExecution"), nodes: []*v1.Node{nodeWithoutLabels, nodeWithLabels}, }, { - description: "Pod is 
correctly scheduled on node, no eviction expected", + description: "Pod is correctly scheduled on node, no eviction expected [required affinity]", args: RemovePodsViolatingNodeAffinityArgs{ NodeAffinityType: []string{"requiredDuringSchedulingIgnoredDuringExecution"}, }, expectedEvictedPodCount: 0, - pods: addPodsToNode(nodeWithLabels, nil), + pods: addPodsToNode(nodeWithLabels, nil, "requiredDuringSchedulingIgnoredDuringExecution"), + nodes: []*v1.Node{nodeWithLabels}, + }, + { + description: "Pod is correctly scheduled on node, no eviction expected [preferred affinity]", + args: RemovePodsViolatingNodeAffinityArgs{ + NodeAffinityType: []string{"preferredDuringSchedulingIgnoredDuringExecution"}, + }, + expectedEvictedPodCount: 0, + pods: addPodsToNode(nodeWithLabels, nil, "preferredDuringSchedulingIgnoredDuringExecution"), nodes: []*v1.Node{nodeWithLabels}, }, { @@ -122,66 +156,176 @@ func TestRemovePodsViolatingNodeAffinity(t *testing.T) { args: RemovePodsViolatingNodeAffinityArgs{ NodeAffinityType: []string{"requiredDuringSchedulingIgnoredDuringExecution"}, }, - pods: addPodsToNode(nodeWithoutLabels, nil), + pods: addPodsToNode(nodeWithoutLabels, nil, "requiredDuringSchedulingIgnoredDuringExecution"), nodes: []*v1.Node{nodeWithoutLabels, nodeWithLabels}, }, { - description: "Pod is scheduled on node without matching labels, another schedulable node available, maxPodsToEvictPerNode set to 1, should not be evicted", + description: "Pod is scheduled on node without matching labels, another schedulable node available with better fit, should be evicted", + expectedEvictedPodCount: 1, + args: RemovePodsViolatingNodeAffinityArgs{ + NodeAffinityType: []string{"preferredDuringSchedulingIgnoredDuringExecution"}, + }, + pods: addPodsToNode(nodeWithoutLabels, nil, "preferredDuringSchedulingIgnoredDuringExecution"), + nodes: []*v1.Node{nodeWithoutLabels, nodeWithLabels}, + }, + { + description: "Pod is scheduled on node without matching labels, another schedulable node available, maxPodsToEvictPerNode set to 1, should be evicted [required affinity]", expectedEvictedPodCount: 1, args: RemovePodsViolatingNodeAffinityArgs{ NodeAffinityType: []string{"requiredDuringSchedulingIgnoredDuringExecution"}, }, - pods: addPodsToNode(nodeWithoutLabels, nil), + pods: addPodsToNode(nodeWithoutLabels, nil, "requiredDuringSchedulingIgnoredDuringExecution"), nodes: []*v1.Node{nodeWithoutLabels, nodeWithLabels}, maxPodsToEvictPerNode: &uint1, }, { - description: "Pod is scheduled on node without matching labels, another schedulable node available, maxPodsToEvictPerNode set to 1, no pod evicted since pod terminting", + description: "Pod is scheduled on node without matching labels, another schedulable node available, maxPodsToEvictPerNode set to 1, should be evicted [preferred affinity]", expectedEvictedPodCount: 1, + args: RemovePodsViolatingNodeAffinityArgs{ + NodeAffinityType: []string{"preferredDuringSchedulingIgnoredDuringExecution"}, + }, + pods: addPodsToNode(nodeWithoutLabels, nil, "preferredDuringSchedulingIgnoredDuringExecution"), + nodes: []*v1.Node{nodeWithoutLabels, nodeWithLabels}, + maxPodsToEvictPerNode: &uint1, + }, + { + description: "Pod is scheduled on node without matching labels, another schedulable node available, maxPodsToEvictPerNode set to 0, should be not evicted [required affinity]", + expectedEvictedPodCount: 0, args: RemovePodsViolatingNodeAffinityArgs{ NodeAffinityType: []string{"requiredDuringSchedulingIgnoredDuringExecution"}, }, - pods: addPodsToNode(nodeWithoutLabels, &metav1.Time{}), 
+ pods: addPodsToNode(nodeWithoutLabels, nil, "requiredDuringSchedulingIgnoredDuringExecution"), + nodes: []*v1.Node{nodeWithoutLabels, nodeWithLabels}, + maxPodsToEvictPerNode: &uint0, + }, + { + description: "Pod is scheduled on node without matching labels, another schedulable node available, maxPodsToEvictPerNode set to 0, should not be evicted [preferred affinity]", + expectedEvictedPodCount: 0, + args: RemovePodsViolatingNodeAffinityArgs{ + NodeAffinityType: []string{"preferredDuringSchedulingIgnoredDuringExecution"}, + }, + pods: addPodsToNode(nodeWithoutLabels, nil, "preferredDuringSchedulingIgnoredDuringExecution"), + nodes: []*v1.Node{nodeWithoutLabels, nodeWithLabels}, + maxPodsToEvictPerNode: &uint0, + }, + { + description: "Pod is scheduled on node without matching labels, another schedulable node available, maxPodsToEvictPerNode set to 1, no pod evicted since pod terminating [required affinity]", + expectedEvictedPodCount: 0, + args: RemovePodsViolatingNodeAffinityArgs{ + NodeAffinityType: []string{"requiredDuringSchedulingIgnoredDuringExecution"}, + }, + pods: addPodsToNode(nodeWithoutLabels, &metav1.Time{}, "requiredDuringSchedulingIgnoredDuringExecution"), nodes: []*v1.Node{nodeWithoutLabels, nodeWithLabels}, maxPodsToEvictPerNode: &uint1, }, { - description: "Pod is scheduled on node without matching labels, another schedulable node available, maxNoOfPodsToEvictPerNamespace set to 1, should not be evicted", + description: "Pod is scheduled on node without matching labels, another schedulable node available, maxPodsToEvictPerNode set to 1, no pod evicted since pod terminating [preferred affinity]", + expectedEvictedPodCount: 0, + args: RemovePodsViolatingNodeAffinityArgs{ + NodeAffinityType: []string{"preferredDuringSchedulingIgnoredDuringExecution"}, + }, + pods: addPodsToNode(nodeWithoutLabels, &metav1.Time{}, "preferredDuringSchedulingIgnoredDuringExecution"), + nodes: []*v1.Node{nodeWithoutLabels, nodeWithLabels}, + maxPodsToEvictPerNode: &uint1, + }, + { + description: "Pod is scheduled on node without matching labels, another schedulable node available, maxNoOfPodsToEvictPerNamespace set to 1, should be evicted [required affinity]", expectedEvictedPodCount: 1, args: RemovePodsViolatingNodeAffinityArgs{ NodeAffinityType: []string{"requiredDuringSchedulingIgnoredDuringExecution"}, }, - pods: addPodsToNode(nodeWithoutLabels, nil), + pods: addPodsToNode(nodeWithoutLabels, nil, "requiredDuringSchedulingIgnoredDuringExecution"), nodes: []*v1.Node{nodeWithoutLabels, nodeWithLabels}, maxNoOfPodsToEvictPerNamespace: &uint1, }, { - description: "Pod is scheduled on node without matching labels, another schedulable node available, maxNoOfPodsToEvictPerNamespace set to 1, no pod evicted since pod terminting", + description: "Pod is scheduled on node without matching labels, another schedulable node available, maxNoOfPodsToEvictPerNamespace set to 1, should be evicted [preferred affinity]", expectedEvictedPodCount: 1, + args: RemovePodsViolatingNodeAffinityArgs{ + NodeAffinityType: []string{"preferredDuringSchedulingIgnoredDuringExecution"}, + }, + pods: addPodsToNode(nodeWithoutLabels, nil, "preferredDuringSchedulingIgnoredDuringExecution"), + nodes: []*v1.Node{nodeWithoutLabels, nodeWithLabels}, + maxNoOfPodsToEvictPerNamespace: &uint1, + }, + { + description: "Pod is scheduled on node without matching labels, another schedulable node available, maxNoOfPodsToEvictPerNamespace set to 0, should not be evicted [required affinity]", + expectedEvictedPodCount: 0, + args:
RemovePodsViolatingNodeAffinityArgs{ + NodeAffinityType: []string{"requiredDuringSchedulingIgnoredDuringExecution"}, + }, + pods: addPodsToNode(nodeWithoutLabels, nil, "requiredDuringSchedulingIgnoredDuringExecution"), + nodes: []*v1.Node{nodeWithoutLabels, nodeWithLabels}, + maxNoOfPodsToEvictPerNamespace: &uint0, + }, + { + description: "Pod is scheduled on node without matching labels, another schedulable node available, maxNoOfPodsToEvictPerNamespace set to 0, should not be evicted [preferred affinity]", + expectedEvictedPodCount: 0, + args: RemovePodsViolatingNodeAffinityArgs{ + NodeAffinityType: []string{"preferredDuringSchedulingIgnoredDuringExecution"}, + }, + pods: addPodsToNode(nodeWithoutLabels, nil, "preferredDuringSchedulingIgnoredDuringExecution"), + nodes: []*v1.Node{nodeWithoutLabels, nodeWithLabels}, + maxNoOfPodsToEvictPerNamespace: &uint0, + }, + { + description: "Pod is scheduled on node without matching labels, another schedulable node available, maxNoOfPodsToEvictPerNamespace set to 1, no pod evicted since pod terminating [required affinity]", + expectedEvictedPodCount: 0, args: RemovePodsViolatingNodeAffinityArgs{ NodeAffinityType: []string{"requiredDuringSchedulingIgnoredDuringExecution"}, }, - pods: addPodsToNode(nodeWithoutLabels, &metav1.Time{}), + pods: addPodsToNode(nodeWithoutLabels, &metav1.Time{}, "requiredDuringSchedulingIgnoredDuringExecution"), nodes: []*v1.Node{nodeWithoutLabels, nodeWithLabels}, maxNoOfPodsToEvictPerNamespace: &uint1, }, { - description: "Pod is scheduled on node without matching labels, but no node where pod fits is available, should not evict", + description: "Pod is scheduled on node without matching labels, another schedulable node available, maxNoOfPodsToEvictPerNamespace set to 1, no pod evicted since pod terminating [preferred affinity]", + expectedEvictedPodCount: 0, + args: RemovePodsViolatingNodeAffinityArgs{ + NodeAffinityType: []string{"preferredDuringSchedulingIgnoredDuringExecution"}, + }, + pods: addPodsToNode(nodeWithoutLabels, &metav1.Time{}, "preferredDuringSchedulingIgnoredDuringExecution"), + nodes: []*v1.Node{nodeWithoutLabels, nodeWithLabels}, + maxNoOfPodsToEvictPerNamespace: &uint1, + }, + { + description: "Pod is scheduled on node without matching labels, but no node where pod fits is available, should not evict [required affinity]", expectedEvictedPodCount: 0, args: RemovePodsViolatingNodeAffinityArgs{ NodeAffinityType: []string{"requiredDuringSchedulingIgnoredDuringExecution"}, }, - pods: addPodsToNode(nodeWithoutLabels, nil), + pods: addPodsToNode(nodeWithoutLabels, nil, "requiredDuringSchedulingIgnoredDuringExecution"), + nodes: []*v1.Node{nodeWithoutLabels, unschedulableNodeWithLabels}, + nodefit: true, + }, + { + description: "Pod is scheduled on node without matching labels, but no node where pod fits is available, should not evict [preferred affinity]", + expectedEvictedPodCount: 0, + args: RemovePodsViolatingNodeAffinityArgs{ + NodeAffinityType: []string{"preferredDuringSchedulingIgnoredDuringExecution"}, + }, + pods: addPodsToNode(nodeWithoutLabels, nil, "preferredDuringSchedulingIgnoredDuringExecution"), nodes: []*v1.Node{nodeWithoutLabels, unschedulableNodeWithLabels}, nodefit: true, }, { - description: "Pod is scheduled on node without matching labels, and node where pod fits is available, should evict", + description: "Pod is scheduled on node without matching labels, and unschedulable node where pod could fit is available, should not evict [required affinity]", expectedEvictedPodCount: 0, args:
RemovePodsViolatingNodeAffinityArgs{ NodeAffinityType: []string{"requiredDuringSchedulingIgnoredDuringExecution"}, }, - pods: addPodsToNode(nodeWithoutLabels, nil), + pods: addPodsToNode(nodeWithoutLabels, nil, "requiredDuringSchedulingIgnoredDuringExecution"), + nodes: []*v1.Node{nodeWithLabels, unschedulableNodeWithLabels}, + maxPodsToEvictPerNode: &uint1, + nodefit: true, + }, + { + description: "Pod is scheduled on node without matching labels, and unschedulable node where pod could fit is available, should not evict [preferred affinity]", + expectedEvictedPodCount: 0, + args: RemovePodsViolatingNodeAffinityArgs{ + NodeAffinityType: []string{"preferredDuringSchedulingIgnoredDuringExecution"}, + }, + pods: addPodsToNode(nodeWithoutLabels, nil, "preferredDuringSchedulingIgnoredDuringExecution"), nodes: []*v1.Node{nodeWithLabels, unschedulableNodeWithLabels}, maxPodsToEvictPerNode: &uint1, nodefit: true, diff --git a/pkg/utils/predicates.go b/pkg/utils/predicates.go index bc7b59a2fa..37e4768321 100644 --- a/pkg/utils/predicates.go +++ b/pkg/utils/predicates.go @@ -275,3 +275,29 @@ func TolerationsEqual(t1, t2 []v1.Toleration) bool { } return true } + +// Returns the weight that the pod gives to a node by analyzing the +// soft node affinity of that pod +// (nodeAffinity.preferredDuringSchedulingIgnoredDuringExecution) +func PodNodeAffinityWeight(pod *v1.Pod, node *v1.Node) (int32, error) { + if pod.Spec.Affinity == nil || + pod.Spec.Affinity.NodeAffinity == nil || + len(pod.Spec.Affinity.NodeAffinity.PreferredDuringSchedulingIgnoredDuringExecution) == 0 { + return 0, nil + } + // Iterate over each PreferredSchedulingTerm and check if it matches with the current node labels. + // If so, add the weight of the PreferredSchedulingTerm to the sum of weight. With that, we'll know + // the weight that the nodeAffinity from this pod gives to this node. 
+ var sumWeights int32 = 0 + for _, prefSchedulTerm := range pod.Spec.Affinity.NodeAffinity.PreferredDuringSchedulingIgnoredDuringExecution { + preferredNodeSelector := &v1.NodeSelector{NodeSelectorTerms: []v1.NodeSelectorTerm{prefSchedulTerm.Preference}} + match, err := corev1.MatchNodeSelectorTerms(node, preferredNodeSelector) + if err != nil { + klog.ErrorS(err, "error parsing node selector", "selector", preferredNodeSelector) + } + if match { + sumWeights += prefSchedulTerm.Weight + } + } + return sumWeights, nil +} diff --git a/pkg/utils/predicates_test.go b/pkg/utils/predicates_test.go index defef78afa..f473d592fd 100644 --- a/pkg/utils/predicates_test.go +++ b/pkg/utils/predicates_test.go @@ -5,6 +5,7 @@ import ( "testing" v1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) func TestUniqueSortTolerations(t *testing.T) { @@ -938,3 +939,105 @@ func TestNodeSelectorTermsEqual(t *testing.T) { }) } } + +func createNodeSelectorTerm(key, value string) v1.NodeSelectorTerm { + return v1.NodeSelectorTerm{ + MatchExpressions: []v1.NodeSelectorRequirement{ + { + Key: key, + Operator: "In", + Values: []string{value}, + }, + }, + } +} + +func TestPodNodeAffinityWeight(t *testing.T) { + defaultNode := v1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{ + "key1": "value1", + "key2": "value2", + "key3": "value3", + }, + }, + } + tests := []struct { + name string + affinity *v1.Affinity + expectedWeight int32 + }{ + { + name: "No affinity", + affinity: nil, + expectedWeight: 0, + }, + { + name: "No node affinity", + affinity: &v1.Affinity{}, + expectedWeight: 0, + }, + { + name: "Empty preferred node affinity, but matching required node affinity", + affinity: &v1.Affinity{ + NodeAffinity: &v1.NodeAffinity{ + RequiredDuringSchedulingIgnoredDuringExecution: &v1.NodeSelector{ + NodeSelectorTerms: []v1.NodeSelectorTerm{ + createNodeSelectorTerm("key1", "value1"), + }, + }, + }, + }, + expectedWeight: 0, + }, + { + name: "Matching single key in preferred node affinity", + affinity: &v1.Affinity{ + NodeAffinity: &v1.NodeAffinity{ + PreferredDuringSchedulingIgnoredDuringExecution: []v1.PreferredSchedulingTerm{ + { + Weight: 10, + Preference: createNodeSelectorTerm("key1", "value1"), + }, + { + Weight: 5, + Preference: createNodeSelectorTerm("key1", "valueX"), + }, + }, + }, + }, + expectedWeight: 10, + }, + { + name: "Matching two keys in preferred node affinity", + affinity: &v1.Affinity{ + NodeAffinity: &v1.NodeAffinity{ + PreferredDuringSchedulingIgnoredDuringExecution: []v1.PreferredSchedulingTerm{ + { + Weight: 10, + Preference: createNodeSelectorTerm("key1", "value1"), + }, + { + Weight: 5, + Preference: createNodeSelectorTerm("key2", "value2"), + }, + }, + }, + }, + expectedWeight: 15, + }, + } + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + pod := v1.Pod{} + pod.Spec.Affinity = test.affinity + totalWeight, err := PodNodeAffinityWeight(&pod, &defaultNode) + if err != nil { + t.Error("Found non nil error") + } + if totalWeight != test.expectedWeight { + t.Errorf("Expected total weight is %v but actual total weight is %v", test.expectedWeight, totalWeight) + } + }) + } +}
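For reference, here is a small standalone sketch of the matching step that the new `PodNodeAffinityWeight` helper relies on, assuming the `corev1` alias in predicates.go refers to `k8s.io/component-helpers/scheduling/corev1` (the package providing `MatchNodeSelectorTerms`); the `main` scaffolding is an assumption for the sake of a runnable example. Each `PreferredSchedulingTerm` preference is wrapped in a `NodeSelector`, matched against the node's labels, and the weights of the matching terms are summed, which is how the "Matching two keys in preferred node affinity" test case above arrives at a total weight of 15.

```go
package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

	// Assumed to be the helper package aliased as corev1 in predicates.go.
	corev1 "k8s.io/component-helpers/scheduling/corev1"
)

func main() {
	node := &v1.Node{ObjectMeta: metav1.ObjectMeta{
		Labels: map[string]string{"key1": "value1", "key2": "value2"},
	}}

	terms := []v1.PreferredSchedulingTerm{
		{Weight: 10, Preference: v1.NodeSelectorTerm{MatchExpressions: []v1.NodeSelectorRequirement{
			{Key: "key1", Operator: v1.NodeSelectorOpIn, Values: []string{"value1"}},
		}}},
		{Weight: 5, Preference: v1.NodeSelectorTerm{MatchExpressions: []v1.NodeSelectorRequirement{
			{Key: "key2", Operator: v1.NodeSelectorOpIn, Values: []string{"value2"}},
		}}},
	}

	var sum int32
	for _, term := range terms {
		// Wrap the preference in a NodeSelector, as PodNodeAffinityWeight does.
		selector := &v1.NodeSelector{NodeSelectorTerms: []v1.NodeSelectorTerm{term.Preference}}
		match, err := corev1.MatchNodeSelectorTerms(node, selector)
		if err != nil {
			continue // malformed terms contribute no weight, mirroring the helper
		}
		if match {
			sum += term.Weight
		}
	}
	fmt.Println(sum) // prints 15: both preferred terms match this node
}
```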