feat: Implement preferredDuringSchedulingIgnoredDuringExecution for RemovePodsViolatingNodeAffinity

With this change, the descheduler can detect and evict pods that are not
optimally placed according to their "preferred..." node affinity. It only
evicts a pod if the pod can be scheduled on a node that scores higher in
terms of preferred node affinity than its current one.

This can be activated by enabling the RemovePodsViolatingNodeAffinity
plugin and passing "preferredDuringSchedulingIgnoredDuringExecution" in
the args.
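
For illustration, a minimal policy sketch that enables this behavior
(assuming the v1alpha2 policy API; the profile name is illustrative):

apiVersion: "descheduler/v1alpha2"
kind: "DeschedulerPolicy"
profiles:
  - name: default-profile   # illustrative name
    pluginConfig:
      - name: "RemovePodsViolatingNodeAffinity"
        args:
          nodeAffinityType:
            - "preferredDuringSchedulingIgnoredDuringExecution"
    plugins:
      deschedule:
        enabled:
          - "RemovePodsViolatingNodeAffinity"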

For example, imagine a pod that prefers nodes with the label "key1:
value1" with a weight of 10. If this pod is scheduled on a node that
doesn't have the "key1: value1" label, but another node has this label
and can potentially run the pod, then the descheduler will evict the pod.
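
In pod-spec terms, the preference in this example corresponds roughly to
the following snippet (illustrative values only):

affinity:
  nodeAffinity:
    preferredDuringSchedulingIgnoredDuringExecution:
      - weight: 10
        preference:
          matchExpressions:
            - key: key1
              operator: In
              values:
                - value1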

Another effect of this commit is that the
RemovePodsViolatingNodeAffinity plugin no longer removes pods that don't
fit on their current node for reasons other than violating the node
affinity. Previously, enabling this plugin could evict pods that were
running on tainted nodes without the necessary tolerations.
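
As a hypothetical illustration of the old behavior: a pod whose node
affinity is satisfied, but which runs on a node carrying a taint like the
one below without a matching toleration, could previously be evicted by
this plugin; now it is left alone.

# Hypothetical taint on the pod's current node
taints:
  - key: "dedicated"
    value: "special"
    effect: "NoSchedule"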

This commit also fixes the wording of some tests in
node_affinity_test.go, as well as some incorrect parameters and
expectations in those tests.
jordipiqueselles committed Jul 30, 2023
1 parent 1be0ab2 commit cd4ab57
Showing 7 changed files with 473 additions and 57 deletions.
13 changes: 12 additions & 1 deletion README.md
@@ -436,7 +436,10 @@ profiles:
This strategy makes sure all pods violating
[node affinity](https://kubernetes.io/docs/concepts/configuration/assign-pod-node/#node-affinity)
are eventually removed from nodes. Node affinity rules allow a pod to specify
`requiredDuringSchedulingIgnoredDuringExecution` type, which tells the scheduler
`requiredDuringSchedulingIgnoredDuringExecution` and/or
`preferredDuringSchedulingIgnoredDuringExecution`.

The `requiredDuringSchedulingIgnoredDuringExecution` type tells the scheduler
to respect node affinity when scheduling the pod but kubelet to ignore
in case node changes over time and no longer respects the affinity.
When enabled, the strategy serves as a temporary implementation
@@ -449,6 +452,14 @@ of scheduling. Over time nodeA stops to satisfy the rule. When the strategy gets
executed and there is another node available that satisfies the node affinity rule,
podA gets evicted from nodeA.

The `preferredDuringSchedulingIgnoredDuringExecution` type tells the scheduler
to respect node affinity when scheduling if that's possible. If not, the pod
gets scheduled anyway. It may happen that, over time, the state of the cluster
changes and now the pod can be scheduled on a node that actually fits its
preferred node affinity. When enabled, the strategy serves as a temporary
implementation of `preferredDuringSchedulingPreferredDuringExecution`, so the
pod will be evicted if it can be scheduled on a "better" node.

**Parameters:**

|Name|Type|
34 changes: 34 additions & 0 deletions pkg/descheduler/node/node.go
@@ -289,3 +289,37 @@ func IsBasicResource(name v1.ResourceName) bool {
return false
}
}

// PodNodeAffinityWeight returns the weight that the pod gives to a node by
// analyzing the soft node affinity of that pod
// (nodeAffinity.preferredDuringSchedulingIgnoredDuringExecution).
func PodNodeAffinityWeight(pod *v1.Pod, node *v1.Node) int32 {
totalWeight, err := utils.PodNodeAffinityWeight(pod, node)
if err != nil {
return 0
}
return totalWeight
}

// BestPodNodeAffinityWeight returns the best (maximum) weight that the pod
// gives to any of the given nodes, by analyzing the soft node affinity of
// that pod (nodeAffinity.preferredDuringSchedulingIgnoredDuringExecution).
func BestPodNodeAffinityWeight(pod *v1.Pod, nodes []*v1.Node) int32 {
var bestWeight int32 = 0
for _, node := range nodes {
weight := PodNodeAffinityWeight(pod, node)
if weight > bestWeight {
bestWeight = weight
}
}
return bestWeight
}

// PodMatchNodeSelector checks if a pod node selector matches the node label.
func PodMatchNodeSelector(pod *v1.Pod, node *v1.Node) bool {
matches, err := utils.PodMatchNodeSelector(pod, node)
if err != nil {
return false
}
return matches
}
72 changes: 72 additions & 0 deletions pkg/descheduler/node/node_test.go
@@ -811,6 +811,78 @@ func TestNodeFit(t *testing.T) {
}
}

func TestBestPodNodeAffinityWeight(t *testing.T) {
defaultPod := test.BuildTestPod("p1", 0, 0, "node1", func(p *v1.Pod) {
p.Spec.Affinity = &v1.Affinity{
NodeAffinity: &v1.NodeAffinity{
PreferredDuringSchedulingIgnoredDuringExecution: []v1.PreferredSchedulingTerm{
{
Weight: 10,
Preference: v1.NodeSelectorTerm{
MatchExpressions: []v1.NodeSelectorRequirement{
{
Key: "key1",
Operator: "In",
Values: []string{"value1"},
},
},
},
},
},
},
}
})
tests := []struct {
description string
pod *v1.Pod
nodes []*v1.Node
expectedWeight int32
}{
{
description: "No node matches the preferred affinity",
pod: defaultPod,
nodes: []*v1.Node{
test.BuildTestNode("node2", 64000, 128*1000*1000*1000, 200, func(node *v1.Node) {
node.ObjectMeta.Labels = map[string]string{
"key2": "value2",
}
}),
test.BuildTestNode("node3", 64000, 128*1000*1000*1000, 200, func(node *v1.Node) {
node.ObjectMeta.Labels = map[string]string{
"key3": "value3",
}
}),
},
expectedWeight: 0,
},
{
description: "A single node matches the preferred affinity",
pod: defaultPod,
nodes: []*v1.Node{
test.BuildTestNode("node1", 64000, 128*1000*1000*1000, 200, func(node *v1.Node) {
node.ObjectMeta.Labels = map[string]string{
"key1": "value1",
}
}),
test.BuildTestNode("node2", 64000, 128*1000*1000*1000, 200, func(node *v1.Node) {
node.ObjectMeta.Labels = map[string]string{
"key2": "value2",
}
}),
},
expectedWeight: 10,
},
}
for _, tc := range tests {
t.Run(tc.description, func(t *testing.T) {
bestWeight := BestPodNodeAffinityWeight(tc.pod, tc.nodes)
if bestWeight != tc.expectedWeight {
t.Errorf("Test %#v failed", tc.description)
}
})
}
}

// createResourceList builds a small resource list of core resources
func createResourceList(cpu, memory, ephemeralStorage int64) v1.ResourceList {
resourceList := make(map[v1.ResourceName]resource.Quantity)
@@ -78,40 +78,66 @@ func (d *RemovePodsViolatingNodeAffinity) Name() string {
func (d *RemovePodsViolatingNodeAffinity) Deschedule(ctx context.Context, nodes []*v1.Node) *frameworktypes.Status {
for _, nodeAffinity := range d.args.NodeAffinityType {
klog.V(2).InfoS("Executing for nodeAffinityType", "nodeAffinity", nodeAffinity)
var err *frameworktypes.Status = nil

// The pods that we'll evict must be evictable. For example, the current number of replicas
// must be greater than the PDB's minAvailable.
// The pods must be able to get scheduled on a different node. Otherwise, it doesn't make much
// sense to evict them.
switch nodeAffinity {
case "requiredDuringSchedulingIgnoredDuringExecution":
for _, node := range nodes {
klog.V(2).InfoS("Processing node", "node", klog.KObj(node))

pods, err := podutil.ListPodsOnANode(
node.Name,
d.handle.GetPodsAssignedToNodeFunc(),
podutil.WrapFilterFuncs(d.podFilter, func(pod *v1.Pod) bool {
return d.handle.Evictor().Filter(pod) &&
!nodeutil.PodFitsCurrentNode(d.handle.GetPodsAssignedToNodeFunc(), pod, node) &&
nodeutil.PodFitsAnyNode(d.handle.GetPodsAssignedToNodeFunc(), pod, nodes)
}),
)
if err != nil {
return &frameworktypes.Status{
Err: fmt.Errorf("error listing pods on a node: %v", err),
}
}

for _, pod := range pods {
if pod.Spec.Affinity != nil && pod.Spec.Affinity.NodeAffinity != nil && pod.Spec.Affinity.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution != nil {
klog.V(1).InfoS("Evicting pod", "pod", klog.KObj(pod))
d.handle.Evictor().Evict(ctx, pod, evictions.EvictOptions{})
if d.handle.Evictor().NodeLimitExceeded(node) {
break
}
}
}
// In this specific case, the pod must also violate the nodeSelector to be evicted
filterFunc := func(pod *v1.Pod, node *v1.Node, nodes []*v1.Node) bool {
return d.handle.Evictor().Filter(pod) &&
nodeutil.PodFitsAnyNode(d.handle.GetPodsAssignedToNodeFunc(), pod, nodes) &&
!nodeutil.PodMatchNodeSelector(pod, node)
}
err = d.processNodes(ctx, nodes, filterFunc)
case "preferredDuringSchedulingIgnoredDuringExecution":
// In this specific case, the pod must have a better fit on another node than
// in the current one based on the preferred node affinity
filterFunc := func(pod *v1.Pod, node *v1.Node, nodes []*v1.Node) bool {
return d.handle.Evictor().Filter(pod) &&
nodeutil.PodFitsAnyNode(d.handle.GetPodsAssignedToNodeFunc(), pod, nodes) &&
(nodeutil.BestPodNodeAffinityWeight(pod, nodes) > nodeutil.PodNodeAffinityWeight(pod, node))
}
err = d.processNodes(ctx, nodes, filterFunc)
default:
klog.ErrorS(nil, "Invalid nodeAffinityType", "nodeAffinity", nodeAffinity)
}

if err != nil {
return err
}
}
return nil
}

func (d *RemovePodsViolatingNodeAffinity) processNodes(ctx context.Context, nodes []*v1.Node, filterFunc func(*v1.Pod, *v1.Node, []*v1.Node) bool) *frameworktypes.Status {
for _, node := range nodes {
klog.V(2).InfoS("Processing node", "node", klog.KObj(node))

// Potentially evictable pods
pods, err := podutil.ListPodsOnANode(
node.Name,
d.handle.GetPodsAssignedToNodeFunc(),
podutil.WrapFilterFuncs(d.podFilter, func(pod *v1.Pod) bool {
return filterFunc(pod, node, nodes)
}),
)
if err != nil {
return &frameworktypes.Status{
Err: fmt.Errorf("error listing pods on a node: %v", err),
}
}

for _, pod := range pods {
klog.V(1).InfoS("Evicting pod", "pod", klog.KObj(pod))
d.handle.Evictor().Evict(ctx, pod, evictions.EvictOptions{})
if d.handle.Evictor().NodeLimitExceeded(node) {
break
}
}
}
return nil
}