diff --git a/README.md b/README.md index c3a4e01694..87b88475fa 100644 --- a/README.md +++ b/README.md @@ -44,6 +44,7 @@ Table of Contents - [Namespace filtering](#namespace-filtering) - [Priority filtering](#priority-filtering) - [Label filtering](#label-filtering) + - [Node Fit filtering](#node-fit-filtering) - [Pod Evictions](#pod-evictions) - [Pod Disruption Budget (PDB)](#pod-disruption-budget-pdb) - [Metrics](#metrics) @@ -157,6 +158,7 @@ should include `ReplicaSet` to have pods created by Deployments excluded. |`namespaces`|(see [namespace filtering](#namespace-filtering))| |`thresholdPriority`|int (see [priority filtering](#priority-filtering))| |`thresholdPriorityClassName`|string (see [priority filtering](#priority-filtering))| +|`nodeFit`|bool (see [node fit filtering](#node-fit-filtering))| **Example:** ```yaml @@ -204,6 +206,7 @@ strategy evicts pods from `overutilized nodes` (those with usage above `targetTh |`numberOfNodes`|int| |`thresholdPriority`|int (see [priority filtering](#priority-filtering))| |`thresholdPriorityClassName`|string (see [priority filtering](#priority-filtering))| +|`nodeFit`|bool (see [node fit filtering](#node-fit-filtering))| **Example:** @@ -253,6 +256,7 @@ node. |`thresholdPriorityClassName`|string (see [priority filtering](#priority-filtering))| |`namespaces`|(see [namespace filtering](#namespace-filtering))| |`labelSelector`|(see [label filtering](#label-filtering))| +|`nodeFit`|bool (see [node fit filtering](#node-fit-filtering))| **Example:** @@ -291,6 +295,7 @@ podA gets evicted from nodeA. |`thresholdPriorityClassName`|string (see [priority filtering](#priority-filtering))| |`namespaces`|(see [namespace filtering](#namespace-filtering))| |`labelSelector`|(see [label filtering](#label-filtering))| +|`nodeFit`|bool (see [node fit filtering](#node-fit-filtering))| **Example:** @@ -320,6 +325,7 @@ and will be evicted. |`thresholdPriorityClassName`|string (see [priority filtering](#priority-filtering))| |`namespaces`|(see [namespace filtering](#namespace-filtering))| |`labelSelector`|(see [label filtering](#label-filtering))| +|`nodeFit`|bool (see [node fit filtering](#node-fit-filtering))| **Example:** @@ -348,6 +354,7 @@ include soft constraints. |`thresholdPriority`|int (see [priority filtering](#priority-filtering))| |`thresholdPriorityClassName`|string (see [priority filtering](#priority-filtering))| |`namespaces`|(see [namespace filtering](#namespace-filtering))| +|`nodeFit`|bool (see [node fit filtering](#node-fit-filtering))| **Example:** @@ -379,6 +386,7 @@ which determines whether init container restarts should be factored into that ca |`thresholdPriority`|int (see [priority filtering](#priority-filtering))| |`thresholdPriorityClassName`|string (see [priority filtering](#priority-filtering))| |`namespaces`|(see [namespace filtering](#namespace-filtering))| +|`nodeFit`|bool (see [node fit filtering](#node-fit-filtering))| **Example:** @@ -551,6 +559,49 @@ strategies: - {key: environment, operator: NotIn, values: [dev]} ``` + +### Node Fit filtering + +The following strategies accept a `nodeFit` boolean parameter which can optimize descheduling: +* `RemoveDuplicates` +* `LowNodeUtilization` +* `RemovePodsViolatingInterPodAntiAffinity` +* `RemovePodsViolatingNodeAffinity` +* `RemovePodsViolatingNodeTaints` +* `RemovePodsViolatingTopologySpreadConstraint` +* `RemovePodsHavingTooManyRestarts` + + If set to `true` the descheduler will consider whether or not the pods that meet eviction criteria will fit on other nodes before evicting them. 
If a pod cannot be rescheduled to another node, it will not be evicted. Currently, the following criteria are considered when setting `nodeFit` to `true`: +- A `nodeSelector` on the pod +- Any `Tolerations` on the pod and any `Taints` on the other nodes +- `nodeAffinity` on the pod +- Whether any of the other nodes are marked as `unschedulable` + +**Example:** + +```yaml +apiVersion: "descheduler/v1alpha1" +kind: "DeschedulerPolicy" +strategies: + "LowNodeUtilization": + enabled: true + params: + nodeResourceUtilizationThresholds: + thresholds: + "cpu" : 20 + "memory": 20 + "pods": 20 + targetThresholds: + "cpu" : 50 + "memory": 50 + "pods": 50 + nodeFit: true +``` + +Note that node fit filtering references the current pod spec, and not that of its owner. Thus, if the pod is owned by a ReplicationController (and that ReplicationController was modified recently), the pod may be running with an outdated spec, which the descheduler will reference when determining node fit. This is expected behavior, as the descheduler is a "best-effort" mechanism. + +Using Deployments instead of ReplicationControllers provides an automated rollout of pod spec changes, thereby ensuring that the descheduler has an up-to-date view of the cluster state. + ## Pod Evictions When the descheduler decides to evict pods from a node, it employs the following general mechanism: diff --git a/pkg/api/types.go b/pkg/api/types.go index 458f54f989..afb2acbc39 100644 --- a/pkg/api/types.go +++ b/pkg/api/types.go @@ -80,6 +80,7 @@ type StrategyParameters struct { ThresholdPriority *int32 ThresholdPriorityClassName string LabelSelector *metav1.LabelSelector + NodeFit bool } type Percentage float64 diff --git a/pkg/api/v1alpha1/types.go b/pkg/api/v1alpha1/types.go index e67beff32b..f18a5ee2a2 100644 --- a/pkg/api/v1alpha1/types.go +++ b/pkg/api/v1alpha1/types.go @@ -78,6 +78,7 @@ type StrategyParameters struct { ThresholdPriority *int32 `json:"thresholdPriority"` ThresholdPriorityClassName string `json:"thresholdPriorityClassName"` LabelSelector *metav1.LabelSelector `json:"labelSelector"` + NodeFit bool `json:"nodeFit"` } type Percentage float64 diff --git a/pkg/api/v1alpha1/zz_generated.conversion.go b/pkg/api/v1alpha1/zz_generated.conversion.go index 90351de35f..1b91f8f503 100644 --- a/pkg/api/v1alpha1/zz_generated.conversion.go +++ b/pkg/api/v1alpha1/zz_generated.conversion.go @@ -294,6 +294,7 @@ func autoConvert_v1alpha1_StrategyParameters_To_api_StrategyParameters(in *Strat out.ThresholdPriority = (*int32)(unsafe.Pointer(in.ThresholdPriority)) out.ThresholdPriorityClassName = in.ThresholdPriorityClassName out.LabelSelector = (*v1.LabelSelector)(unsafe.Pointer(in.LabelSelector)) + out.NodeFit = in.NodeFit return nil } @@ -313,6 +314,7 @@ func autoConvert_api_StrategyParameters_To_v1alpha1_StrategyParameters(in *api.S out.ThresholdPriority = (*int32)(unsafe.Pointer(in.ThresholdPriority)) out.ThresholdPriorityClassName = in.ThresholdPriorityClassName out.LabelSelector = (*v1.LabelSelector)(unsafe.Pointer(in.LabelSelector)) + out.NodeFit = in.NodeFit return nil } diff --git a/pkg/descheduler/evictions/evictions.go b/pkg/descheduler/evictions/evictions.go index 4ec78bd082..127041f700 100644 --- a/pkg/descheduler/evictions/evictions.go +++ b/pkg/descheduler/evictions/evictions.go @@ -32,6 +32,7 @@ import ( "k8s.io/client-go/tools/record" "k8s.io/klog/v2" "sigs.k8s.io/descheduler/metrics" + nodeutil "sigs.k8s.io/descheduler/pkg/descheduler/node" podutil "sigs.k8s.io/descheduler/pkg/descheduler/pod" "sigs.k8s.io/descheduler/pkg/utils" @@
-47,6 +48,7 @@ type nodePodEvictedCount map[*v1.Node]int type PodEvictor struct { client clientset.Interface + nodes []*v1.Node policyGroupVersion string dryRun bool maxPodsToEvictPerNode int @@ -74,6 +76,7 @@ func NewPodEvictor( return &PodEvictor{ client: client, + nodes: nodes, policyGroupVersion: policyGroupVersion, dryRun: dryRun, maxPodsToEvictPerNode: maxPodsToEvictPerNode, @@ -164,6 +167,7 @@ func evictPod(ctx context.Context, client clientset.Interface, pod *v1.Pod, poli type Options struct { priority *int32 + nodeFit bool } // WithPriorityThreshold sets a threshold for pod's priority class. @@ -175,6 +179,16 @@ func WithPriorityThreshold(priority int32) func(opts *Options) { } } +// WithNodeFit sets whether or not to consider taints, node selectors, +// and pod affinity when evicting. A pod who's tolerations, node selectors, +// and affinity match a node other than the one it is currently running on +// is evictable. +func WithNodeFit(nodeFit bool) func(opts *Options) { + return func(opts *Options) { + opts.nodeFit = nodeFit + } +} + type constraint func(pod *v1.Pod) error type evictable struct { @@ -225,6 +239,14 @@ func (pe *PodEvictor) Evictable(opts ...func(opts *Options)) *evictable { return nil }) } + if options.nodeFit { + ev.constraints = append(ev.constraints, func(pod *v1.Pod) error { + if !nodeutil.PodFitsAnyOtherNode(pod, pe.nodes) { + return fmt.Errorf("pod does not fit on any other node because of nodeSelector(s), Taint(s), or nodes marked as unschedulable") + } + return nil + }) + } return ev } diff --git a/pkg/descheduler/evictions/evictions_test.go b/pkg/descheduler/evictions/evictions_test.go index fb4bf4d744..aff06803f3 100644 --- a/pkg/descheduler/evictions/evictions_test.go +++ b/pkg/descheduler/evictions/evictions_test.go @@ -73,19 +73,27 @@ func TestIsEvictable(t *testing.T) { n1 := test.BuildTestNode("node1", 1000, 2000, 13, nil) lowPriority := int32(800) highPriority := int32(900) + + nodeTaintKey := "hardware" + nodeTaintValue := "gpu" + + nodeLabelKey := "datacenter" + nodeLabelValue := "east" type testCase struct { pod *v1.Pod - runBefore func(*v1.Pod) + nodes []*v1.Node + runBefore func(*v1.Pod, []*v1.Node) evictLocalStoragePods bool evictSystemCriticalPods bool priorityThreshold *int32 + nodeFit bool result bool } testCases := []testCase{ { // Normal pod eviction with normal ownerRefs pod: test.BuildTestPod("p1", 400, 0, n1.Name, nil), - runBefore: func(pod *v1.Pod) { + runBefore: func(pod *v1.Pod, nodes []*v1.Node) { pod.ObjectMeta.OwnerReferences = test.GetNormalPodOwnerRefList() }, evictLocalStoragePods: false, @@ -93,7 +101,7 @@ func TestIsEvictable(t *testing.T) { result: true, }, { // Normal pod eviction with normal ownerRefs and descheduler.alpha.kubernetes.io/evict annotation pod: test.BuildTestPod("p2", 400, 0, n1.Name, nil), - runBefore: func(pod *v1.Pod) { + runBefore: func(pod *v1.Pod, nodes []*v1.Node) { pod.Annotations = map[string]string{"descheduler.alpha.kubernetes.io/evict": "true"} pod.ObjectMeta.OwnerReferences = test.GetNormalPodOwnerRefList() }, @@ -102,7 +110,7 @@ func TestIsEvictable(t *testing.T) { result: true, }, { // Normal pod eviction with replicaSet ownerRefs pod: test.BuildTestPod("p3", 400, 0, n1.Name, nil), - runBefore: func(pod *v1.Pod) { + runBefore: func(pod *v1.Pod, nodes []*v1.Node) { pod.ObjectMeta.OwnerReferences = test.GetReplicaSetOwnerRefList() }, evictLocalStoragePods: false, @@ -110,7 +118,7 @@ func TestIsEvictable(t *testing.T) { result: true, }, { // Normal pod eviction with replicaSet ownerRefs and 
descheduler.alpha.kubernetes.io/evict annotation pod: test.BuildTestPod("p4", 400, 0, n1.Name, nil), - runBefore: func(pod *v1.Pod) { + runBefore: func(pod *v1.Pod, nodes []*v1.Node) { pod.Annotations = map[string]string{"descheduler.alpha.kubernetes.io/evict": "true"} pod.ObjectMeta.OwnerReferences = test.GetReplicaSetOwnerRefList() }, @@ -119,7 +127,7 @@ func TestIsEvictable(t *testing.T) { result: true, }, { // Normal pod eviction with statefulSet ownerRefs pod: test.BuildTestPod("p18", 400, 0, n1.Name, nil), - runBefore: func(pod *v1.Pod) { + runBefore: func(pod *v1.Pod, nodes []*v1.Node) { pod.ObjectMeta.OwnerReferences = test.GetStatefulSetOwnerRefList() }, evictLocalStoragePods: false, @@ -127,7 +135,7 @@ func TestIsEvictable(t *testing.T) { result: true, }, { // Normal pod eviction with statefulSet ownerRefs and descheduler.alpha.kubernetes.io/evict annotation pod: test.BuildTestPod("p19", 400, 0, n1.Name, nil), - runBefore: func(pod *v1.Pod) { + runBefore: func(pod *v1.Pod, nodes []*v1.Node) { pod.Annotations = map[string]string{"descheduler.alpha.kubernetes.io/evict": "true"} pod.ObjectMeta.OwnerReferences = test.GetStatefulSetOwnerRefList() }, @@ -136,7 +144,7 @@ func TestIsEvictable(t *testing.T) { result: true, }, { // Pod not evicted because it is bound to a PV and evictLocalStoragePods = false pod: test.BuildTestPod("p5", 400, 0, n1.Name, nil), - runBefore: func(pod *v1.Pod) { + runBefore: func(pod *v1.Pod, nodes []*v1.Node) { pod.ObjectMeta.OwnerReferences = test.GetNormalPodOwnerRefList() pod.Spec.Volumes = []v1.Volume{ { @@ -154,7 +162,7 @@ func TestIsEvictable(t *testing.T) { result: false, }, { // Pod is evicted because it is bound to a PV and evictLocalStoragePods = true pod: test.BuildTestPod("p6", 400, 0, n1.Name, nil), - runBefore: func(pod *v1.Pod) { + runBefore: func(pod *v1.Pod, nodes []*v1.Node) { pod.ObjectMeta.OwnerReferences = test.GetNormalPodOwnerRefList() pod.Spec.Volumes = []v1.Volume{ { @@ -172,7 +180,7 @@ func TestIsEvictable(t *testing.T) { result: true, }, { // Pod is evicted because it is bound to a PV and evictLocalStoragePods = false, but it has scheduler.alpha.kubernetes.io/evict annotation pod: test.BuildTestPod("p7", 400, 0, n1.Name, nil), - runBefore: func(pod *v1.Pod) { + runBefore: func(pod *v1.Pod, nodes []*v1.Node) { pod.Annotations = map[string]string{"descheduler.alpha.kubernetes.io/evict": "true"} pod.ObjectMeta.OwnerReferences = test.GetNormalPodOwnerRefList() pod.Spec.Volumes = []v1.Volume{ @@ -191,7 +199,7 @@ func TestIsEvictable(t *testing.T) { result: true, }, { // Pod not evicted becasuse it is part of a daemonSet pod: test.BuildTestPod("p8", 400, 0, n1.Name, nil), - runBefore: func(pod *v1.Pod) { + runBefore: func(pod *v1.Pod, nodes []*v1.Node) { pod.ObjectMeta.OwnerReferences = test.GetDaemonSetOwnerRefList() }, evictLocalStoragePods: false, @@ -199,7 +207,7 @@ func TestIsEvictable(t *testing.T) { result: false, }, { // Pod is evicted becasuse it is part of a daemonSet, but it has scheduler.alpha.kubernetes.io/evict annotation pod: test.BuildTestPod("p9", 400, 0, n1.Name, nil), - runBefore: func(pod *v1.Pod) { + runBefore: func(pod *v1.Pod, nodes []*v1.Node) { pod.Annotations = map[string]string{"descheduler.alpha.kubernetes.io/evict": "true"} pod.ObjectMeta.OwnerReferences = test.GetDaemonSetOwnerRefList() }, @@ -208,7 +216,7 @@ func TestIsEvictable(t *testing.T) { result: true, }, { // Pod not evicted becasuse it is a mirror pod pod: test.BuildTestPod("p10", 400, 0, n1.Name, nil), - runBefore: func(pod *v1.Pod) { + runBefore: 
func(pod *v1.Pod, nodes []*v1.Node) { pod.ObjectMeta.OwnerReferences = test.GetNormalPodOwnerRefList() pod.Annotations = test.GetMirrorPodAnnotation() }, @@ -217,7 +225,7 @@ func TestIsEvictable(t *testing.T) { result: false, }, { // Pod is evicted becasuse it is a mirror pod, but it has scheduler.alpha.kubernetes.io/evict annotation pod: test.BuildTestPod("p11", 400, 0, n1.Name, nil), - runBefore: func(pod *v1.Pod) { + runBefore: func(pod *v1.Pod, nodes []*v1.Node) { pod.ObjectMeta.OwnerReferences = test.GetNormalPodOwnerRefList() pod.Annotations = test.GetMirrorPodAnnotation() pod.Annotations["descheduler.alpha.kubernetes.io/evict"] = "true" @@ -227,7 +235,7 @@ func TestIsEvictable(t *testing.T) { result: true, }, { // Pod not evicted becasuse it has system critical priority pod: test.BuildTestPod("p12", 400, 0, n1.Name, nil), - runBefore: func(pod *v1.Pod) { + runBefore: func(pod *v1.Pod, nodes []*v1.Node) { pod.ObjectMeta.OwnerReferences = test.GetNormalPodOwnerRefList() priority := utils.SystemCriticalPriority pod.Spec.Priority = &priority @@ -237,7 +245,7 @@ func TestIsEvictable(t *testing.T) { result: false, }, { // Pod is evicted becasuse it has system critical priority, but it has scheduler.alpha.kubernetes.io/evict annotation pod: test.BuildTestPod("p13", 400, 0, n1.Name, nil), - runBefore: func(pod *v1.Pod) { + runBefore: func(pod *v1.Pod, nodes []*v1.Node) { pod.ObjectMeta.OwnerReferences = test.GetNormalPodOwnerRefList() priority := utils.SystemCriticalPriority pod.Spec.Priority = &priority @@ -250,7 +258,7 @@ func TestIsEvictable(t *testing.T) { result: true, }, { // Pod not evicted becasuse it has a priority higher than the configured priority threshold pod: test.BuildTestPod("p14", 400, 0, n1.Name, nil), - runBefore: func(pod *v1.Pod) { + runBefore: func(pod *v1.Pod, nodes []*v1.Node) { pod.ObjectMeta.OwnerReferences = test.GetNormalPodOwnerRefList() pod.Spec.Priority = &highPriority }, @@ -260,7 +268,7 @@ func TestIsEvictable(t *testing.T) { result: false, }, { // Pod is evicted becasuse it has a priority higher than the configured priority threshold, but it has scheduler.alpha.kubernetes.io/evict annotation pod: test.BuildTestPod("p15", 400, 0, n1.Name, nil), - runBefore: func(pod *v1.Pod) { + runBefore: func(pod *v1.Pod, nodes []*v1.Node) { pod.ObjectMeta.OwnerReferences = test.GetNormalPodOwnerRefList() pod.Annotations = map[string]string{"descheduler.alpha.kubernetes.io/evict": "true"} pod.Spec.Priority = &highPriority @@ -271,7 +279,7 @@ func TestIsEvictable(t *testing.T) { result: true, }, { // Pod is evicted becasuse it has system critical priority, but evictSystemCriticalPods = true pod: test.BuildTestPod("p16", 400, 0, n1.Name, nil), - runBefore: func(pod *v1.Pod) { + runBefore: func(pod *v1.Pod, nodes []*v1.Node) { pod.ObjectMeta.OwnerReferences = test.GetNormalPodOwnerRefList() priority := utils.SystemCriticalPriority pod.Spec.Priority = &priority @@ -281,7 +289,7 @@ func TestIsEvictable(t *testing.T) { result: true, }, { // Pod is evicted becasuse it has system critical priority, but evictSystemCriticalPods = true and it has scheduler.alpha.kubernetes.io/evict annotation pod: test.BuildTestPod("p16", 400, 0, n1.Name, nil), - runBefore: func(pod *v1.Pod) { + runBefore: func(pod *v1.Pod, nodes []*v1.Node) { pod.ObjectMeta.OwnerReferences = test.GetNormalPodOwnerRefList() pod.Annotations = map[string]string{"descheduler.alpha.kubernetes.io/evict": "true"} priority := utils.SystemCriticalPriority @@ -292,7 +300,7 @@ func TestIsEvictable(t *testing.T) { result: 
true, }, { // Pod is evicted becasuse it has a priority higher than the configured priority threshold, but evictSystemCriticalPods = true pod: test.BuildTestPod("p17", 400, 0, n1.Name, nil), - runBefore: func(pod *v1.Pod) { + runBefore: func(pod *v1.Pod, nodes []*v1.Node) { pod.ObjectMeta.OwnerReferences = test.GetNormalPodOwnerRefList() pod.Spec.Priority = &highPriority }, @@ -302,7 +310,7 @@ func TestIsEvictable(t *testing.T) { result: true, }, { // Pod is evicted becasuse it has a priority higher than the configured priority threshold, but evictSystemCriticalPods = true and it has scheduler.alpha.kubernetes.io/evict annotation pod: test.BuildTestPod("p17", 400, 0, n1.Name, nil), - runBefore: func(pod *v1.Pod) { + runBefore: func(pod *v1.Pod, nodes []*v1.Node) { pod.ObjectMeta.OwnerReferences = test.GetNormalPodOwnerRefList() pod.Annotations = map[string]string{"descheduler.alpha.kubernetes.io/evict": "true"} pod.Spec.Priority = &highPriority @@ -311,21 +319,116 @@ func TestIsEvictable(t *testing.T) { evictSystemCriticalPods: true, priorityThreshold: &lowPriority, result: true, + }, { // Pod with no tolerations running on normal node, all other nodes tainted + pod: test.BuildTestPod("p1", 400, 0, n1.Name, nil), + nodes: []*v1.Node{test.BuildTestNode("node2", 1000, 2000, 13, nil), test.BuildTestNode("node3", 1000, 2000, 13, nil)}, + runBefore: func(pod *v1.Pod, nodes []*v1.Node) { + pod.ObjectMeta.OwnerReferences = test.GetNormalPodOwnerRefList() + + for _, node := range nodes { + node.Spec.Taints = []v1.Taint{ + { + Key: nodeTaintKey, + Value: nodeTaintValue, + Effect: v1.TaintEffectNoSchedule, + }, + } + } + }, + evictLocalStoragePods: false, + evictSystemCriticalPods: false, + nodeFit: true, + result: false, + }, { // Pod with correct tolerations running on normal node, all other nodes tainted + pod: test.BuildTestPod("p1", 400, 0, n1.Name, func(pod *v1.Pod) { + pod.Spec.Tolerations = []v1.Toleration{ + { + Key: nodeTaintKey, + Value: nodeTaintValue, + Effect: v1.TaintEffectNoSchedule, + }, + } + }), + nodes: []*v1.Node{test.BuildTestNode("node2", 1000, 2000, 13, nil), test.BuildTestNode("node3", 1000, 2000, 13, nil)}, + runBefore: func(pod *v1.Pod, nodes []*v1.Node) { + pod.ObjectMeta.OwnerReferences = test.GetNormalPodOwnerRefList() + + for _, node := range nodes { + node.Spec.Taints = []v1.Taint{ + { + Key: nodeTaintKey, + Value: nodeTaintValue, + Effect: v1.TaintEffectNoSchedule, + }, + } + } + }, + evictLocalStoragePods: false, + evictSystemCriticalPods: false, + nodeFit: true, + result: true, + }, { // Pod with incorrect node selector + pod: test.BuildTestPod("p1", 400, 0, n1.Name, func(pod *v1.Pod) { + pod.Spec.NodeSelector = map[string]string{ + nodeLabelKey: "fail", + } + }), + nodes: []*v1.Node{test.BuildTestNode("node2", 1000, 2000, 13, nil), test.BuildTestNode("node3", 1000, 2000, 13, nil)}, + runBefore: func(pod *v1.Pod, nodes []*v1.Node) { + pod.ObjectMeta.OwnerReferences = test.GetNormalPodOwnerRefList() + + for _, node := range nodes { + node.ObjectMeta.Labels = map[string]string{ + nodeLabelKey: nodeLabelValue, + } + } + }, + evictLocalStoragePods: false, + evictSystemCriticalPods: false, + nodeFit: true, + result: false, + }, { // Pod with correct node selector + pod: test.BuildTestPod("p1", 400, 0, n1.Name, func(pod *v1.Pod) { + pod.Spec.NodeSelector = map[string]string{ + nodeLabelKey: nodeLabelValue, + } + }), + nodes: []*v1.Node{test.BuildTestNode("node2", 1000, 2000, 13, nil), test.BuildTestNode("node3", 1000, 2000, 13, nil)}, + runBefore: func(pod *v1.Pod, nodes 
[]*v1.Node) { + pod.ObjectMeta.OwnerReferences = test.GetNormalPodOwnerRefList() + + for _, node := range nodes { + node.ObjectMeta.Labels = map[string]string{ + nodeLabelKey: nodeLabelValue, + } + } + }, + evictLocalStoragePods: false, + evictSystemCriticalPods: false, + nodeFit: true, + result: true, }, } for _, test := range testCases { - test.runBefore(test.pod) + test.runBefore(test.pod, test.nodes) + nodes := append(test.nodes, n1) podEvictor := &PodEvictor{ evictLocalStoragePods: test.evictLocalStoragePods, evictSystemCriticalPods: test.evictSystemCriticalPods, + nodes: nodes, } evictable := podEvictor.Evictable() + var opts []func(opts *Options) if test.priorityThreshold != nil { - evictable = podEvictor.Evictable(WithPriorityThreshold(*test.priorityThreshold)) + opts = append(opts, WithPriorityThreshold(*test.priorityThreshold)) + } + if test.nodeFit { + opts = append(opts, WithNodeFit(true)) } + evictable = podEvictor.Evictable(opts...) result := evictable.IsEvictable(test.pod) if result != test.result { diff --git a/pkg/descheduler/node/node.go b/pkg/descheduler/node/node.go index 10c3b6c22c..1c21e060c2 100644 --- a/pkg/descheduler/node/node.go +++ b/pkg/descheduler/node/node.go @@ -96,7 +96,38 @@ func IsReady(node *v1.Node) bool { return true } -// IsNodeUnschedulable checks if the node is unschedulable. This is helper function to check only in case of +// PodFitsAnyOtherNode checks if the given pod fits any of the given nodes, besides the node +// the pod is already running on. The node fit is based on multiple criteria, like, pod node selector +// matching the node label (including affinity), the taints on the node, and the node being schedulable or not. +func PodFitsAnyOtherNode(pod *v1.Pod, nodes []*v1.Node) bool { + + for _, node := range nodes { + // Skip node pod is already on + if node.Name == pod.Spec.NodeName { + continue + } + // Check node selector and required affinity + ok, err := utils.PodMatchNodeSelector(pod, node) + if err != nil || !ok { + continue + } + // Check taints (we only care about NoSchedule and NoExecute taints) + ok = utils.TolerationsTolerateTaintsWithFilter(pod.Spec.Tolerations, node.Spec.Taints, func(taint *v1.Taint) bool { + return taint.Effect == v1.TaintEffectNoSchedule || taint.Effect == v1.TaintEffectNoExecute + }) + if !ok { + continue + } + // Check if node is schedulable + if !IsNodeUnschedulable(node) { + klog.V(2).InfoS("Pod can possibly be scheduled on a different node", "pod", klog.KObj(pod), "node", klog.KObj(node)) + return true + } + } + return false +} + +// IsNodeUnschedulable checks if the node is unschedulable. This is a helper function to check only in case of // underutilized node so that they won't be accounted for. func IsNodeUnschedulable(node *v1.Node) bool { return node.Spec.Unschedulable diff --git a/pkg/descheduler/node/node_test.go b/pkg/descheduler/node/node_test.go index 61c91e820a..fc73c88c4d 100644 --- a/pkg/descheduler/node/node_test.go +++ b/pkg/descheduler/node/node_test.go @@ -200,10 +200,15 @@ func TestPodFitsCurrentNode(t *testing.T) { } } -func TestPodFitsAnyNode(t *testing.T) { +func TestPodFitsAnyOtherNode(t *testing.T) { nodeLabelKey := "kubernetes.io/desiredNode" nodeLabelValue := "yes" + nodeTaintKey := "hardware" + nodeTaintValue := "gpu" + + // Staging node has no scheduling restrictions, but the pod always starts here and PodFitsAnyOtherNode() doesn't take into account the node the pod is running on. 
+ nodeNames := []string{"node1", "node2", "stagingNode"} tests := []struct { description string @@ -212,33 +217,12 @@ func TestPodFitsAnyNode(t *testing.T) { success bool }{ { - description: "Pod expected to fit one of the nodes", - pod: &v1.Pod{ - Spec: v1.PodSpec{ - Affinity: &v1.Affinity{ - NodeAffinity: &v1.NodeAffinity{ - RequiredDuringSchedulingIgnoredDuringExecution: &v1.NodeSelector{ - NodeSelectorTerms: []v1.NodeSelectorTerm{ - { - MatchExpressions: []v1.NodeSelectorRequirement{ - { - Key: nodeLabelKey, - Operator: "In", - Values: []string{ - nodeLabelValue, - }, - }, - }, - }, - }, - }, - }, - }, - }, - }, + description: "Pod fits another node matching node affinity", + pod: createPodManifest(nodeNames[2], nodeLabelKey, nodeLabelValue), nodes: []*v1.Node{ { ObjectMeta: metav1.ObjectMeta{ + Name: nodeNames[0], Labels: map[string]string{ nodeLabelKey: nodeLabelValue, }, @@ -246,42 +230,27 @@ func TestPodFitsAnyNode(t *testing.T) { }, { ObjectMeta: metav1.ObjectMeta{ + Name: nodeNames[1], Labels: map[string]string{ nodeLabelKey: "no", }, }, }, + { + ObjectMeta: metav1.ObjectMeta{ + Name: nodeNames[2], + }, + }, }, success: true, }, { - description: "Pod expected to fit one of the nodes (error node first)", - pod: &v1.Pod{ - Spec: v1.PodSpec{ - Affinity: &v1.Affinity{ - NodeAffinity: &v1.NodeAffinity{ - RequiredDuringSchedulingIgnoredDuringExecution: &v1.NodeSelector{ - NodeSelectorTerms: []v1.NodeSelectorTerm{ - { - MatchExpressions: []v1.NodeSelectorRequirement{ - { - Key: nodeLabelKey, - Operator: "In", - Values: []string{ - nodeLabelValue, - }, - }, - }, - }, - }, - }, - }, - }, - }, - }, + description: "Pod expected to fit one of the nodes", + pod: createPodManifest(nodeNames[2], nodeLabelKey, nodeLabelValue), nodes: []*v1.Node{ { ObjectMeta: metav1.ObjectMeta{ + Name: nodeNames[0], Labels: map[string]string{ nodeLabelKey: "no", }, @@ -289,109 +258,195 @@ func TestPodFitsAnyNode(t *testing.T) { }, { ObjectMeta: metav1.ObjectMeta{ + Name: nodeNames[1], Labels: map[string]string{ nodeLabelKey: nodeLabelValue, }, }, }, + { + ObjectMeta: metav1.ObjectMeta{ + Name: nodeNames[2], + }, + }, }, success: true, }, { description: "Pod expected to fit none of the nodes", - pod: &v1.Pod{ - Spec: v1.PodSpec{ - Affinity: &v1.Affinity{ - NodeAffinity: &v1.NodeAffinity{ - RequiredDuringSchedulingIgnoredDuringExecution: &v1.NodeSelector{ - NodeSelectorTerms: []v1.NodeSelectorTerm{ - { - MatchExpressions: []v1.NodeSelectorRequirement{ - { - Key: nodeLabelKey, - Operator: "In", - Values: []string{ - nodeLabelValue, - }, - }, - }, - }, - }, - }, + pod: createPodManifest(nodeNames[2], nodeLabelKey, nodeLabelValue), + nodes: []*v1.Node{ + { + ObjectMeta: metav1.ObjectMeta{ + Name: nodeNames[0], + Labels: map[string]string{ + nodeLabelKey: "unfit1", }, }, }, + { + ObjectMeta: metav1.ObjectMeta{ + Name: nodeNames[1], + Labels: map[string]string{ + nodeLabelKey: "unfit2", + }, + }, + }, + { + ObjectMeta: metav1.ObjectMeta{ + Name: nodeNames[2], + }, + }, }, + success: false, + }, + { + description: "Nodes are unschedulable but labels match, should fail", + pod: createPodManifest(nodeNames[2], nodeLabelKey, nodeLabelValue), nodes: []*v1.Node{ { ObjectMeta: metav1.ObjectMeta{ + Name: nodeNames[0], Labels: map[string]string{ - nodeLabelKey: "unfit1", + nodeLabelKey: nodeLabelValue, }, }, + Spec: v1.NodeSpec{ + Unschedulable: true, + }, }, { ObjectMeta: metav1.ObjectMeta{ + Name: nodeNames[1], Labels: map[string]string{ - nodeLabelKey: "unfit2", + nodeLabelKey: "no", }, }, }, + { + ObjectMeta: 
metav1.ObjectMeta{ + Name: nodeNames[2], + }, + }, }, success: false, }, { - description: "Nodes are unschedulable but labels match, should fail", - pod: &v1.Pod{ - Spec: v1.PodSpec{ - Affinity: &v1.Affinity{ - NodeAffinity: &v1.NodeAffinity{ - RequiredDuringSchedulingIgnoredDuringExecution: &v1.NodeSelector{ - NodeSelectorTerms: []v1.NodeSelectorTerm{ - { - MatchExpressions: []v1.NodeSelectorRequirement{ - { - Key: nodeLabelKey, - Operator: "In", - Values: []string{ - nodeLabelValue, - }, - }, - }, - }, - }, + description: "Two nodes matches node selector, one of them is tained, should pass", + pod: createPodManifest(nodeNames[2], nodeLabelKey, nodeLabelValue), + nodes: []*v1.Node{ + { + ObjectMeta: metav1.ObjectMeta{ + Name: nodeNames[0], + Labels: map[string]string{ + nodeLabelKey: nodeLabelValue, + }, + }, + Spec: v1.NodeSpec{ + Taints: []v1.Taint{ + { + Key: nodeTaintKey, + Value: nodeTaintValue, + Effect: v1.TaintEffectNoSchedule, }, }, }, }, + { + ObjectMeta: metav1.ObjectMeta{ + Name: nodeNames[1], + Labels: map[string]string{ + nodeLabelKey: nodeLabelValue, + }, + }, + }, + { + ObjectMeta: metav1.ObjectMeta{ + Name: nodeNames[2], + }, + }, }, + success: true, + }, + { + description: "Both nodes are tained, should fail", + pod: createPodManifest(nodeNames[2], nodeLabelKey, nodeLabelValue), nodes: []*v1.Node{ { ObjectMeta: metav1.ObjectMeta{ + Name: nodeNames[0], Labels: map[string]string{ nodeLabelKey: nodeLabelValue, }, }, Spec: v1.NodeSpec{ - Unschedulable: true, + Taints: []v1.Taint{ + { + Key: nodeTaintKey, + Value: nodeTaintValue, + Effect: v1.TaintEffectNoSchedule, + }, + }, }, }, { ObjectMeta: metav1.ObjectMeta{ + Name: nodeNames[1], Labels: map[string]string{ - nodeLabelKey: "no", + nodeLabelKey: nodeLabelValue, + }, + }, + Spec: v1.NodeSpec{ + Taints: []v1.Taint{ + { + Key: nodeTaintKey, + Value: nodeTaintValue, + Effect: v1.TaintEffectNoExecute, + }, }, }, }, + { + ObjectMeta: metav1.ObjectMeta{ + Name: nodeNames[2], + }, + }, }, success: false, }, } for _, tc := range tests { - actual := PodFitsAnyNode(tc.pod, tc.nodes) + actual := PodFitsAnyOtherNode(tc.pod, tc.nodes) if actual != tc.success { t.Errorf("Test %#v failed", tc.description) } } } + +func createPodManifest(nodeName string, nodeSelectorKey string, nodeSelectorValue string) *v1.Pod { + return (&v1.Pod{ + Spec: v1.PodSpec{ + NodeName: nodeName, + Affinity: &v1.Affinity{ + NodeAffinity: &v1.NodeAffinity{ + RequiredDuringSchedulingIgnoredDuringExecution: &v1.NodeSelector{ + NodeSelectorTerms: []v1.NodeSelectorTerm{ + { + MatchExpressions: []v1.NodeSelectorRequirement{ + { + Key: nodeSelectorKey, + Operator: "In", + Values: []string{ + nodeSelectorValue, + }, + }, + }, + }, + }, + }, + }, + }, + }, + }) +} diff --git a/pkg/descheduler/strategies/duplicates.go b/pkg/descheduler/strategies/duplicates.go index 380979d6d1..fe6169b0a5 100644 --- a/pkg/descheduler/strategies/duplicates.go +++ b/pkg/descheduler/strategies/duplicates.go @@ -83,7 +83,12 @@ func RemoveDuplicatePods( excludedNamespaces = strategy.Params.Namespaces.Exclude } - evictable := podEvictor.Evictable(evictions.WithPriorityThreshold(thresholdPriority)) + nodeFit := false + if strategy.Params != nil { + nodeFit = strategy.Params.NodeFit + } + + evictable := podEvictor.Evictable(evictions.WithPriorityThreshold(thresholdPriority), evictions.WithNodeFit(nodeFit)) duplicatePods := make(map[podOwner]map[string][]*v1.Pod) ownerKeyOccurence := make(map[podOwner]int32) diff --git a/pkg/descheduler/strategies/duplicates_test.go 
b/pkg/descheduler/strategies/duplicates_test.go index fe8d4dab95..fe38936bdb 100644 --- a/pkg/descheduler/strategies/duplicates_test.go +++ b/pkg/descheduler/strategies/duplicates_test.go @@ -47,6 +47,25 @@ func TestFindDuplicatePods(t *testing.T) { // first setup pods node1 := test.BuildTestNode("n1", 2000, 3000, 10, nil) node2 := test.BuildTestNode("n2", 2000, 3000, 10, nil) + node3 := test.BuildTestNode("n3", 2000, 3000, 10, func(node *v1.Node) { + node.Spec.Taints = []v1.Taint{ + { + Key: "hardware", + Value: "gpu", + Effect: v1.TaintEffectNoSchedule, + }, + } + }) + node4 := test.BuildTestNode("n4", 2000, 3000, 10, func(node *v1.Node) { + node.ObjectMeta.Labels = map[string]string{ + "datacenter": "east", + } + }) + node5 := test.BuildTestNode("n5", 2000, 3000, 10, func(node *v1.Node) { + node.Spec = v1.NodeSpec{ + Unschedulable: true, + } + }) p1 := test.BuildTestPod("p1", 100, 0, node1.Name, nil) p1.Namespace = "dev" @@ -73,6 +92,14 @@ func TestFindDuplicatePods(t *testing.T) { p13.Namespace = "different-images" p14 := test.BuildTestPod("p14", 100, 0, node1.Name, nil) p14.Namespace = "different-images" + p15 := test.BuildTestPod("p15", 100, 0, node1.Name, nil) + p15.Namespace = "node-fit" + p16 := test.BuildTestPod("NOT1", 100, 0, node1.Name, nil) + p16.Namespace = "node-fit" + p17 := test.BuildTestPod("NOT2", 100, 0, node1.Name, nil) + p17.Namespace = "node-fit" + p18 := test.BuildTestPod("TARGET", 100, 0, node1.Name, nil) + p18.Namespace = "node-fit" // ### Evictable Pods ### @@ -126,10 +153,29 @@ func TestFindDuplicatePods(t *testing.T) { Image: "foo", }) + // ### Pods Evictable Based On Node Fit ### + + ownerRef3 := test.GetReplicaSetOwnerRefList() + p15.ObjectMeta.OwnerReferences = ownerRef3 + p16.ObjectMeta.OwnerReferences = ownerRef3 + p17.ObjectMeta.OwnerReferences = ownerRef3 + p18.ObjectMeta.OwnerReferences = ownerRef3 + + p15.Spec.NodeSelector = map[string]string{ + "datacenter": "west", + } + p16.Spec.NodeSelector = map[string]string{ + "datacenter": "west", + } + p17.Spec.NodeSelector = map[string]string{ + "datacenter": "west", + } + testCases := []struct { description string maxPodsToEvictPerNode int pods []v1.Pod + nodes []*v1.Node expectedEvictedPodCount int strategy api.DeschedulerStrategy }{ @@ -137,6 +183,7 @@ func TestFindDuplicatePods(t *testing.T) { description: "Three pods in the `dev` Namespace, bound to same ReplicaSet. 1 should be evicted.", maxPodsToEvictPerNode: 5, pods: []v1.Pod{*p1, *p2, *p3}, + nodes: []*v1.Node{node1, node2}, expectedEvictedPodCount: 1, strategy: api.DeschedulerStrategy{}, }, @@ -144,6 +191,7 @@ func TestFindDuplicatePods(t *testing.T) { description: "Three pods in the `dev` Namespace, bound to same ReplicaSet, but ReplicaSet kind is excluded. 0 should be evicted.", maxPodsToEvictPerNode: 5, pods: []v1.Pod{*p1, *p2, *p3}, + nodes: []*v1.Node{node1, node2}, expectedEvictedPodCount: 0, strategy: api.DeschedulerStrategy{Params: &api.StrategyParameters{RemoveDuplicates: &api.RemoveDuplicates{ExcludeOwnerKinds: []string{"ReplicaSet"}}}}, }, @@ -151,6 +199,7 @@ func TestFindDuplicatePods(t *testing.T) { description: "Three Pods in the `test` Namespace, bound to same ReplicaSet. 1 should be evicted.", maxPodsToEvictPerNode: 5, pods: []v1.Pod{*p8, *p9, *p10}, + nodes: []*v1.Node{node1, node2}, expectedEvictedPodCount: 1, strategy: api.DeschedulerStrategy{}, }, @@ -158,6 +207,7 @@ func TestFindDuplicatePods(t *testing.T) { description: "Three Pods in the `dev` Namespace, three Pods in the `test` Namespace. Bound to ReplicaSet with same name. 
4 should be evicted.", maxPodsToEvictPerNode: 5, pods: []v1.Pod{*p1, *p2, *p3, *p8, *p9, *p10}, + nodes: []*v1.Node{node1, node2}, expectedEvictedPodCount: 2, strategy: api.DeschedulerStrategy{}, }, @@ -165,6 +215,7 @@ func TestFindDuplicatePods(t *testing.T) { description: "Pods are: part of DaemonSet, with local storage, mirror pod annotation, critical pod annotation - none should be evicted.", maxPodsToEvictPerNode: 2, pods: []v1.Pod{*p4, *p5, *p6, *p7}, + nodes: []*v1.Node{node1, node2}, expectedEvictedPodCount: 0, strategy: api.DeschedulerStrategy{}, }, @@ -172,6 +223,7 @@ func TestFindDuplicatePods(t *testing.T) { description: "Test all Pods: 4 should be evicted.", maxPodsToEvictPerNode: 5, pods: []v1.Pod{*p1, *p2, *p3, *p4, *p5, *p6, *p7, *p8, *p9, *p10}, + nodes: []*v1.Node{node1, node2}, expectedEvictedPodCount: 2, strategy: api.DeschedulerStrategy{}, }, @@ -179,6 +231,7 @@ func TestFindDuplicatePods(t *testing.T) { description: "Pods with the same owner but different images should not be evicted", maxPodsToEvictPerNode: 5, pods: []v1.Pod{*p11, *p12}, + nodes: []*v1.Node{node1, node2}, expectedEvictedPodCount: 0, strategy: api.DeschedulerStrategy{}, }, @@ -186,6 +239,7 @@ func TestFindDuplicatePods(t *testing.T) { description: "Pods with multiple containers should not match themselves", maxPodsToEvictPerNode: 5, pods: []v1.Pod{*p13}, + nodes: []*v1.Node{node1, node2}, expectedEvictedPodCount: 0, strategy: api.DeschedulerStrategy{}, }, @@ -193,9 +247,34 @@ func TestFindDuplicatePods(t *testing.T) { description: "Pods with matching ownerrefs and at not all matching image should not trigger an eviction", maxPodsToEvictPerNode: 5, pods: []v1.Pod{*p11, *p13}, + nodes: []*v1.Node{node1, node2}, expectedEvictedPodCount: 0, strategy: api.DeschedulerStrategy{}, }, + { + description: "Three pods in the `dev` Namespace, bound to same ReplicaSet. Only node available has a taint, and nodeFit set to true. 0 should be evicted.", + maxPodsToEvictPerNode: 5, + pods: []v1.Pod{*p1, *p2, *p3}, + nodes: []*v1.Node{node1, node3}, + expectedEvictedPodCount: 0, + strategy: api.DeschedulerStrategy{Params: &api.StrategyParameters{NodeFit: true}}, + }, + { + description: "Three pods in the `node-fit` Namespace, bound to same ReplicaSet, all with a nodeSelector. Only node available has an incorrect node label, and nodeFit set to true. 0 should be evicted.", + maxPodsToEvictPerNode: 5, + pods: []v1.Pod{*p15, *p16, *p17}, + nodes: []*v1.Node{node1, node4}, + expectedEvictedPodCount: 0, + strategy: api.DeschedulerStrategy{Params: &api.StrategyParameters{NodeFit: true}}, + }, + { + description: "Three pods in the `node-fit` Namespace, bound to same ReplicaSet. Only node available is not schedulable, and nodeFit set to true. 0 should be evicted.", + maxPodsToEvictPerNode: 5, + pods: []v1.Pod{*p1, *p2, *p3}, + nodes: []*v1.Node{node1, node5}, + expectedEvictedPodCount: 0, + strategy: api.DeschedulerStrategy{Params: &api.StrategyParameters{NodeFit: true}}, + }, } for _, testCase := range testCases { @@ -209,13 +288,13 @@ func TestFindDuplicatePods(t *testing.T) { "v1", false, testCase.maxPodsToEvictPerNode, - []*v1.Node{node1, node2}, + testCase.nodes, false, false, false, ) - RemoveDuplicatePods(ctx, fakeClient, testCase.strategy, []*v1.Node{node1, node2}, podEvictor) + RemoveDuplicatePods(ctx, fakeClient, testCase.strategy, testCase.nodes, podEvictor) podsEvicted := podEvictor.TotalEvicted() if podsEvicted != testCase.expectedEvictedPodCount { t.Errorf("Test error for description: %s. 
Expected evicted pods count %v, got %v", testCase.description, testCase.expectedEvictedPodCount, podsEvicted) diff --git a/pkg/descheduler/strategies/lownodeutilization.go b/pkg/descheduler/strategies/lownodeutilization.go index f8b751a990..d53c2b9314 100644 --- a/pkg/descheduler/strategies/lownodeutilization.go +++ b/pkg/descheduler/strategies/lownodeutilization.go @@ -78,6 +78,11 @@ func LowNodeUtilization(ctx context.Context, client clientset.Interface, strateg return } + nodeFit := false + if strategy.Params != nil { + nodeFit = strategy.Params.NodeFit + } + thresholds := strategy.Params.NodeResourceUtilizationThresholds.Thresholds targetThresholds := strategy.Params.NodeResourceUtilizationThresholds.TargetThresholds if err := validateStrategyConfig(thresholds, targetThresholds); err != nil { @@ -162,7 +167,7 @@ func LowNodeUtilization(ctx context.Context, client clientset.Interface, strateg return } - evictable := podEvictor.Evictable(evictions.WithPriorityThreshold(thresholdPriority)) + evictable := podEvictor.Evictable(evictions.WithPriorityThreshold(thresholdPriority), evictions.WithNodeFit(nodeFit)) evictPodsFromTargetNodes( ctx, diff --git a/pkg/descheduler/strategies/lownodeutilization_test.go b/pkg/descheduler/strategies/lownodeutilization_test.go index 01ba7d7c03..8accdc3b47 100644 --- a/pkg/descheduler/strategies/lownodeutilization_test.go +++ b/pkg/descheduler/strategies/lownodeutilization_test.go @@ -48,6 +48,10 @@ func TestLowNodeUtilization(t *testing.T) { n2NodeName := "n2" n3NodeName := "n3" + nodeSelectorKey := "datacenter" + nodeSelectorValue := "west" + notMatchingNodeSelectorValue := "east" + testCases := []struct { name string thresholds, targetThresholds api.ResourceThresholds @@ -514,6 +518,227 @@ func TestLowNodeUtilization(t *testing.T) { // 0 pods available for eviction because there's no enough extended resource in node2 expectedPodsEvicted: 0, }, + { + name: "without priorities, but only other node is unschedulable", + thresholds: api.ResourceThresholds{ + v1.ResourceCPU: 30, + v1.ResourcePods: 30, + }, + targetThresholds: api.ResourceThresholds{ + v1.ResourceCPU: 50, + v1.ResourcePods: 50, + }, + nodes: map[string]*v1.Node{ + n1NodeName: test.BuildTestNode(n1NodeName, 4000, 3000, 9, nil), + n2NodeName: test.BuildTestNode(n2NodeName, 4000, 3000, 10, test.SetNodeUnschedulable), + }, + pods: map[string]*v1.PodList{ + n1NodeName: { + Items: []v1.Pod{ + *test.BuildTestPod("p1", 400, 0, n1NodeName, test.SetRSOwnerRef), + *test.BuildTestPod("p2", 400, 0, n1NodeName, test.SetRSOwnerRef), + *test.BuildTestPod("p3", 400, 0, n1NodeName, test.SetRSOwnerRef), + *test.BuildTestPod("p4", 400, 0, n1NodeName, test.SetRSOwnerRef), + *test.BuildTestPod("p5", 400, 0, n1NodeName, test.SetRSOwnerRef), + // These won't be evicted. + *test.BuildTestPod("p6", 400, 0, n1NodeName, test.SetDSOwnerRef), + *test.BuildTestPod("p7", 400, 0, n1NodeName, func(pod *v1.Pod) { + // A pod with local storage. + test.SetNormalOwnerRef(pod) + pod.Spec.Volumes = []v1.Volume{ + { + Name: "sample", + VolumeSource: v1.VolumeSource{ + HostPath: &v1.HostPathVolumeSource{Path: "somePath"}, + EmptyDir: &v1.EmptyDirVolumeSource{ + SizeLimit: resource.NewQuantity(int64(10), resource.BinarySI)}, + }, + }, + } + // A Mirror Pod. + pod.Annotations = test.GetMirrorPodAnnotation() + }), + *test.BuildTestPod("p8", 400, 0, n1NodeName, func(pod *v1.Pod) { + // A Critical Pod. 
+ pod.Namespace = "kube-system" + priority := utils.SystemCriticalPriority + pod.Spec.Priority = &priority + }), + }, + }, + n2NodeName: {}, + }, + maxPodsToEvictPerNode: 0, + expectedPodsEvicted: 0, + }, + { + name: "without priorities, but only other node doesn't match pod node selector for p4 and p5", + thresholds: api.ResourceThresholds{ + v1.ResourceCPU: 30, + v1.ResourcePods: 30, + }, + targetThresholds: api.ResourceThresholds{ + v1.ResourceCPU: 50, + v1.ResourcePods: 50, + }, + nodes: map[string]*v1.Node{ + n1NodeName: test.BuildTestNode(n1NodeName, 4000, 3000, 9, nil), + n2NodeName: test.BuildTestNode(n2NodeName, 4000, 3000, 10, func(node *v1.Node) { + node.ObjectMeta.Labels = map[string]string{ + nodeSelectorKey: notMatchingNodeSelectorValue, + } + }), + }, + pods: map[string]*v1.PodList{ + n1NodeName: { + Items: []v1.Pod{ + *test.BuildTestPod("p1", 400, 0, n1NodeName, test.SetRSOwnerRef), + *test.BuildTestPod("p2", 400, 0, n1NodeName, test.SetRSOwnerRef), + *test.BuildTestPod("p3", 400, 0, n1NodeName, test.SetRSOwnerRef), + // These won't be evicted. + *test.BuildTestPod("p4", 400, 0, n1NodeName, func(pod *v1.Pod) { + // A pod selecting nodes in the "west" datacenter + test.SetNormalOwnerRef(pod) + pod.Spec.NodeSelector = map[string]string{ + nodeSelectorKey: nodeSelectorValue, + } + }), + *test.BuildTestPod("p5", 400, 0, n1NodeName, func(pod *v1.Pod) { + // A pod selecting nodes in the "west" datacenter + test.SetNormalOwnerRef(pod) + pod.Spec.NodeSelector = map[string]string{ + nodeSelectorKey: nodeSelectorValue, + } + }), + *test.BuildTestPod("p6", 400, 0, n1NodeName, test.SetDSOwnerRef), + *test.BuildTestPod("p7", 400, 0, n1NodeName, func(pod *v1.Pod) { + // A pod with local storage. + test.SetNormalOwnerRef(pod) + pod.Spec.Volumes = []v1.Volume{ + { + Name: "sample", + VolumeSource: v1.VolumeSource{ + HostPath: &v1.HostPathVolumeSource{Path: "somePath"}, + EmptyDir: &v1.EmptyDirVolumeSource{ + SizeLimit: resource.NewQuantity(int64(10), resource.BinarySI)}, + }, + }, + } + // A Mirror Pod. + pod.Annotations = test.GetMirrorPodAnnotation() + }), + *test.BuildTestPod("p8", 400, 0, n1NodeName, func(pod *v1.Pod) { + // A Critical Pod. + pod.Namespace = "kube-system" + priority := utils.SystemCriticalPriority + pod.Spec.Priority = &priority + }), + }, + }, + n2NodeName: {}, + }, + maxPodsToEvictPerNode: 0, + expectedPodsEvicted: 3, + }, + { + name: "without priorities, but only other node doesn't match pod node affinity for p4 and p5", + thresholds: api.ResourceThresholds{ + v1.ResourceCPU: 30, + v1.ResourcePods: 30, + }, + targetThresholds: api.ResourceThresholds{ + v1.ResourceCPU: 50, + v1.ResourcePods: 50, + }, + nodes: map[string]*v1.Node{ + n1NodeName: test.BuildTestNode(n1NodeName, 4000, 3000, 9, nil), + n2NodeName: test.BuildTestNode(n2NodeName, 4000, 3000, 10, func(node *v1.Node) { + node.ObjectMeta.Labels = map[string]string{ + nodeSelectorKey: notMatchingNodeSelectorValue, + } + }), + }, + pods: map[string]*v1.PodList{ + n1NodeName: { + Items: []v1.Pod{ + *test.BuildTestPod("p1", 400, 0, n1NodeName, test.SetRSOwnerRef), + *test.BuildTestPod("p2", 400, 0, n1NodeName, test.SetRSOwnerRef), + *test.BuildTestPod("p3", 400, 0, n1NodeName, test.SetRSOwnerRef), + // These won't be evicted. 
+ *test.BuildTestPod("p4", 400, 0, n1NodeName, func(pod *v1.Pod) { + // A pod with affinity to run in the "west" datacenter upon scheduling + test.SetNormalOwnerRef(pod) + pod.Spec.Affinity = &v1.Affinity{ + NodeAffinity: &v1.NodeAffinity{ + RequiredDuringSchedulingIgnoredDuringExecution: &v1.NodeSelector{ + NodeSelectorTerms: []v1.NodeSelectorTerm{ + { + MatchExpressions: []v1.NodeSelectorRequirement{ + { + Key: nodeSelectorKey, + Operator: "In", + Values: []string{nodeSelectorValue}, + }, + }, + }, + }, + }, + }, + } + }), + *test.BuildTestPod("p5", 400, 0, n1NodeName, func(pod *v1.Pod) { + // A pod with affinity to run in the "west" datacenter upon scheduling + test.SetNormalOwnerRef(pod) + pod.Spec.Affinity = &v1.Affinity{ + NodeAffinity: &v1.NodeAffinity{ + RequiredDuringSchedulingIgnoredDuringExecution: &v1.NodeSelector{ + NodeSelectorTerms: []v1.NodeSelectorTerm{ + { + MatchExpressions: []v1.NodeSelectorRequirement{ + { + Key: nodeSelectorKey, + Operator: "In", + Values: []string{nodeSelectorValue}, + }, + }, + }, + }, + }, + }, + } + }), + *test.BuildTestPod("p6", 400, 0, n1NodeName, test.SetDSOwnerRef), + *test.BuildTestPod("p7", 400, 0, n1NodeName, func(pod *v1.Pod) { + // A pod with local storage. + test.SetNormalOwnerRef(pod) + pod.Spec.Volumes = []v1.Volume{ + { + Name: "sample", + VolumeSource: v1.VolumeSource{ + HostPath: &v1.HostPathVolumeSource{Path: "somePath"}, + EmptyDir: &v1.EmptyDirVolumeSource{ + SizeLimit: resource.NewQuantity(int64(10), resource.BinarySI)}, + }, + }, + } + // A Mirror Pod. + pod.Annotations = test.GetMirrorPodAnnotation() + }), + *test.BuildTestPod("p8", 400, 0, n1NodeName, func(pod *v1.Pod) { + // A Critical Pod. + pod.Namespace = "kube-system" + priority := utils.SystemCriticalPriority + pod.Spec.Priority = &priority + }), + }, + }, + n2NodeName: {Items: []v1.Pod{ + *test.BuildTestPod("p9", 0, 0, n2NodeName, test.SetRSOwnerRef), + }}, + }, + maxPodsToEvictPerNode: 0, + expectedPodsEvicted: 3, + }, } for _, test := range testCases { @@ -584,6 +809,7 @@ func TestLowNodeUtilization(t *testing.T) { Thresholds: test.thresholds, TargetThresholds: test.targetThresholds, }, + NodeFit: true, }, } LowNodeUtilization(ctx, fakeClient, strategy, nodes, podEvictor) @@ -854,6 +1080,7 @@ func TestWithTaints(t *testing.T) { v1.ResourcePods: 70, }, }, + NodeFit: true, }, } diff --git a/pkg/descheduler/strategies/node_affinity.go b/pkg/descheduler/strategies/node_affinity.go index 7379bc85f7..cd21b8ce36 100644 --- a/pkg/descheduler/strategies/node_affinity.go +++ b/pkg/descheduler/strategies/node_affinity.go @@ -64,7 +64,12 @@ func RemovePodsViolatingNodeAffinity(ctx context.Context, client clientset.Inter excludedNamespaces = strategy.Params.Namespaces.Exclude } - evictable := podEvictor.Evictable(evictions.WithPriorityThreshold(thresholdPriority)) + nodeFit := false + if strategy.Params != nil { + nodeFit = strategy.Params.NodeFit + } + + evictable := podEvictor.Evictable(evictions.WithPriorityThreshold(thresholdPriority), evictions.WithNodeFit(nodeFit)) for _, nodeAffinity := range strategy.Params.NodeAffinityType { klog.V(2).InfoS("Executing for nodeAffinityType", "nodeAffinity", nodeAffinity) diff --git a/pkg/descheduler/strategies/node_affinity_test.go b/pkg/descheduler/strategies/node_affinity_test.go index c9b85d33aa..086a5d8e20 100644 --- a/pkg/descheduler/strategies/node_affinity_test.go +++ b/pkg/descheduler/strategies/node_affinity_test.go @@ -41,6 +41,16 @@ func TestRemovePodsViolatingNodeAffinity(t *testing.T) { }, } + 
requiredDuringSchedulingIgnoredDuringExecutionWithNodeFitStrategy := api.DeschedulerStrategy{ + Enabled: true, + Params: &api.StrategyParameters{ + NodeAffinityType: []string{ + "requiredDuringSchedulingIgnoredDuringExecution", + }, + NodeFit: true, + }, + } + nodeLabelKey := "kubernetes.io/desiredNode" nodeLabelValue := "yes" nodeWithLabels := test.BuildTestNode("nodeWithLabels", 2000, 3000, 10, nil) @@ -138,11 +148,19 @@ func TestRemovePodsViolatingNodeAffinity(t *testing.T) { { description: "Pod is scheduled on node without matching labels, but no node where pod fits is available, should not evict", expectedEvictedPodCount: 0, - strategy: requiredDuringSchedulingIgnoredDuringExecutionStrategy, + strategy: requiredDuringSchedulingIgnoredDuringExecutionWithNodeFitStrategy, pods: addPodsToNode(nodeWithoutLabels), nodes: []*v1.Node{nodeWithoutLabels, unschedulableNodeWithLabels}, maxPodsToEvictPerNode: 0, }, + { + description: "Pod is scheduled on node without matching labels, and node where pod fits is available, should evict", + expectedEvictedPodCount: 0, + strategy: requiredDuringSchedulingIgnoredDuringExecutionWithNodeFitStrategy, + pods: addPodsToNode(nodeWithoutLabels), + nodes: []*v1.Node{nodeWithLabels, unschedulableNodeWithLabels}, + maxPodsToEvictPerNode: 1, + }, } for _, tc := range tests { diff --git a/pkg/descheduler/strategies/node_taint.go b/pkg/descheduler/strategies/node_taint.go index 18142f2dad..f2310b514e 100644 --- a/pkg/descheduler/strategies/node_taint.go +++ b/pkg/descheduler/strategies/node_taint.go @@ -70,7 +70,12 @@ func RemovePodsViolatingNodeTaints(ctx context.Context, client clientset.Interfa return } - evictable := podEvictor.Evictable(evictions.WithPriorityThreshold(thresholdPriority)) + nodeFit := false + if strategy.Params != nil { + nodeFit = strategy.Params.NodeFit + } + + evictable := podEvictor.Evictable(evictions.WithPriorityThreshold(thresholdPriority), evictions.WithNodeFit(nodeFit)) for _, node := range nodes { klog.V(1).InfoS("Processing node", "node", klog.KObj(node)) diff --git a/pkg/descheduler/strategies/node_taint_test.go b/pkg/descheduler/strategies/node_taint_test.go index 03c6491036..bbb93ed233 100644 --- a/pkg/descheduler/strategies/node_taint_test.go +++ b/pkg/descheduler/strategies/node_taint_test.go @@ -51,6 +51,17 @@ func TestDeletePodsViolatingNodeTaints(t *testing.T) { node2 := test.BuildTestNode("n2", 2000, 3000, 10, nil) node1 = addTaintsToNode(node2, "testingTaint", "testing", []int{1}) + node3 := test.BuildTestNode("n3", 2000, 3000, 10, func(node *v1.Node) { + node.ObjectMeta.Labels = map[string]string{ + "datacenter": "east", + } + }) + node4 := test.BuildTestNode("n4", 2000, 3000, 10, func(node *v1.Node) { + node.Spec = v1.NodeSpec{ + Unschedulable: true, + } + }) + p1 := test.BuildTestPod("p1", 100, 0, node1.Name, nil) p2 := test.BuildTestPod("p2", 100, 0, node1.Name, nil) p3 := test.BuildTestPod("p3", 100, 0, node1.Name, nil) @@ -70,6 +81,8 @@ func TestDeletePodsViolatingNodeTaints(t *testing.T) { p10 := test.BuildTestPod("p10", 100, 0, node2.Name, nil) p11 := test.BuildTestPod("p11", 100, 0, node2.Name, nil) p11.ObjectMeta.OwnerReferences = test.GetNormalPodOwnerRefList() + p12 := test.BuildTestPod("p11", 100, 0, node2.Name, nil) + p12.ObjectMeta.OwnerReferences = test.GetNormalPodOwnerRefList() // The following 4 pods won't get evicted. // A Critical Pod. 
@@ -99,6 +112,10 @@ func TestDeletePodsViolatingNodeTaints(t *testing.T) { p3 = addTolerationToPod(p3, "testTaint", "test", 1) p4 = addTolerationToPod(p4, "testTaintX", "testX", 1) + p12.Spec.NodeSelector = map[string]string{ + "datacenter": "west", + } + tests := []struct { description string nodes []*v1.Node @@ -107,6 +124,7 @@ func TestDeletePodsViolatingNodeTaints(t *testing.T) { evictSystemCriticalPods bool maxPodsToEvictPerNode int expectedEvictedPodCount int + nodeFit bool }{ { @@ -172,6 +190,36 @@ func TestDeletePodsViolatingNodeTaints(t *testing.T) { maxPodsToEvictPerNode: 0, expectedEvictedPodCount: 2, //p2 and p7 are evicted }, + { + description: "Pod p2 doesn't tolerate the taint on its node, but also doesn't tolerate taints on other nodes", + pods: []v1.Pod{*p1, *p2, *p3}, + nodes: []*v1.Node{node1, node2}, + evictLocalStoragePods: false, + evictSystemCriticalPods: false, + maxPodsToEvictPerNode: 0, + expectedEvictedPodCount: 0, //p2 is not evicted because it doesn't tolerate the taints on the other node + nodeFit: true, + }, + { + description: "Pod p12 doesn't tolerate the taint on its node, but other nodes don't match its selector", + pods: []v1.Pod{*p1, *p3, *p12}, + nodes: []*v1.Node{node1, node3}, + evictLocalStoragePods: false, + evictSystemCriticalPods: false, + maxPodsToEvictPerNode: 0, + expectedEvictedPodCount: 0, //p12 is not evicted because its nodeSelector doesn't match the other node + nodeFit: true, + }, + { + description: "Pod p2 doesn't tolerate the taint on its node, but other nodes are unschedulable", + pods: []v1.Pod{*p1, *p2, *p3}, + nodes: []*v1.Node{node1, node4}, + evictLocalStoragePods: false, + evictSystemCriticalPods: false, + maxPodsToEvictPerNode: 0, + expectedEvictedPodCount: 0, //p2 is not evicted because the other node is unschedulable + nodeFit: true, + }, } for _, tc := range tests { @@ -193,7 +241,13 @@ func TestDeletePodsViolatingNodeTaints(t *testing.T) { false, ) - RemovePodsViolatingNodeTaints(ctx, fakeClient, api.DeschedulerStrategy{}, tc.nodes, podEvictor) + strategy := api.DeschedulerStrategy{ + Params: &api.StrategyParameters{ + NodeFit: tc.nodeFit, + }, + } + + RemovePodsViolatingNodeTaints(ctx, fakeClient, strategy, tc.nodes, podEvictor) actualEvictedPodCount := podEvictor.TotalEvicted() if actualEvictedPodCount != tc.expectedEvictedPodCount { t.Errorf("Test %#v failed, Unexpected no of pods evicted: pods evicted: %d, expected: %d", tc.description, actualEvictedPodCount, tc.expectedEvictedPodCount) diff --git a/pkg/descheduler/strategies/pod_antiaffinity.go b/pkg/descheduler/strategies/pod_antiaffinity.go index 9a263b75ae..6fe9d5a293 100644 --- a/pkg/descheduler/strategies/pod_antiaffinity.go +++ b/pkg/descheduler/strategies/pod_antiaffinity.go @@ -70,7 +70,12 @@ func RemovePodsViolatingInterPodAntiAffinity(ctx context.Context, client clients return } - evictable := podEvictor.Evictable(evictions.WithPriorityThreshold(thresholdPriority)) + nodeFit := false + if strategy.Params != nil { + nodeFit = strategy.Params.NodeFit + } + + evictable := podEvictor.Evictable(evictions.WithPriorityThreshold(thresholdPriority), evictions.WithNodeFit(nodeFit)) for _, node := range nodes { klog.V(1).InfoS("Processing node", "node", klog.KObj(node)) diff --git a/pkg/descheduler/strategies/pod_antiaffinity_test.go b/pkg/descheduler/strategies/pod_antiaffinity_test.go index 534c52bfb9..ab2f018a78 100644 --- a/pkg/descheduler/strategies/pod_antiaffinity_test.go +++ b/pkg/descheduler/strategies/pod_antiaffinity_test.go @@ -34,16 +34,30 @@ import ( func TestPodAntiAffinity(t *testing.T) { ctx := context.Background() - node := test.BuildTestNode("n1", 2000, 3000, 10, nil) - p1 := test.BuildTestPod("p1", 100, 0,
node.Name, nil) - p2 := test.BuildTestPod("p2", 100, 0, node.Name, nil) - p3 := test.BuildTestPod("p3", 100, 0, node.Name, nil) - p4 := test.BuildTestPod("p4", 100, 0, node.Name, nil) - p5 := test.BuildTestPod("p5", 100, 0, node.Name, nil) - p6 := test.BuildTestPod("p6", 100, 0, node.Name, nil) - p7 := test.BuildTestPod("p7", 100, 0, node.Name, nil) + node1 := test.BuildTestNode("n1", 2000, 3000, 10, nil) + node2 := test.BuildTestNode("n2", 2000, 3000, 10, func(node *v1.Node) { + node.ObjectMeta.Labels = map[string]string{ + "datacenter": "east", + } + }) + + node3 := test.BuildTestNode("n3", 2000, 3000, 10, func(node *v1.Node) { + node.Spec = v1.NodeSpec{ + Unschedulable: true, + } + }) + + p1 := test.BuildTestPod("p1", 100, 0, node1.Name, nil) + p2 := test.BuildTestPod("p2", 100, 0, node1.Name, nil) + p3 := test.BuildTestPod("p3", 100, 0, node1.Name, nil) + p4 := test.BuildTestPod("p4", 100, 0, node1.Name, nil) + p5 := test.BuildTestPod("p5", 100, 0, node1.Name, nil) + p6 := test.BuildTestPod("p6", 100, 0, node1.Name, nil) + p7 := test.BuildTestPod("p7", 100, 0, node1.Name, nil) + p8 := test.BuildTestPod("p8", 100, 0, node1.Name, nil) + criticalPriority := utils.SystemCriticalPriority - nonEvictablePod := test.BuildTestPod("non-evict", 100, 0, node.Name, func(pod *v1.Pod) { + nonEvictablePod := test.BuildTestPod("non-evict", 100, 0, node1.Name, func(pod *v1.Pod) { pod.Spec.Priority = &criticalPriority }) p2.Labels = map[string]string{"foo": "bar"} @@ -72,36 +86,70 @@ func TestPodAntiAffinity(t *testing.T) { test.SetPodPriority(p6, 50) test.SetPodPriority(p7, 0) + // Set pod node selectors + p8.Spec.NodeSelector = map[string]string{ + "datacenter": "west", + } + tests := []struct { description string maxPodsToEvictPerNode int pods []v1.Pod expectedEvictedPodCount int + nodeFit bool + nodes []*v1.Node }{ { description: "Maximum pods to evict - 0", maxPodsToEvictPerNode: 0, pods: []v1.Pod{*p1, *p2, *p3, *p4}, + nodes: []*v1.Node{node1}, expectedEvictedPodCount: 3, }, { description: "Maximum pods to evict - 3", maxPodsToEvictPerNode: 3, pods: []v1.Pod{*p1, *p2, *p3, *p4}, + nodes: []*v1.Node{node1}, expectedEvictedPodCount: 3, }, { description: "Evict only 1 pod after sorting", maxPodsToEvictPerNode: 0, pods: []v1.Pod{*p5, *p6, *p7}, + nodes: []*v1.Node{node1}, expectedEvictedPodCount: 1, }, { description: "Evicts pod that conflicts with critical pod (but does not evict critical pod)", maxPodsToEvictPerNode: 1, pods: []v1.Pod{*p1, *nonEvictablePod}, + nodes: []*v1.Node{node1}, expectedEvictedPodCount: 1, }, + { + description: "Evicts pod that conflicts with critical pod (but does not evict critical pod)", + maxPodsToEvictPerNode: 1, + pods: []v1.Pod{*p1, *nonEvictablePod}, + nodes: []*v1.Node{node1}, + expectedEvictedPodCount: 1, + }, + { + description: "Won't evict pods because node selectors don't match available nodes", + maxPodsToEvictPerNode: 1, + pods: []v1.Pod{*p8, *nonEvictablePod}, + nodes: []*v1.Node{node1, node2}, + expectedEvictedPodCount: 0, + nodeFit: true, + }, + { + description: "Won't evict pods because only other node is not schedulable", + maxPodsToEvictPerNode: 1, + pods: []v1.Pod{*p8, *nonEvictablePod}, + nodes: []*v1.Node{node1, node3}, + expectedEvictedPodCount: 0, + nodeFit: true, + }, } for _, test := range tests { @@ -111,7 +159,7 @@ func TestPodAntiAffinity(t *testing.T) { return true, &v1.PodList{Items: test.pods}, nil }) fakeClient.Fake.AddReactor("get", "nodes", func(action core.Action) (bool, runtime.Object, error) { - return true, node, nil + return true, 
node1, nil }) podEvictor := evictions.NewPodEvictor( @@ -119,13 +167,18 @@ func TestPodAntiAffinity(t *testing.T) { policyv1.SchemeGroupVersion.String(), false, test.maxPodsToEvictPerNode, - []*v1.Node{node}, + test.nodes, false, false, false, ) + strategy := api.DeschedulerStrategy{ + Params: &api.StrategyParameters{ + NodeFit: test.nodeFit, + }, + } - RemovePodsViolatingInterPodAntiAffinity(ctx, fakeClient, api.DeschedulerStrategy{}, []*v1.Node{node}, podEvictor) + RemovePodsViolatingInterPodAntiAffinity(ctx, fakeClient, strategy, test.nodes, podEvictor) podsEvicted := podEvictor.TotalEvicted() if podsEvicted != test.expectedEvictedPodCount { t.Errorf("Unexpected no of pods evicted: pods evicted: %d, expected: %d", podsEvicted, test.expectedEvictedPodCount) diff --git a/pkg/descheduler/strategies/pod_lifetime_test.go b/pkg/descheduler/strategies/pod_lifetime_test.go index 2f9754223c..a4b2f165f7 100644 --- a/pkg/descheduler/strategies/pod_lifetime_test.go +++ b/pkg/descheduler/strategies/pod_lifetime_test.go @@ -34,15 +34,15 @@ import ( func TestPodLifeTime(t *testing.T) { ctx := context.Background() - node := test.BuildTestNode("n1", 2000, 3000, 10, nil) + node1 := test.BuildTestNode("n1", 2000, 3000, 10, nil) olderPodCreationTime := metav1.NewTime(time.Date(2009, time.November, 10, 23, 0, 0, 0, time.UTC)) newerPodCreationTime := metav1.NewTime(time.Now()) // Setup pods, one should be evicted - p1 := test.BuildTestPod("p1", 100, 0, node.Name, nil) + p1 := test.BuildTestPod("p1", 100, 0, node1.Name, nil) p1.Namespace = "dev" p1.ObjectMeta.CreationTimestamp = newerPodCreationTime - p2 := test.BuildTestPod("p2", 100, 0, node.Name, nil) + p2 := test.BuildTestPod("p2", 100, 0, node1.Name, nil) p2.Namespace = "dev" p2.ObjectMeta.CreationTimestamp = olderPodCreationTime @@ -51,10 +51,10 @@ func TestPodLifeTime(t *testing.T) { p2.ObjectMeta.OwnerReferences = ownerRef1 // Setup pods, zero should be evicted - p3 := test.BuildTestPod("p3", 100, 0, node.Name, nil) + p3 := test.BuildTestPod("p3", 100, 0, node1.Name, nil) p3.Namespace = "dev" p3.ObjectMeta.CreationTimestamp = newerPodCreationTime - p4 := test.BuildTestPod("p4", 100, 0, node.Name, nil) + p4 := test.BuildTestPod("p4", 100, 0, node1.Name, nil) p4.Namespace = "dev" p4.ObjectMeta.CreationTimestamp = newerPodCreationTime @@ -63,10 +63,10 @@ func TestPodLifeTime(t *testing.T) { p4.ObjectMeta.OwnerReferences = ownerRef2 // Setup pods, one should be evicted - p5 := test.BuildTestPod("p5", 100, 0, node.Name, nil) + p5 := test.BuildTestPod("p5", 100, 0, node1.Name, nil) p5.Namespace = "dev" p5.ObjectMeta.CreationTimestamp = newerPodCreationTime - p6 := test.BuildTestPod("p6", 100, 0, node.Name, nil) + p6 := test.BuildTestPod("p6", 100, 0, node1.Name, nil) p6.Namespace = "dev" p6.ObjectMeta.CreationTimestamp = metav1.NewTime(time.Now().Add(time.Second * 605)) @@ -75,10 +75,10 @@ func TestPodLifeTime(t *testing.T) { p6.ObjectMeta.OwnerReferences = ownerRef3 // Setup pods, zero should be evicted - p7 := test.BuildTestPod("p7", 100, 0, node.Name, nil) + p7 := test.BuildTestPod("p7", 100, 0, node1.Name, nil) p7.Namespace = "dev" p7.ObjectMeta.CreationTimestamp = newerPodCreationTime - p8 := test.BuildTestPod("p8", 100, 0, node.Name, nil) + p8 := test.BuildTestPod("p8", 100, 0, node1.Name, nil) p8.Namespace = "dev" p8.ObjectMeta.CreationTimestamp = metav1.NewTime(time.Now().Add(time.Second * 595)) @@ -87,10 +87,10 @@ func TestPodLifeTime(t *testing.T) { p6.ObjectMeta.OwnerReferences = ownerRef4 // Setup two old pods with different status phases - p9 
:= test.BuildTestPod("p9", 100, 0, node.Name, nil) + p9 := test.BuildTestPod("p9", 100, 0, node1.Name, nil) p9.Namespace = "dev" p9.ObjectMeta.CreationTimestamp = olderPodCreationTime - p10 := test.BuildTestPod("p10", 100, 0, node.Name, nil) + p10 := test.BuildTestPod("p10", 100, 0, node1.Name, nil) p10.Namespace = "dev" p10.ObjectMeta.CreationTimestamp = olderPodCreationTime @@ -99,7 +99,7 @@ func TestPodLifeTime(t *testing.T) { p9.ObjectMeta.OwnerReferences = ownerRef1 p10.ObjectMeta.OwnerReferences = ownerRef1 - p11 := test.BuildTestPod("p11", 100, 0, node.Name, func(pod *v1.Pod) { + p11 := test.BuildTestPod("p11", 100, 0, node1.Name, func(pod *v1.Pod) { pod.Spec.Volumes = []v1.Volume{ { Name: "pvc", VolumeSource: v1.VolumeSource{ @@ -113,10 +113,10 @@ func TestPodLifeTime(t *testing.T) { }) // Setup two old pods with different labels - p12 := test.BuildTestPod("p12", 100, 0, node.Name, nil) + p12 := test.BuildTestPod("p12", 100, 0, node1.Name, nil) p12.Namespace = "dev" p12.ObjectMeta.CreationTimestamp = olderPodCreationTime - p13 := test.BuildTestPod("p13", 100, 0, node.Name, nil) + p13 := test.BuildTestPod("p13", 100, 0, node1.Name, nil) p13.Namespace = "dev" p13.ObjectMeta.CreationTimestamp = olderPodCreationTime @@ -131,6 +131,7 @@ func TestPodLifeTime(t *testing.T) { strategy api.DeschedulerStrategy maxPodsToEvictPerNode int pods []v1.Pod + nodes []*v1.Node expectedEvictedPodCount int ignorePvcPods bool }{ @@ -144,6 +145,7 @@ func TestPodLifeTime(t *testing.T) { }, maxPodsToEvictPerNode: 5, pods: []v1.Pod{*p1, *p2}, + nodes: []*v1.Node{node1}, expectedEvictedPodCount: 1, }, { @@ -156,6 +158,7 @@ func TestPodLifeTime(t *testing.T) { }, maxPodsToEvictPerNode: 5, pods: []v1.Pod{*p3, *p4}, + nodes: []*v1.Node{node1}, expectedEvictedPodCount: 0, }, { @@ -168,6 +171,7 @@ func TestPodLifeTime(t *testing.T) { }, maxPodsToEvictPerNode: 5, pods: []v1.Pod{*p5, *p6}, + nodes: []*v1.Node{node1}, expectedEvictedPodCount: 1, }, { @@ -180,6 +184,7 @@ func TestPodLifeTime(t *testing.T) { }, maxPodsToEvictPerNode: 5, pods: []v1.Pod{*p7, *p8}, + nodes: []*v1.Node{node1}, expectedEvictedPodCount: 0, }, { @@ -195,6 +200,7 @@ func TestPodLifeTime(t *testing.T) { }, maxPodsToEvictPerNode: 5, pods: []v1.Pod{*p9, *p10}, + nodes: []*v1.Node{node1}, expectedEvictedPodCount: 1, }, { @@ -207,6 +213,7 @@ func TestPodLifeTime(t *testing.T) { }, maxPodsToEvictPerNode: 5, pods: []v1.Pod{*p11}, + nodes: []*v1.Node{node1}, expectedEvictedPodCount: 0, ignorePvcPods: true, }, @@ -220,6 +227,7 @@ func TestPodLifeTime(t *testing.T) { }, maxPodsToEvictPerNode: 5, pods: []v1.Pod{*p11}, + nodes: []*v1.Node{node1}, expectedEvictedPodCount: 1, }, { @@ -235,30 +243,30 @@ func TestPodLifeTime(t *testing.T) { }, maxPodsToEvictPerNode: 5, pods: []v1.Pod{*p12, *p13}, + nodes: []*v1.Node{node1}, expectedEvictedPodCount: 1, }, } for _, tc := range testCases { fakeClient := &fake.Clientset{} + fakeClient.Fake.AddReactor("list", "pods", func(action core.Action) (bool, runtime.Object, error) { return true, &v1.PodList{Items: tc.pods}, nil }) - fakeClient.Fake.AddReactor("get", "nodes", func(action core.Action) (bool, runtime.Object, error) { - return true, node, nil - }) + podEvictor := evictions.NewPodEvictor( fakeClient, policyv1.SchemeGroupVersion.String(), false, tc.maxPodsToEvictPerNode, - []*v1.Node{node}, + tc.nodes, false, false, tc.ignorePvcPods, ) - PodLifeTime(ctx, fakeClient, tc.strategy, []*v1.Node{node}, podEvictor) + PodLifeTime(ctx, fakeClient, tc.strategy, tc.nodes, podEvictor) podsEvicted := 
podEvictor.TotalEvicted() if podsEvicted != tc.expectedEvictedPodCount { t.Errorf("Test error for description: %s. Expected evicted pods count %v, got %v", tc.description, tc.expectedEvictedPodCount, podsEvicted) diff --git a/pkg/descheduler/strategies/toomanyrestarts.go b/pkg/descheduler/strategies/toomanyrestarts.go index de79c88f01..11dddc4029 100644 --- a/pkg/descheduler/strategies/toomanyrestarts.go +++ b/pkg/descheduler/strategies/toomanyrestarts.go @@ -67,7 +67,12 @@ func RemovePodsHavingTooManyRestarts(ctx context.Context, client clientset.Inter excludedNamespaces = strategy.Params.Namespaces.Exclude } - evictable := podEvictor.Evictable(evictions.WithPriorityThreshold(thresholdPriority)) + nodeFit := false + if strategy.Params != nil { + nodeFit = strategy.Params.NodeFit + } + + evictable := podEvictor.Evictable(evictions.WithPriorityThreshold(thresholdPriority), evictions.WithNodeFit(nodeFit)) for _, node := range nodes { klog.V(1).InfoS("Processing node", "node", klog.KObj(node)) diff --git a/pkg/descheduler/strategies/toomanyrestarts_test.go b/pkg/descheduler/strategies/toomanyrestarts_test.go index 5675786b53..a76e0bf1b1 100644 --- a/pkg/descheduler/strategies/toomanyrestarts_test.go +++ b/pkg/descheduler/strategies/toomanyrestarts_test.go @@ -82,7 +82,26 @@ func initPods(node *v1.Node) []v1.Pod { func TestRemovePodsHavingTooManyRestarts(t *testing.T) { ctx := context.Background() - createStrategy := func(enabled, includingInitContainers bool, restartThresholds int32) api.DeschedulerStrategy { + + node1 := test.BuildTestNode("node1", 2000, 3000, 10, nil) + node2 := test.BuildTestNode("node2", 2000, 3000, 10, func(node *v1.Node) { + node.Spec.Taints = []v1.Taint{ + { + Key: "hardware", + Value: "gpu", + Effect: v1.TaintEffectNoSchedule, + }, + } + }) + node3 := test.BuildTestNode("node3", 2000, 3000, 10, func(node *v1.Node) { + node.Spec = v1.NodeSpec{ + Unschedulable: true, + } + }) + + pods := initPods(node1) + + createStrategy := func(enabled, includingInitContainers bool, restartThresholds int32, nodeFit bool) api.DeschedulerStrategy { return api.DeschedulerStrategy{ Enabled: enabled, Params: &api.StrategyParameters{ @@ -90,76 +109,98 @@ func TestRemovePodsHavingTooManyRestarts(t *testing.T) { PodRestartThreshold: restartThresholds, IncludingInitContainers: includingInitContainers, }, + NodeFit: nodeFit, }, } } tests := []struct { description string - pods []v1.Pod + nodes []*v1.Node strategy api.DeschedulerStrategy expectedEvictedPodCount int maxPodsToEvictPerNode int }{ { description: "All pods have total restarts under threshold, no pod evictions", - strategy: createStrategy(true, true, 10000), + strategy: createStrategy(true, true, 10000, false), + nodes: []*v1.Node{node1}, expectedEvictedPodCount: 0, maxPodsToEvictPerNode: 0, }, { description: "Some pods have total restarts bigger than threshold", - strategy: createStrategy(true, true, 1), + strategy: createStrategy(true, true, 1, false), + nodes: []*v1.Node{node1}, expectedEvictedPodCount: 6, maxPodsToEvictPerNode: 0, }, { - description: "Nine pods have total restarts equals threshold(includingInitContainers=true), 6 pods evictions", - strategy: createStrategy(true, true, 1*25), + description: "Nine pods have total restarts equals threshold(includingInitContainers=true), 6 pod evictions", + strategy: createStrategy(true, true, 1*25, false), + nodes: []*v1.Node{node1}, expectedEvictedPodCount: 6, maxPodsToEvictPerNode: 0, }, { - description: "Nine pods have total restarts equals 
threshold(includingInitContainers=false), 5 pods evictions", - strategy: createStrategy(true, false, 1*25), + description: "Nine pods have total restarts equals threshold(includingInitContainers=false), 5 pod evictions", + strategy: createStrategy(true, false, 1*25, false), + nodes: []*v1.Node{node1}, expectedEvictedPodCount: 5, maxPodsToEvictPerNode: 0, }, { - description: "All pods have total restarts equals threshold(includingInitContainers=true), 6 pods evictions", - strategy: createStrategy(true, true, 1*20), + description: "All pods have total restarts equals threshold(includingInitContainers=true), 6 pod evictions", + strategy: createStrategy(true, true, 1*20, false), + nodes: []*v1.Node{node1}, expectedEvictedPodCount: 6, maxPodsToEvictPerNode: 0, }, { - description: "Nine pods have total restarts equals threshold(includingInitContainers=false), 6 pods evictions", - strategy: createStrategy(true, false, 1*20), + description: "Nine pods have total restarts equals threshold(includingInitContainers=false), 6 pod evictions", + strategy: createStrategy(true, false, 1*20, false), + nodes: []*v1.Node{node1}, expectedEvictedPodCount: 6, maxPodsToEvictPerNode: 0, }, { description: "Five pods have total restarts bigger than threshold(includingInitContainers=true), but only 1 pod eviction", - strategy: createStrategy(true, true, 5*25+1), + strategy: createStrategy(true, true, 5*25+1, false), + nodes: []*v1.Node{node1}, expectedEvictedPodCount: 1, maxPodsToEvictPerNode: 0, }, { description: "Five pods have total restarts bigger than threshold(includingInitContainers=false), but only 1 pod eviction", - strategy: createStrategy(true, false, 5*20+1), + strategy: createStrategy(true, false, 5*20+1, false), + nodes: []*v1.Node{node1}, expectedEvictedPodCount: 1, maxPodsToEvictPerNode: 0, }, { - description: "All pods have total restarts equals threshold(maxPodsToEvictPerNode=3), 3 pods evictions", - strategy: createStrategy(true, true, 1), + description: "All pods have total restarts equals threshold(maxPodsToEvictPerNode=3), 3 pod evictions", + strategy: createStrategy(true, true, 1, false), + nodes: []*v1.Node{node1}, expectedEvictedPodCount: 3, maxPodsToEvictPerNode: 3, }, + { + description: "All pods have total restarts equals threshold(maxPodsToEvictPerNode=3) but the only other node is tained, 0 pod evictions", + strategy: createStrategy(true, true, 1, true), + nodes: []*v1.Node{node1, node2}, + expectedEvictedPodCount: 0, + maxPodsToEvictPerNode: 3, + }, + { + description: "All pods have total restarts equals threshold(maxPodsToEvictPerNode=3) but the only other node is not schedulable, 0 pod evictions", + strategy: createStrategy(true, true, 1, true), + nodes: []*v1.Node{node1, node3}, + expectedEvictedPodCount: 0, + maxPodsToEvictPerNode: 3, + }, } for _, tc := range tests { - node := test.BuildTestNode("node1", 2000, 3000, 10, nil) - pods := initPods(node) fakeClient := &fake.Clientset{} fakeClient.Fake.AddReactor("list", "pods", func(action core.Action) (bool, runtime.Object, error) { @@ -171,13 +212,13 @@ func TestRemovePodsHavingTooManyRestarts(t *testing.T) { policyv1.SchemeGroupVersion.String(), false, tc.maxPodsToEvictPerNode, - []*v1.Node{node}, + tc.nodes, false, false, false, ) - RemovePodsHavingTooManyRestarts(ctx, fakeClient, tc.strategy, []*v1.Node{node}, podEvictor) + RemovePodsHavingTooManyRestarts(ctx, fakeClient, tc.strategy, tc.nodes, podEvictor) actualEvictedPodCount := podEvictor.TotalEvicted() if actualEvictedPodCount != tc.expectedEvictedPodCount { t.Errorf("Test %#v 
failed, expected %v pod evictions, but got %v pod evictions\n", tc.description, tc.expectedEvictedPodCount, actualEvictedPodCount) diff --git a/pkg/descheduler/strategies/topologyspreadconstraint.go b/pkg/descheduler/strategies/topologyspreadconstraint.go index b408ce16d6..c2bfd1cf05 100644 --- a/pkg/descheduler/strategies/topologyspreadconstraint.go +++ b/pkg/descheduler/strategies/topologyspreadconstraint.go @@ -19,10 +19,11 @@ package strategies import ( "context" "fmt" - utilerrors "k8s.io/apimachinery/pkg/util/errors" "math" "sort" + utilerrors "k8s.io/apimachinery/pkg/util/errors" + v1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/labels" @@ -83,11 +84,17 @@ func RemovePodsViolatingTopologySpreadConstraint( return } + nodeFit := false + if strategy.Params != nil { + nodeFit = strategy.Params.NodeFit + } + + evictable := podEvictor.Evictable(evictions.WithPriorityThreshold(thresholdPriority), evictions.WithNodeFit(nodeFit)) + nodeMap := make(map[string]*v1.Node, len(nodes)) for _, node := range nodes { nodeMap[node.Name] = node } - evictable := podEvictor.Evictable(evictions.WithPriorityThreshold(thresholdPriority)) // 1. for each namespace for which there is Topology Constraint // 2. for each TopologySpreadConstraint in that namespace @@ -243,6 +250,7 @@ func balanceDomains( sumPods float64, isEvictable func(*v1.Pod) bool, nodeMap map[string]*v1.Node) { + idealAvg := sumPods / float64(len(constraintTopologies)) sortedDomains := sortDomains(constraintTopologies, isEvictable) // i is the index for belowOrEqualAvg diff --git a/pkg/descheduler/strategies/topologyspreadconstraint_test.go b/pkg/descheduler/strategies/topologyspreadconstraint_test.go index 5eb256c701..2d605fa041 100644 --- a/pkg/descheduler/strategies/topologyspreadconstraint_test.go +++ b/pkg/descheduler/strategies/topologyspreadconstraint_test.go @@ -58,8 +58,12 @@ func TestTopologySpreadConstraint(t *testing.T) { }, }), expectedEvictedCount: 0, - strategy: api.DeschedulerStrategy{}, - namespaces: []string{"ns1"}, + strategy: api.DeschedulerStrategy{ + Params: &api.StrategyParameters{ + NodeFit: false, + }, + }, + namespaces: []string{"ns1"}, }, { name: "2 domains, sizes [3,1], maxSkew=1, move 1 pod to achieve [2,2]", @@ -93,8 +97,12 @@ func TestTopologySpreadConstraint(t *testing.T) { }, }), expectedEvictedCount: 1, - strategy: api.DeschedulerStrategy{}, - namespaces: []string{"ns1"}, + strategy: api.DeschedulerStrategy{ + Params: &api.StrategyParameters{ + NodeFit: false, + }, + }, + namespaces: []string{"ns1"}, }, { name: "2 domains, sizes [3,1], maxSkew=1, move 1 pod to achieve [2,2] (soft constraints)", @@ -166,8 +174,12 @@ func TestTopologySpreadConstraint(t *testing.T) { }, }), expectedEvictedCount: 0, - strategy: api.DeschedulerStrategy{}, - namespaces: []string{"ns1"}, + strategy: api.DeschedulerStrategy{ + Params: &api.StrategyParameters{ + NodeFit: false, + }, + }, + namespaces: []string{"ns1"}, }, { name: "2 domains, sizes [3,1], maxSkew=1, move 1 pod to achieve [2,2], exclude kube-system namespace", @@ -201,7 +213,7 @@ func TestTopologySpreadConstraint(t *testing.T) { }, }), expectedEvictedCount: 1, - strategy: api.DeschedulerStrategy{Enabled: true, Params: &api.StrategyParameters{Namespaces: &api.Namespaces{Exclude: []string{"kube-system"}}}}, + strategy: api.DeschedulerStrategy{Enabled: true, Params: &api.StrategyParameters{NodeFit: true, Namespaces: &api.Namespaces{Exclude: []string{"kube-system"}}}}, namespaces: []string{"ns1"}, }, { @@ -236,8 +248,12 @@ func 
TestTopologySpreadConstraint(t *testing.T) { }, }), expectedEvictedCount: 1, - strategy: api.DeschedulerStrategy{}, - namespaces: []string{"ns1"}, + strategy: api.DeschedulerStrategy{ + Params: &api.StrategyParameters{ + NodeFit: false, + }, + }, + namespaces: []string{"ns1"}, }, { name: "2 domains, sizes [4,0], maxSkew=1, move 2 pods to achieve [2,2]", @@ -266,8 +282,12 @@ func TestTopologySpreadConstraint(t *testing.T) { }, }), expectedEvictedCount: 2, - strategy: api.DeschedulerStrategy{}, - namespaces: []string{"ns1"}, + strategy: api.DeschedulerStrategy{ + Params: &api.StrategyParameters{ + NodeFit: false, + }, + }, + namespaces: []string{"ns1"}, }, { name: "2 domains, sizes [4,0], maxSkew=1, only move 1 pod since pods with nodeSelector and nodeAffinity aren't evicted", @@ -313,8 +333,12 @@ func TestTopologySpreadConstraint(t *testing.T) { }, }), expectedEvictedCount: 1, - strategy: api.DeschedulerStrategy{}, - namespaces: []string{"ns1"}, + strategy: api.DeschedulerStrategy{ + Params: &api.StrategyParameters{ + NodeFit: true, + }, + }, + namespaces: []string{"ns1"}, }, { name: "2 domains, sizes [4,0], maxSkew=1, move 2 pods since selector matches multiple nodes", @@ -363,8 +387,12 @@ func TestTopologySpreadConstraint(t *testing.T) { }, }), expectedEvictedCount: 2, - strategy: api.DeschedulerStrategy{}, - namespaces: []string{"ns1"}, + strategy: api.DeschedulerStrategy{ + Params: &api.StrategyParameters{ + NodeFit: false, + }, + }, + namespaces: []string{"ns1"}, }, { name: "3 domains, sizes [0, 1, 100], maxSkew=1, move 66 pods to get [34, 33, 34]", @@ -431,8 +459,12 @@ func TestTopologySpreadConstraint(t *testing.T) { }, }), expectedEvictedCount: 3, - strategy: api.DeschedulerStrategy{}, - namespaces: []string{"ns1"}, + strategy: api.DeschedulerStrategy{ + Params: &api.StrategyParameters{ + NodeFit: false, + }, + }, + namespaces: []string{"ns1"}, }, { name: "2 domains size [2 6], maxSkew=2, should move 1 to get [3 5]", @@ -466,8 +498,60 @@ func TestTopologySpreadConstraint(t *testing.T) { }, }), expectedEvictedCount: 1, - strategy: api.DeschedulerStrategy{}, - namespaces: []string{"ns1"}, + strategy: api.DeschedulerStrategy{ + Params: &api.StrategyParameters{ + NodeFit: false, + }, + }, + namespaces: []string{"ns1"}, + }, + { + name: "2 domains size [2 6], maxSkew=2, can't move any because of node taints", + nodes: []*v1.Node{ + test.BuildTestNode("n1", 2000, 3000, 10, func(n *v1.Node) { + n.Labels["zone"] = "zoneA" + n.Spec.Taints = []v1.Taint{ + { + Key: "hardware", + Value: "gpu", + Effect: v1.TaintEffectNoSchedule, + }, + } + }), + test.BuildTestNode("n2", 2000, 3000, 10, func(n *v1.Node) { n.Labels["zone"] = "zoneB" }), + }, + pods: createTestPods([]testPodList{ + { + count: 1, + node: "n1", + labels: map[string]string{"foo": "bar"}, + constraints: []v1.TopologySpreadConstraint{ + { + MaxSkew: 2, + TopologyKey: "zone", + WhenUnsatisfiable: v1.DoNotSchedule, + LabelSelector: &metav1.LabelSelector{MatchLabels: map[string]string{"foo": "bar"}}, + }, + }, + }, + { + count: 1, + node: "n1", + labels: map[string]string{"foo": "bar"}, + }, + { + count: 6, + node: "n2", + labels: map[string]string{"foo": "bar"}, + }, + }), + expectedEvictedCount: 0, + strategy: api.DeschedulerStrategy{ + Params: &api.StrategyParameters{ + NodeFit: true, + }, + }, + namespaces: []string{"ns1"}, }, { // see https://github.com/kubernetes-sigs/descheduler/issues/564