From 2976e15cc3d5af14b6894b4cc1103db16d4a83bb Mon Sep 17 00:00:00 2001 From: Jan Chaloupka Date: Wed, 6 Apr 2022 23:58:21 +0200 Subject: [PATCH] Descheduling framework PoC: flip RemoveDuplicatePods to a plugin --- pkg/api/types.go | 5 + .../removeduplicatepods.go} | 126 ++++++++++-------- .../removeduplicatepods_test.go} | 71 +++++++--- pkg/framework/types.go | 50 +++++++ pkg/utils/priority.go | 20 +++ 5 files changed, 198 insertions(+), 74 deletions(-) rename pkg/{descheduler/strategies/duplicates.go => framework/plugins/removeduplicatepods/removeduplicatepods.go} (78%) rename pkg/{descheduler/strategies/duplicates_test.go => framework/plugins/removeduplicatepods/removeduplicatepods_test.go} (93%) create mode 100644 pkg/framework/types.go diff --git a/pkg/api/types.go b/pkg/api/types.go index ef4c651232..55bd6ca77a 100644 --- a/pkg/api/types.go +++ b/pkg/api/types.go @@ -72,6 +72,11 @@ type Namespaces struct { Exclude []string } +type PriorityThreshold struct { + Value *int32 + Name string +} + // Besides Namespaces only one of its members may be specified // TODO(jchaloup): move Namespaces ThresholdPriority and ThresholdPriorityClassName to individual strategies // once the policy version is bumped to v1alpha2 diff --git a/pkg/descheduler/strategies/duplicates.go b/pkg/framework/plugins/removeduplicatepods/removeduplicatepods.go similarity index 78% rename from pkg/descheduler/strategies/duplicates.go rename to pkg/framework/plugins/removeduplicatepods/removeduplicatepods.go index 462fb36f6e..39e4ee35db 100644 --- a/pkg/descheduler/strategies/duplicates.go +++ b/pkg/framework/plugins/removeduplicatepods/removeduplicatepods.go @@ -14,7 +14,7 @@ See the License for the specific language governing permissions and limitations under the License. */ -package strategies +package removeduplicatepods import ( "context" @@ -26,75 +26,61 @@ import ( v1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/util/sets" - clientset "k8s.io/client-go/kubernetes" "k8s.io/klog/v2" - "sigs.k8s.io/descheduler/pkg/api" "sigs.k8s.io/descheduler/pkg/descheduler/evictions" podutil "sigs.k8s.io/descheduler/pkg/descheduler/pod" + "sigs.k8s.io/descheduler/pkg/framework" "sigs.k8s.io/descheduler/pkg/utils" ) -func validateRemoveDuplicatePodsParams(params *api.StrategyParameters) error { - if params == nil { - return nil - } - // At most one of include/exclude can be set - if params.Namespaces != nil && len(params.Namespaces.Include) > 0 && len(params.Namespaces.Exclude) > 0 { - return fmt.Errorf("only one of Include/Exclude namespaces can be set") - } - if params.ThresholdPriority != nil && params.ThresholdPriorityClassName != "" { - return fmt.Errorf("only one of thresholdPriority and thresholdPriorityClassName can be set") - } - - return nil -} - -type podOwner struct { - namespace, kind, name string - imagesHash string -} +const PluginName = "RemoveDuplicatePods" // RemoveDuplicatePods removes the duplicate pods on node. This strategy evicts all duplicate pods on node. // A pod is said to be a duplicate of other if both of them are from same creator, kind and are within the same // namespace, and have at least one container with the same image. // As of now, this strategy won't evict daemonsets, mirror pods, critical pods and pods with local storages. -func RemoveDuplicatePods( - ctx context.Context, - client clientset.Interface, - strategy api.DeschedulerStrategy, - nodes []*v1.Node, - podEvictor *evictions.PodEvictor, - getPodsAssignedToNode podutil.GetPodsAssignedToNodeFunc, -) { - if err := validateRemoveDuplicatePodsParams(strategy.Params); err != nil { - klog.ErrorS(err, "Invalid RemoveDuplicatePods parameters") - return - } - thresholdPriority, err := utils.GetPriorityFromStrategyParams(ctx, client, strategy.Params) - if err != nil { - klog.ErrorS(err, "Failed to get threshold priority from strategy's params") - return +type RemoveDuplicatePods struct { + handle framework.Handle + args *framework.RemoveDuplicatePodsArg + excludeOwnerKinds []string + podFilter podutil.FilterFunc +} + +var _ framework.Plugin = &RemoveDuplicatePods{} +var _ framework.DeschedulePlugin = &RemoveDuplicatePods{} + +func New(args runtime.Object, handle framework.Handle) (framework.Plugin, error) { + duplicatesArg, ok := args.(*framework.RemoveDuplicatePodsArg) + if !ok { + return nil, fmt.Errorf("want args to be of type RemoveDuplicatePodsArg, got %T", args) } - var includedNamespaces, excludedNamespaces sets.String - if strategy.Params != nil && strategy.Params.Namespaces != nil { - includedNamespaces = sets.NewString(strategy.Params.Namespaces.Include...) - excludedNamespaces = sets.NewString(strategy.Params.Namespaces.Exclude...) + // At most one of include/exclude can be set + if duplicatesArg.Namespaces != nil && len(duplicatesArg.Namespaces.Include) > 0 && len(duplicatesArg.Namespaces.Exclude) > 0 { + return nil, fmt.Errorf("only one of Include/Exclude namespaces can be set") + } + if duplicatesArg.PriorityThreshold != nil && duplicatesArg.PriorityThreshold.Value != nil && duplicatesArg.PriorityThreshold.Name != "" { + return nil, fmt.Errorf("only one of priorityThreshold fields can be set") } - nodeFit := false - if strategy.Params != nil { - nodeFit = strategy.Params.NodeFit + thresholdPriority, err := utils.GetPriorityValueFromPriorityThreshold(context.TODO(), handle.ClientSet(), duplicatesArg.PriorityThreshold) + if err != nil { + return nil, fmt.Errorf("failed to get priority threshold: %v", err) } - evictable := podEvictor.Evictable(evictions.WithPriorityThreshold(thresholdPriority), evictions.WithNodeFit(nodeFit)) + evictable := handle.PodEvictor().Evictable( + evictions.WithPriorityThreshold(thresholdPriority), + evictions.WithNodeFit(duplicatesArg.NodeFit), + ) - duplicatePods := make(map[podOwner]map[string][]*v1.Pod) - ownerKeyOccurence := make(map[podOwner]int32) - nodeCount := 0 - nodeMap := make(map[string]*v1.Node) + var includedNamespaces, excludedNamespaces sets.String + if duplicatesArg.Namespaces != nil { + includedNamespaces = sets.NewString(duplicatesArg.Namespaces.Include...) + excludedNamespaces = sets.NewString(duplicatesArg.Namespaces.Exclude...) + } podFilter, err := podutil.NewOptions(). WithFilter(evictable.IsEvictable). @@ -102,13 +88,34 @@ func RemoveDuplicatePods( WithoutNamespaces(excludedNamespaces). BuildFilterFunc() if err != nil { - klog.ErrorS(err, "Error initializing pod filter function") - return + return nil, fmt.Errorf("error initializing pod filter function: %v", err) } + return &RemoveDuplicatePods{ + handle: handle, + excludeOwnerKinds: duplicatesArg.ExcludeOwnerKinds, + podFilter: podFilter, + }, nil +} + +func (d *RemoveDuplicatePods) Name() string { + return PluginName +} + +type podOwner struct { + namespace, kind, name string + imagesHash string +} + +func (d *RemoveDuplicatePods) Deschedule(ctx context.Context, nodes []*v1.Node) *framework.Status { + duplicatePods := make(map[podOwner]map[string][]*v1.Pod) + ownerKeyOccurence := make(map[podOwner]int32) + nodeCount := 0 + nodeMap := make(map[string]*v1.Node) + for _, node := range nodes { klog.V(1).InfoS("Processing node", "node", klog.KObj(node)) - pods, err := podutil.ListPodsOnANode(node.Name, getPodsAssignedToNode, podFilter) + pods, err := podutil.ListPodsOnANode(node.Name, d.handle.GetPodsAssignedToNodeFunc(), d.podFilter) if err != nil { klog.ErrorS(err, "Error listing evictable pods on node", "node", klog.KObj(node)) continue @@ -131,7 +138,7 @@ func RemoveDuplicatePods( duplicateKeysMap := map[string][][]string{} for _, pod := range pods { ownerRefList := podutil.OwnerRef(pod) - if hasExcludedOwnerRefKind(ownerRefList, strategy) || len(ownerRefList) == 0 { + if len(ownerRefList) == 0 || hasExcludedOwnerRefKind(ownerRefList, d.excludeOwnerKinds) { continue } podContainerKeys := make([]string, 0, len(ownerRefList)*len(pod.Spec.Containers)) @@ -210,7 +217,7 @@ func RemoveDuplicatePods( // It's assumed all duplicated pods are in the same priority class // TODO(jchaloup): check if the pod has a different node to lend to for _, pod := range pods[upperAvg-1:] { - if _, err := podEvictor.EvictPod(ctx, pod, nodeMap[nodeName], "RemoveDuplicatePods"); err != nil { + if _, err := d.handle.PodEvictor().EvictPod(ctx, pod, nodeMap[nodeName], "RemoveDuplicatePods"); err != nil { klog.ErrorS(err, "Error evicting pod", "pod", klog.KObj(pod)) break } @@ -218,6 +225,8 @@ func RemoveDuplicatePods( } } } + + return nil } func getNodeAffinityNodeSelector(pod *v1.Pod) *v1.NodeSelector { @@ -287,11 +296,12 @@ func getTargetNodes(podNodes map[string][]*v1.Pod, nodes []*v1.Node) []*v1.Node return targetNodes } -func hasExcludedOwnerRefKind(ownerRefs []metav1.OwnerReference, strategy api.DeschedulerStrategy) bool { - if strategy.Params == nil || strategy.Params.RemoveDuplicates == nil { +func hasExcludedOwnerRefKind(ownerRefs []metav1.OwnerReference, ExcludeOwnerKinds []string) bool { + if len(ExcludeOwnerKinds) == 0 { return false } - exclude := sets.NewString(strategy.Params.RemoveDuplicates.ExcludeOwnerKinds...) + + exclude := sets.NewString(ExcludeOwnerKinds...) for _, owner := range ownerRefs { if exclude.Has(owner.Kind) { return true diff --git a/pkg/descheduler/strategies/duplicates_test.go b/pkg/framework/plugins/removeduplicatepods/removeduplicatepods_test.go similarity index 93% rename from pkg/descheduler/strategies/duplicates_test.go rename to pkg/framework/plugins/removeduplicatepods/removeduplicatepods_test.go index f2b6467d30..bb62e631fb 100644 --- a/pkg/descheduler/strategies/duplicates_test.go +++ b/pkg/framework/plugins/removeduplicatepods/removeduplicatepods_test.go @@ -14,7 +14,7 @@ See the License for the specific language governing permissions and limitations under the License. */ -package strategies +package removeduplicatepods import ( "context" @@ -26,11 +26,13 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/client-go/informers" + clientset "k8s.io/client-go/kubernetes" "k8s.io/client-go/kubernetes/fake" "sigs.k8s.io/descheduler/pkg/api" "sigs.k8s.io/descheduler/pkg/descheduler/evictions" podutil "sigs.k8s.io/descheduler/pkg/descheduler/pod" + "sigs.k8s.io/descheduler/pkg/framework" "sigs.k8s.io/descheduler/pkg/utils" "sigs.k8s.io/descheduler/test" ) @@ -44,6 +46,22 @@ func buildTestPodWithImage(podName, node, image string) *v1.Pod { return pod } +type frameworkHandle struct { + clientset clientset.Interface + podEvictor *evictions.PodEvictor + getPodsAssignedToNodeFunc podutil.GetPodsAssignedToNodeFunc +} + +func (f frameworkHandle) ClientSet() clientset.Interface { + return f.clientset +} +func (f frameworkHandle) PodEvictor() *evictions.PodEvictor { + return f.podEvictor +} +func (f frameworkHandle) GetPodsAssignedToNodeFunc() podutil.GetPodsAssignedToNodeFunc { + return f.getPodsAssignedToNodeFunc +} + func TestFindDuplicatePods(t *testing.T) { // first setup pods node1 := test.BuildTestNode("n1", 2000, 3000, 10, nil) @@ -177,91 +195,85 @@ func TestFindDuplicatePods(t *testing.T) { pods []*v1.Pod nodes []*v1.Node expectedEvictedPodCount uint - strategy api.DeschedulerStrategy + + nodeFit bool + excludeOwnerKinds []string }{ { description: "Three pods in the `dev` Namespace, bound to same ReplicaSet. 1 should be evicted.", pods: []*v1.Pod{p1, p2, p3}, nodes: []*v1.Node{node1, node2}, expectedEvictedPodCount: 1, - strategy: api.DeschedulerStrategy{}, }, { description: "Three pods in the `dev` Namespace, bound to same ReplicaSet, but ReplicaSet kind is excluded. 0 should be evicted.", pods: []*v1.Pod{p1, p2, p3}, nodes: []*v1.Node{node1, node2}, expectedEvictedPodCount: 0, - strategy: api.DeschedulerStrategy{Params: &api.StrategyParameters{RemoveDuplicates: &api.RemoveDuplicates{ExcludeOwnerKinds: []string{"ReplicaSet"}}}}, + excludeOwnerKinds: []string{"ReplicaSet"}, }, { description: "Three Pods in the `test` Namespace, bound to same ReplicaSet. 1 should be evicted.", pods: []*v1.Pod{p8, p9, p10}, nodes: []*v1.Node{node1, node2}, expectedEvictedPodCount: 1, - strategy: api.DeschedulerStrategy{}, }, { description: "Three Pods in the `dev` Namespace, three Pods in the `test` Namespace. Bound to ReplicaSet with same name. 4 should be evicted.", pods: []*v1.Pod{p1, p2, p3, p8, p9, p10}, nodes: []*v1.Node{node1, node2}, expectedEvictedPodCount: 2, - strategy: api.DeschedulerStrategy{}, }, { description: "Pods are: part of DaemonSet, with local storage, mirror pod annotation, critical pod annotation - none should be evicted.", pods: []*v1.Pod{p4, p5, p6, p7}, nodes: []*v1.Node{node1, node2}, expectedEvictedPodCount: 0, - strategy: api.DeschedulerStrategy{}, }, { description: "Test all Pods: 4 should be evicted.", pods: []*v1.Pod{p1, p2, p3, p4, p5, p6, p7, p8, p9, p10}, nodes: []*v1.Node{node1, node2}, expectedEvictedPodCount: 2, - strategy: api.DeschedulerStrategy{}, }, { description: "Pods with the same owner but different images should not be evicted", pods: []*v1.Pod{p11, p12}, nodes: []*v1.Node{node1, node2}, expectedEvictedPodCount: 0, - strategy: api.DeschedulerStrategy{}, }, { description: "Pods with multiple containers should not match themselves", pods: []*v1.Pod{p13}, nodes: []*v1.Node{node1, node2}, expectedEvictedPodCount: 0, - strategy: api.DeschedulerStrategy{}, }, { description: "Pods with matching ownerrefs and at not all matching image should not trigger an eviction", pods: []*v1.Pod{p11, p13}, nodes: []*v1.Node{node1, node2}, expectedEvictedPodCount: 0, - strategy: api.DeschedulerStrategy{}, }, { description: "Three pods in the `dev` Namespace, bound to same ReplicaSet. Only node available has a taint, and nodeFit set to true. 0 should be evicted.", pods: []*v1.Pod{p1, p2, p3}, nodes: []*v1.Node{node1, node3}, expectedEvictedPodCount: 0, - strategy: api.DeschedulerStrategy{Params: &api.StrategyParameters{NodeFit: true}}, + nodeFit: true, }, { description: "Three pods in the `node-fit` Namespace, bound to same ReplicaSet, all with a nodeSelector. Only node available has an incorrect node label, and nodeFit set to true. 0 should be evicted.", pods: []*v1.Pod{p15, p16, p17}, nodes: []*v1.Node{node1, node4}, expectedEvictedPodCount: 0, - strategy: api.DeschedulerStrategy{Params: &api.StrategyParameters{NodeFit: true}}, + nodeFit: true, }, { description: "Three pods in the `node-fit` Namespace, bound to same ReplicaSet. Only node available is not schedulable, and nodeFit set to true. 0 should be evicted.", pods: []*v1.Pod{p1, p2, p3}, nodes: []*v1.Node{node1, node5}, expectedEvictedPodCount: 0, - strategy: api.DeschedulerStrategy{Params: &api.StrategyParameters{NodeFit: true}}, + nodeFit: true, }, } @@ -304,7 +316,23 @@ func TestFindDuplicatePods(t *testing.T) { false, ) - RemoveDuplicatePods(ctx, fakeClient, testCase.strategy, testCase.nodes, podEvictor, getPodsAssignedToNode) + plugin, err := New(&framework.RemoveDuplicatePodsArg{ + // Namespaces *api.Namespaces + // PriorityThreshold *api.PriorityThreshold + NodeFit: testCase.nodeFit, + ExcludeOwnerKinds: testCase.excludeOwnerKinds, + }, + frameworkHandle{ + clientset: fakeClient, + podEvictor: podEvictor, + getPodsAssignedToNodeFunc: getPodsAssignedToNode, + }, + ) + if err != nil { + t.Fatalf("Unable to initialize the plugin: %v", err) + } + + plugin.(interface{}).(framework.DeschedulePlugin).Deschedule(ctx, testCase.nodes) podsEvicted := podEvictor.TotalEvicted() if podsEvicted != testCase.expectedEvictedPodCount { t.Errorf("Test error for description: %s. Expected evicted pods count %v, got %v", testCase.description, testCase.expectedEvictedPodCount, podsEvicted) @@ -731,7 +759,18 @@ func TestRemoveDuplicatesUniformly(t *testing.T) { false, ) - RemoveDuplicatePods(ctx, fakeClient, testCase.strategy, testCase.nodes, podEvictor, getPodsAssignedToNode) + plugin, err := New(&framework.RemoveDuplicatePodsArg{}, + frameworkHandle{ + clientset: fakeClient, + podEvictor: podEvictor, + getPodsAssignedToNodeFunc: getPodsAssignedToNode, + }, + ) + if err != nil { + t.Fatalf("Unable to initialize the plugin: %v", err) + } + + plugin.(interface{}).(framework.DeschedulePlugin).Deschedule(ctx, testCase.nodes) podsEvicted := podEvictor.TotalEvicted() if podsEvicted != testCase.expectedEvictedPodCount { t.Errorf("Test error for description: %s. Expected evicted pods count %v, got %v", testCase.description, testCase.expectedEvictedPodCount, podsEvicted) diff --git a/pkg/framework/types.go b/pkg/framework/types.go new file mode 100644 index 0000000000..c3aedd6915 --- /dev/null +++ b/pkg/framework/types.go @@ -0,0 +1,50 @@ +package framework + +import ( + "context" + + v1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + clientset "k8s.io/client-go/kubernetes" + + "sigs.k8s.io/descheduler/pkg/api" + "sigs.k8s.io/descheduler/pkg/descheduler/evictions" + podutil "sigs.k8s.io/descheduler/pkg/descheduler/pod" +) + +type Handle interface { + // ClientSet returns a kubernetes clientSet. + ClientSet() clientset.Interface + + PodEvictor() *evictions.PodEvictor + GetPodsAssignedToNodeFunc() podutil.GetPodsAssignedToNodeFunc +} + +type Status struct { + err error +} + +// Plugin is the parent type for all the descheduling framework plugins. +type Plugin interface { + Name() string +} + +type DeschedulePlugin interface { + Deschedule(ctx context.Context, nodes []*v1.Node) *Status +} + +// RemoveDuplicatePodsArg holds arguments used to configure the RemoveDuplicatePods plugin. +type RemoveDuplicatePodsArg struct { + metav1.TypeMeta + + Namespaces *api.Namespaces + PriorityThreshold *api.PriorityThreshold + NodeFit bool + ExcludeOwnerKinds []string +} + +// TODO(jchaloup): have this go generated +func (in *RemoveDuplicatePodsArg) DeepCopyObject() runtime.Object { + return nil +} diff --git a/pkg/utils/priority.go b/pkg/utils/priority.go index 709273dfcf..058de5ea87 100644 --- a/pkg/utils/priority.go +++ b/pkg/utils/priority.go @@ -72,3 +72,23 @@ func GetPriorityFromStrategyParams(ctx context.Context, client clientset.Interfa } return } + +// GetPriorityValueFromPriorityThreshold gets priority from the given PriorityThreshold. +// It will return SystemCriticalPriority by default. +func GetPriorityValueFromPriorityThreshold(ctx context.Context, client clientset.Interface, priorityThreshold *api.PriorityThreshold) (priority int32, err error) { + if priorityThreshold == nil { + return SystemCriticalPriority, nil + } + if priorityThreshold.Value != nil { + priority = *priorityThreshold.Value + } else { + priority, err = GetPriorityFromPriorityClass(ctx, client, priorityThreshold.Name) + if err != nil { + return 0, fmt.Errorf("unable to get priority value from the priority class: %v", err) + } + } + if priority > SystemCriticalPriority { + return 0, fmt.Errorf("priority threshold can't be greater than %d", SystemCriticalPriority) + } + return +}