diff --git a/pkg/scheduler/framework/plugins/clustereviction/cluster_eviction.go b/pkg/scheduler/framework/plugins/clustereviction/cluster_eviction.go
new file mode 100644
index 000000000000..db891b37c34a
--- /dev/null
+++ b/pkg/scheduler/framework/plugins/clustereviction/cluster_eviction.go
@@ -0,0 +1,42 @@
+package clustereviction
+
+import (
+	"context"
+
+	"k8s.io/klog/v2"
+
+	clusterv1alpha1 "github.com/karmada-io/karmada/pkg/apis/cluster/v1alpha1"
+	workv1alpha2 "github.com/karmada-io/karmada/pkg/apis/work/v1alpha2"
+	"github.com/karmada-io/karmada/pkg/scheduler/framework"
+	"github.com/karmada-io/karmada/pkg/util/helper"
+)
+
+const (
+	// Name is the name of the plugin used in the plugin registry and configurations.
+	Name = "ClusterEviction"
+)
+
+// ClusterEviction is a plugin that checks if the target cluster is in the GracefulEvictionTasks, which means it has been evicted.
+type ClusterEviction struct{}
+
+var _ framework.FilterPlugin = &ClusterEviction{}
+
+// New instantiates the ClusterEviction plugin.
+func New() (framework.Plugin, error) {
+	return &ClusterEviction{}, nil
+}
+
+// Name returns the plugin name.
+func (p *ClusterEviction) Name() string {
+	return Name
+}
+
+// Filter checks if the target cluster is in the GracefulEvictionTasks, which means it has been evicted.
+func (p *ClusterEviction) Filter(_ context.Context, bindingSpec *workv1alpha2.ResourceBindingSpec, _ *workv1alpha2.ResourceBindingStatus, cluster *clusterv1alpha1.Cluster) *framework.Result {
+	if helper.CheckIfClusterEvicted(bindingSpec.GracefulEvictionTasks, cluster.Name) {
+		klog.V(2).Infof("Cluster(%s) has been evicted before.", cluster.Name)
+		return framework.NewResult(framework.Unschedulable, "cluster(s) have been evicted before")
+	}
+
+	return framework.NewResult(framework.Success)
+}
diff --git a/pkg/scheduler/framework/plugins/registry.go b/pkg/scheduler/framework/plugins/registry.go
index 21ae4377a760..1de83dc724ac 100644
--- a/pkg/scheduler/framework/plugins/registry.go
+++ b/pkg/scheduler/framework/plugins/registry.go
@@ -3,6 +3,7 @@ package plugins
 import (
 	"github.com/karmada-io/karmada/pkg/scheduler/framework/plugins/apienablement"
 	"github.com/karmada-io/karmada/pkg/scheduler/framework/plugins/clusteraffinity"
+	"github.com/karmada-io/karmada/pkg/scheduler/framework/plugins/clustereviction"
 	"github.com/karmada-io/karmada/pkg/scheduler/framework/plugins/clusterlocality"
 	"github.com/karmada-io/karmada/pkg/scheduler/framework/plugins/spreadconstraint"
 	"github.com/karmada-io/karmada/pkg/scheduler/framework/plugins/tainttoleration"
@@ -17,5 +18,6 @@
 		clusteraffinity.Name:  clusteraffinity.New,
 		spreadconstraint.Name: spreadconstraint.New,
 		clusterlocality.Name:  clusterlocality.New,
+		clustereviction.Name:  clustereviction.New,
 	}
 }
diff --git a/pkg/util/helper/cluster.go b/pkg/util/helper/cluster.go
index 229c140c1543..7f2dc4be8ee3 100644
--- a/pkg/util/helper/cluster.go
+++ b/pkg/util/helper/cluster.go
@@ -2,6 +2,7 @@ package helper
 
 import (
 	clusterv1alpha1 "github.com/karmada-io/karmada/pkg/apis/cluster/v1alpha1"
+	workv1alpha2 "github.com/karmada-io/karmada/pkg/apis/work/v1alpha2"
 )
 
 // IsAPIEnabled checks if target API (or CRD) referencing by groupVersion and kind has been installed.
@@ -21,3 +22,14 @@
 
 	return false
 }
+
+// CheckIfClusterEvicted checks if the target cluster is in the GracefulEvictionTasks, which means it has been evicted.
+func CheckIfClusterEvicted(gracefulEvictionTasks []workv1alpha2.GracefulEvictionTask, clusterName string) bool {
+	for _, task := range gracefulEvictionTasks {
+		if task.FromCluster == clusterName {
+			return true
+		}
+	}
+
+	return false
+}
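Reviewer note (not part of the change set): the sketch below shows how the plugin and the helper fit together once registered. A cluster named in the binding's GracefulEvictionTasks is rejected by Filter, while any other cluster passes. The member names are made up, and `Result.IsSuccess` is assumed from the scheduler framework's `Result` type.

```go
package main

import (
	"context"
	"fmt"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

	clusterv1alpha1 "github.com/karmada-io/karmada/pkg/apis/cluster/v1alpha1"
	workv1alpha2 "github.com/karmada-io/karmada/pkg/apis/work/v1alpha2"
	"github.com/karmada-io/karmada/pkg/scheduler/framework"
	"github.com/karmada-io/karmada/pkg/scheduler/framework/plugins/clustereviction"
)

func main() {
	// A binding whose workload has already been evicted from member1
	// (cluster names here are illustrative only).
	spec := &workv1alpha2.ResourceBindingSpec{
		GracefulEvictionTasks: []workv1alpha2.GracefulEvictionTask{
			{FromCluster: "member1"},
		},
	}

	plugin, _ := clustereviction.New()
	filter := plugin.(framework.FilterPlugin)

	for _, name := range []string{"member1", "member2"} {
		cluster := &clusterv1alpha1.Cluster{ObjectMeta: metav1.ObjectMeta{Name: name}}
		result := filter.Filter(context.TODO(), spec, nil, cluster)
		// member1 prints false (Unschedulable); member2 prints true.
		fmt.Printf("%s schedulable: %v\n", name, result.IsSuccess())
	}
}
```

Keeping the membership check in pkg/util/helper lets the plugin and the unit test below share a single implementation.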
diff --git a/pkg/util/helper/cluster_test.go b/pkg/util/helper/cluster_test.go
index badef95a38d3..d2f328cd44c4 100644
--- a/pkg/util/helper/cluster_test.go
+++ b/pkg/util/helper/cluster_test.go
@@ -4,6 +4,7 @@ import (
 	"testing"
 
 	clusterv1alpha1 "github.com/karmada-io/karmada/pkg/apis/cluster/v1alpha1"
+	workv1alpha2 "github.com/karmada-io/karmada/pkg/apis/work/v1alpha2"
 )
 
 func TestIsAPIEnabled(t *testing.T) {
@@ -72,3 +73,48 @@
 		})
 	}
 }
+
+func TestCheckIfClusterEvicted(t *testing.T) {
+	gracefulEvictionTasks := []workv1alpha2.GracefulEvictionTask{
+		{
+			FromCluster: ClusterMember1,
+			Producer:    workv1alpha2.EvictionProducerTaintManager,
+			Reason:      workv1alpha2.EvictionReasonTaintUntolerated,
+		},
+		{
+			FromCluster: ClusterMember2,
+			Producer:    workv1alpha2.EvictionProducerTaintManager,
+			Reason:      workv1alpha2.EvictionReasonTaintUntolerated,
+		},
+	}
+
+	tests := []struct {
+		name                  string
+		gracefulEvictionTasks []workv1alpha2.GracefulEvictionTask
+		targetCluster         string
+		expect                bool
+	}{
+		{
+			name:                  "targetCluster has been evicted",
+			gracefulEvictionTasks: gracefulEvictionTasks,
+			targetCluster:         ClusterMember1,
+			expect:                true,
+		},
+		{
+			name:                  "targetCluster has not been evicted",
+			gracefulEvictionTasks: gracefulEvictionTasks,
+			targetCluster:         ClusterMember3,
+			expect:                false,
+		},
+	}
+
+	for _, test := range tests {
+		tc := test
+		t.Run(tc.name, func(t *testing.T) {
+			result := CheckIfClusterEvicted(tc.gracefulEvictionTasks, tc.targetCluster)
+			if result != tc.expect {
+				t.Errorf("expected: %v, but got: %v", tc.expect, result)
+			}
+		})
+	}
+}
diff --git a/test/e2e/failover_test.go b/test/e2e/failover_test.go
index fcde5ed6debc..d0a04eb29527 100644
--- a/test/e2e/failover_test.go
+++ b/test/e2e/failover_test.go
@@ -13,6 +13,7 @@
 	"k8s.io/apimachinery/pkg/util/rand"
 	"k8s.io/apimachinery/pkg/util/wait"
 	"k8s.io/klog/v2"
+	"k8s.io/utils/pointer"
 	"sigs.k8s.io/controller-runtime/pkg/client"
 
 	clusterv1alpha1 "github.com/karmada-io/karmada/pkg/apis/cluster/v1alpha1"
@@ -141,6 +142,106 @@ var _ = framework.SerialDescribe("failover testing", func() {
 			})
 		})
 	})
+
+	ginkgo.Context("Taint cluster testing", func() {
+		var policyNamespace, policyName string
+		var deploymentNamespace, deploymentName string
+		var deployment *appsv1.Deployment
+		var taint corev1.Taint
+		var maxGroups, minGroups, numOfFailedClusters int
+		var policy *policyv1alpha1.PropagationPolicy
+		maxGroups = 1
+		minGroups = 1
+		numOfFailedClusters = 1
+
+		ginkgo.BeforeEach(func() {
+			policyNamespace = testNamespace
+			policyName = deploymentNamePrefix + rand.String(RandomStrLength)
+			deploymentNamespace = testNamespace
+			deploymentName = policyName
+			deployment = testhelper.NewDeployment(deploymentNamespace, deploymentName)
+
+			policy = testhelper.NewPropagationPolicy(policyNamespace, policyName, []policyv1alpha1.ResourceSelector{
+				{
+					APIVersion: deployment.APIVersion,
+					Kind:       deployment.Kind,
+					Name:       deployment.Name,
+				},
+			}, policyv1alpha1.Placement{
+				ClusterAffinity: &policyv1alpha1.ClusterAffinity{
+					ClusterNames: framework.ClusterNames(),
+				},
+				ClusterTolerations: []corev1.Toleration{
+					{
+						Key:               "fail-test",
+						Effect:            corev1.TaintEffectNoExecute,
+						Operator:          corev1.TolerationOpExists,
+						TolerationSeconds: pointer.Int64(3),
+					},
+				},
+				SpreadConstraints: []policyv1alpha1.SpreadConstraint{
+					{
+						SpreadByField: policyv1alpha1.SpreadByFieldCluster,
+						MaxGroups:     maxGroups,
+						MinGroups:     minGroups,
+					},
+				},
+			})
+
+			taint = corev1.Taint{
+				Key:    "fail-test",
+				Effect: corev1.TaintEffectNoExecute,
+			}
+		})
+
+		ginkgo.BeforeEach(func() {
+			framework.CreatePropagationPolicy(karmadaClient, policy)
+			framework.CreateDeployment(kubeClient, deployment)
+			ginkgo.DeferCleanup(func() {
+				framework.RemoveDeployment(kubeClient, deployment.Namespace, deployment.Name)
+				framework.RemovePropagationPolicy(karmadaClient, policy.Namespace, policy.Name)
+			})
+		})
+
+		ginkgo.It("taint cluster", func() {
+			var disabledClusters []string
+			targetClusterNames := framework.ExtractTargetClustersFrom(controlPlaneClient, deployment)
+			ginkgo.By("taint one cluster", func() {
+				temp := numOfFailedClusters
+				for _, targetClusterName := range targetClusterNames {
+					if temp > 0 {
+						klog.Infof("Tainting cluster(%s).", targetClusterName)
+						err := taintCluster(controlPlaneClient, targetClusterName, taint)
+						gomega.Expect(err).ShouldNot(gomega.HaveOccurred())
+						disabledClusters = append(disabledClusters, targetClusterName)
+						temp--
+					}
+				}
+			})
+
+			ginkgo.By("check whether the deployment is rescheduled from the failed cluster to another available cluster", func() {
+				gomega.Eventually(func() int {
+					targetClusterNames = framework.ExtractTargetClustersFrom(controlPlaneClient, deployment)
+					for _, targetClusterName := range targetClusterNames {
+						// the target cluster should be replaced by another available cluster
+						if !testhelper.IsExclude(targetClusterName, disabledClusters) {
+							return 0
+						}
+					}
+
+					return len(targetClusterNames)
+				}, pollTimeout, pollInterval).Should(gomega.Equal(minGroups))
+			})
+
+			ginkgo.By("recover the tainted cluster", func() {
+				for _, disabledCluster := range disabledClusters {
+					fmt.Printf("cluster %s is waiting for recovery\n", disabledCluster)
+					err := recoverTaintedCluster(controlPlaneClient, disabledCluster, taint)
+					gomega.Expect(err).ShouldNot(gomega.HaveOccurred())
+				}
+			})
+		})
+	})
 })
 
 // disableCluster will set wrong API endpoint of current cluster
@@ -167,6 +268,49 @@ func disableCluster(c client.Client, clusterName string) error {
 	return err
 }
 
+// taintCluster adds the given taint to the target cluster.
+func taintCluster(c client.Client, clusterName string, taint corev1.Taint) error {
+	err := wait.PollImmediate(pollInterval, pollTimeout, func() (done bool, err error) {
+		clusterObj := &clusterv1alpha1.Cluster{}
+		if err := c.Get(context.TODO(), client.ObjectKey{Name: clusterName}, clusterObj); err != nil {
+			if apierrors.IsConflict(err) {
+				return false, nil
+			}
+			return false, err
+		}
+		clusterObj.Spec.Taints = append(clusterObj.Spec.Taints, taint)
+		if err := c.Update(context.TODO(), clusterObj); err != nil {
+			if apierrors.IsConflict(err) {
+				return false, nil
+			}
+			return false, err
+		}
+		return true, nil
+	})
+	return err
+}
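Reviewer note: taintCluster above and recoverTaintedCluster below both hand-roll conflict retries inside wait.PollImmediate. A more compact variant (a sketch only, not part of this change) could lean on client-go's retry.RetryOnConflict, which re-runs the closure only when the error is a resource-version conflict, the same condition the apierrors.IsConflict checks guard against. The function name taintClusterWithRetry is hypothetical.

```go
package e2e

import (
	"context"

	corev1 "k8s.io/api/core/v1"
	"k8s.io/client-go/util/retry"
	"sigs.k8s.io/controller-runtime/pkg/client"

	clusterv1alpha1 "github.com/karmada-io/karmada/pkg/apis/cluster/v1alpha1"
)

// taintClusterWithRetry is a hypothetical alternative to taintCluster:
// RetryOnConflict re-runs the closure only on a resource-version conflict,
// so the Get/mutate/Update sequence always starts from a fresh copy of the
// Cluster object.
func taintClusterWithRetry(c client.Client, clusterName string, taint corev1.Taint) error {
	return retry.RetryOnConflict(retry.DefaultRetry, func() error {
		clusterObj := &clusterv1alpha1.Cluster{}
		if err := c.Get(context.TODO(), client.ObjectKey{Name: clusterName}, clusterObj); err != nil {
			return err
		}
		clusterObj.Spec.Taints = append(clusterObj.Spec.Taints, taint)
		return c.Update(context.TODO(), clusterObj)
	})
}
```

The polled version in the diff bounds the whole operation by pollTimeout, while retry.DefaultRetry caps the number of attempts instead; either is a reasonable trade-off for a test helper.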
+
+// recoverTaintedCluster removes the given taint from the tainted cluster.
+func recoverTaintedCluster(c client.Client, clusterName string, taint corev1.Taint) error {
+	err := wait.PollImmediate(pollInterval, pollTimeout, func() (done bool, err error) {
+		clusterObj := &clusterv1alpha1.Cluster{}
+		if err := c.Get(context.TODO(), client.ObjectKey{Name: clusterName}, clusterObj); err != nil {
+			if apierrors.IsConflict(err) {
+				return false, nil
+			}
+			return false, err
+		}
+		if err := helper.UpdateClusterControllerTaint(context.TODO(), c, nil, []*corev1.Taint{&taint}, clusterObj); err != nil {
+			if apierrors.IsConflict(err) {
+				return false, nil
+			}
+			return false, err
+		}
+		return true, nil
+	})
+	return err
+}
+
 // recoverCluster will recover API endpoint of the disable cluster
 func recoverCluster(c client.Client, clusterName string, originalAPIEndpoint string) error {
 	err := wait.PollImmediate(pollInterval, pollTimeout, func() (done bool, err error) {