Add a clusterEviction plugin #3469

Merged
1 commit merged on Apr 28, 2023
Changes from all commits
@@ -0,0 +1,42 @@
package clustereviction

import (
    "context"

    "k8s.io/klog/v2"

    clusterv1alpha1 "github.com/karmada-io/karmada/pkg/apis/cluster/v1alpha1"
    workv1alpha2 "github.com/karmada-io/karmada/pkg/apis/work/v1alpha2"
    "github.com/karmada-io/karmada/pkg/scheduler/framework"
    "github.com/karmada-io/karmada/pkg/util/helper"
)

const (
    // Name is the name of the plugin used in the plugin registry and configurations.
    Name = "ClusterEviction"
)

// ClusterEviction is a plugin that checks if the target cluster is in the GracefulEvictionTasks which means it is in the process of eviction.
type ClusterEviction struct{}

var _ framework.FilterPlugin = &ClusterEviction{}

// New instantiates the ClusterEviction plugin.
func New() (framework.Plugin, error) {
    return &ClusterEviction{}, nil
}

// Name returns the plugin name.
func (p *ClusterEviction) Name() string {
    return Name
}

// Filter checks if the target cluster is in the GracefulEvictionTasks which means it is in the process of eviction.
func (p *ClusterEviction) Filter(_ context.Context, bindingSpec *workv1alpha2.ResourceBindingSpec, _ *workv1alpha2.ResourceBindingStatus, cluster *clusterv1alpha1.Cluster) *framework.Result {

Member:
Does this plugin only take effect when the Application Failover feature is enabled?

Member Author:

Not only application failover; it also takes effect for the existing cluster failover. See the comments in #3456 (comment).

Member:

Does this conflict with the TaintToleration plugin?

Member:

I don't think there will be any conflict. The plugins will execute sequentially, ensuring that the final result does not include the cluster in eviction tasks, as intended.

Member:

ok.

    if helper.ClusterInGracefulEvictionTasks(bindingSpec.GracefulEvictionTasks, cluster.Name) {
        klog.V(2).Infof("Cluster(%s) is in the process of eviction.", cluster.Name)
        return framework.NewResult(framework.Unschedulable, "cluster(s) is in the process of eviction")
    }

    return framework.NewResult(framework.Success)
}
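
On the question above about conflicting with the TaintToleration plugin: filter plugins compose rather than conflict, because the framework runs them one after another and a cluster survives only if every filter returns success. Below is a minimal, hypothetical sketch of that sequential filtering; the types and names (filterPlugin, feasibleClusters, and the toy filters) are simplified stand-ins for illustration, not Karmada's actual framework code.

package main

import "fmt"

// result is a simplified stand-in for a filter plugin's verdict.
type result struct {
    success bool
    reason  string
}

// filterPlugin is a simplified stand-in for a scheduler filter plugin.
type filterPlugin interface {
    Name() string
    Filter(cluster string) result
}

// evictionFilter mimics ClusterEviction: reject clusters that appear in graceful eviction tasks.
type evictionFilter struct{ evicting map[string]bool }

func (f evictionFilter) Name() string { return "ClusterEviction" }
func (f evictionFilter) Filter(cluster string) result {
    if f.evicting[cluster] {
        return result{false, "cluster is in the process of eviction"}
    }
    return result{true, ""}
}

// taintFilter mimics TaintToleration in spirit: reject clusters whose taints are not tolerated.
type taintFilter struct{ untolerated map[string]bool }

func (f taintFilter) Name() string { return "TaintToleration" }
func (f taintFilter) Filter(cluster string) result {
    if f.untolerated[cluster] {
        return result{false, "taint not tolerated"}
    }
    return result{true, ""}
}

// feasibleClusters runs every filter plugin in order; a cluster survives only if all plugins pass,
// so the plugins compose rather than conflict.
func feasibleClusters(clusters []string, plugins []filterPlugin) []string {
    var out []string
    for _, c := range clusters {
        ok := true
        for _, p := range plugins {
            if r := p.Filter(c); !r.success {
                fmt.Printf("%s filtered out %s: %s\n", p.Name(), c, r.reason)
                ok = false
                break
            }
        }
        if ok {
            out = append(out, c)
        }
    }
    return out
}

func main() {
    plugins := []filterPlugin{
        taintFilter{untolerated: map[string]bool{"member2": true}},
        evictionFilter{evicting: map[string]bool{"member1": true}},
    }
    // Only member3 passes both filters.
    fmt.Println(feasibleClusters([]string{"member1", "member2", "member3"}, plugins))
}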
2 changes: 2 additions & 0 deletions pkg/scheduler/framework/plugins/registry.go
@@ -3,6 +3,7 @@ package plugins
import (
    "github.com/karmada-io/karmada/pkg/scheduler/framework/plugins/apienablement"
    "github.com/karmada-io/karmada/pkg/scheduler/framework/plugins/clusteraffinity"
    "github.com/karmada-io/karmada/pkg/scheduler/framework/plugins/clustereviction"
    "github.com/karmada-io/karmada/pkg/scheduler/framework/plugins/clusterlocality"
    "github.com/karmada-io/karmada/pkg/scheduler/framework/plugins/spreadconstraint"
    "github.com/karmada-io/karmada/pkg/scheduler/framework/plugins/tainttoleration"
@@ -17,5 +18,6 @@ func NewInTreeRegistry() runtime.Registry {
        clusteraffinity.Name:  clusteraffinity.New,
        spreadconstraint.Name: spreadconstraint.New,
        clusterlocality.Name:  clusterlocality.New,
        clustereviction.Name:  clustereviction.New,
    }
}
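
As context for the registration above: NewInTreeRegistry maps each plugin name to its factory function (for example clustereviction.Name to clustereviction.New), and the scheduler builds plugin instances from that map. The following is a minimal, hypothetical sketch of consuming such a name-to-factory registry; the registry, plugin, and buildPlugins names are simplified stand-ins, not Karmada's actual runtime package.

package main

import "fmt"

// plugin and pluginFactory are simplified stand-ins for the framework's types.
type plugin interface{ Name() string }
type pluginFactory func() (plugin, error)

// registry maps a plugin name to the factory that builds it, mirroring the shape of NewInTreeRegistry's result.
type registry map[string]pluginFactory

// buildPlugins instantiates every registered plugin, failing fast on the first factory error.
func buildPlugins(r registry) ([]plugin, error) {
    var plugins []plugin
    for name, factory := range r {
        p, err := factory()
        if err != nil {
            return nil, fmt.Errorf("initializing plugin %q: %w", name, err)
        }
        plugins = append(plugins, p)
    }
    return plugins, nil
}

// noopPlugin is a trivial example plugin used only for this sketch.
type noopPlugin struct{ name string }

func (p noopPlugin) Name() string { return p.name }

func main() {
    r := registry{
        "ClusterEviction": func() (plugin, error) { return noopPlugin{"ClusterEviction"}, nil },
    }
    plugins, err := buildPlugins(r)
    if err != nil {
        panic(err)
    }
    for _, p := range plugins {
        fmt.Println("registered:", p.Name())
    }
}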
12 changes: 12 additions & 0 deletions pkg/util/helper/cluster.go
@@ -2,6 +2,7 @@ package helper

import (
    clusterv1alpha1 "github.com/karmada-io/karmada/pkg/apis/cluster/v1alpha1"
    workv1alpha2 "github.com/karmada-io/karmada/pkg/apis/work/v1alpha2"
)

// IsAPIEnabled checks if target API (or CRD) referencing by groupVersion and kind has been installed.
@@ -21,3 +22,14 @@ func IsAPIEnabled(APIEnablements []clusterv1alpha1.APIEnablement, groupVersion s

    return false
}

// ClusterInGracefulEvictionTasks checks if the target cluster is in the GracefulEvictionTasks which means it is in the process of eviction.
func ClusterInGracefulEvictionTasks(gracefulEvictionTasks []workv1alpha2.GracefulEvictionTask, clusterName string) bool {
    for _, task := range gracefulEvictionTasks {
        if task.FromCluster == clusterName {
            return true
        }
    }

    return false
}
46 changes: 46 additions & 0 deletions pkg/util/helper/cluster_test.go
@@ -4,6 +4,7 @@ import (
"testing"

clusterv1alpha1 "github.com/karmada-io/karmada/pkg/apis/cluster/v1alpha1"
workv1alpha2 "github.com/karmada-io/karmada/pkg/apis/work/v1alpha2"
)

func TestIsAPIEnabled(t *testing.T) {
@@ -72,3 +73,48 @@ func TestIsAPIEnabled(t *testing.T) {
        })
    }
}

func TestClusterInGracefulEvictionTasks(t *testing.T) {
    gracefulEvictionTasks := []workv1alpha2.GracefulEvictionTask{
        {
            FromCluster: ClusterMember1,
            Producer:    workv1alpha2.EvictionProducerTaintManager,
            Reason:      workv1alpha2.EvictionReasonTaintUntolerated,
        },
        {
            FromCluster: ClusterMember2,
            Producer:    workv1alpha2.EvictionProducerTaintManager,
            Reason:      workv1alpha2.EvictionReasonTaintUntolerated,
        },
    }

    tests := []struct {
        name                  string
        gracefulEvictionTasks []workv1alpha2.GracefulEvictionTask
        targetCluster         string
        expect                bool
    }{
        {
            name:                  "targetCluster is in the process of eviction",
            gracefulEvictionTasks: gracefulEvictionTasks,
            targetCluster:         ClusterMember1,
            expect:                true,
        },
        {
            name:                  "targetCluster is not in the process of eviction",
            gracefulEvictionTasks: gracefulEvictionTasks,
            targetCluster:         ClusterMember3,
            expect:                false,
        },
    }

    for _, test := range tests {
        tc := test
        t.Run(tc.name, func(t *testing.T) {
            result := ClusterInGracefulEvictionTasks(tc.gracefulEvictionTasks, tc.targetCluster)
            if result != tc.expect {
                t.Errorf("expected: %v, but got: %v", tc.expect, result)
            }
        })
    }
}
144 changes: 144 additions & 0 deletions test/e2e/failover_test.go
@@ -13,6 +13,7 @@ import (
"k8s.io/apimachinery/pkg/util/rand"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/klog/v2"
"k8s.io/utils/pointer"
"sigs.k8s.io/controller-runtime/pkg/client"

clusterv1alpha1 "github.com/karmada-io/karmada/pkg/apis/cluster/v1alpha1"
@@ -141,6 +142,106 @@ var _ = framework.SerialDescribe("failover testing", func() {
            })
        })
    })

ginkgo.Context("Taint cluster testing", func() {
var policyNamespace, policyName string
var deploymentNamespace, deploymentName string
var deployment *appsv1.Deployment
var taint corev1.Taint
var maxGroups, minGroups, numOfFailedClusters int
var policy *policyv1alpha1.PropagationPolicy
maxGroups = 1
Member:

maxGroups, minGroups, numOfFailedClusters := 1,1,1

Member Author:

/cc @XiShanYongYe-Chang. I followed the former code style. To be honest, I do not know which style is better.

Member:

It's just a matter of code style; either way is OK. Keeping the overall consistency would be nice.

        minGroups = 1
        numOfFailedClusters = 1

        ginkgo.BeforeEach(func() {
            policyNamespace = testNamespace
            policyName = deploymentNamePrefix + rand.String(RandomStrLength)
            deploymentNamespace = testNamespace
            deploymentName = policyName
            deployment = testhelper.NewDeployment(deploymentNamespace, deploymentName)

            policy = testhelper.NewPropagationPolicy(policyNamespace, policyName, []policyv1alpha1.ResourceSelector{
                {
                    APIVersion: deployment.APIVersion,
                    Kind:       deployment.Kind,
                    Name:       deployment.Name,
                },
            }, policyv1alpha1.Placement{
                ClusterAffinity: &policyv1alpha1.ClusterAffinity{
                    ClusterNames: framework.ClusterNames(),
                },
                ClusterTolerations: []corev1.Toleration{
                    {
                        Key:               "fail-test",
                        Effect:            corev1.TaintEffectNoExecute,
                        Operator:          corev1.TolerationOpExists,
                        TolerationSeconds: pointer.Int64(3),
                    },
                },
                SpreadConstraints: []policyv1alpha1.SpreadConstraint{
                    {
                        SpreadByField: policyv1alpha1.SpreadByFieldCluster,
                        MaxGroups:     maxGroups,
                        MinGroups:     minGroups,
                    },
                },
            })

            taint = corev1.Taint{
                Key:    "fail-test",
                Effect: corev1.TaintEffectNoExecute,
            }
        })

        ginkgo.BeforeEach(func() {
            framework.CreatePropagationPolicy(karmadaClient, policy)
            framework.CreateDeployment(kubeClient, deployment)
            ginkgo.DeferCleanup(func() {
                framework.RemoveDeployment(kubeClient, deployment.Namespace, deployment.Name)
                framework.RemovePropagationPolicy(karmadaClient, policy.Namespace, policy.Name)
            })
        })

ginkgo.It("taint cluster", func() {
var disabledClusters []string
targetClusterNames := framework.ExtractTargetClustersFrom(controlPlaneClient, deployment)
ginkgo.By("taint one cluster", func() {
temp := numOfFailedClusters
for _, targetClusterName := range targetClusterNames {
if temp > 0 {
klog.Infof("Taint one cluster(%s).", targetClusterName)
err := taintCluster(controlPlaneClient, targetClusterName, taint)
gomega.Expect(err).ShouldNot(gomega.HaveOccurred())
disabledClusters = append(disabledClusters, targetClusterName)
temp--
}
}
})

ginkgo.By("check whether deployment of failed cluster is rescheduled to other available cluster", func() {
gomega.Eventually(func() int {
targetClusterNames = framework.ExtractTargetClustersFrom(controlPlaneClient, deployment)
for _, targetClusterName := range targetClusterNames {
// the target cluster should be overwritten to another available cluster
if !testhelper.IsExclude(targetClusterName, disabledClusters) {
return 0
}
}

return len(targetClusterNames)
}, pollTimeout, pollInterval).Should(gomega.Equal(minGroups))
})

ginkgo.By("recover not ready cluster", func() {
for _, disabledCluster := range disabledClusters {
fmt.Printf("cluster %s is waiting for recovering\n", disabledCluster)
err := recoverTaintedCluster(controlPlaneClient, disabledCluster, taint)
gomega.Expect(err).ShouldNot(gomega.HaveOccurred())
}
})
})
})
})

// disableCluster will set wrong API endpoint of current cluster
@@ -167,6 +268,49 @@ func disableCluster(c client.Client, clusterName string) error {
    return err
}

// taintCluster will taint cluster
func taintCluster(c client.Client, clusterName string, taint corev1.Taint) error {
    err := wait.PollImmediate(pollInterval, pollTimeout, func() (done bool, err error) {
        clusterObj := &clusterv1alpha1.Cluster{}
        if err := c.Get(context.TODO(), client.ObjectKey{Name: clusterName}, clusterObj); err != nil {
            if apierrors.IsConflict(err) {
                return false, nil
            }
            return false, err
        }
        clusterObj.Spec.Taints = append(clusterObj.Spec.Taints, taint)
        if err := c.Update(context.TODO(), clusterObj); err != nil {
            if apierrors.IsConflict(err) {
                return false, nil
            }
            return false, err
        }
        return true, nil
    })
    return err
}

// recoverTaintedCluster will recover the taint of the disabled cluster
func recoverTaintedCluster(c client.Client, clusterName string, taint corev1.Taint) error {
    err := wait.PollImmediate(pollInterval, pollTimeout, func() (done bool, err error) {
        clusterObj := &clusterv1alpha1.Cluster{}
        if err := c.Get(context.TODO(), client.ObjectKey{Name: clusterName}, clusterObj); err != nil {
            if apierrors.IsConflict(err) {
                return false, nil
            }
            return false, err
        }
        if err := helper.UpdateClusterControllerTaint(context.TODO(), c, nil, []*corev1.Taint{&taint}, clusterObj); err != nil {
            if apierrors.IsConflict(err) {
                return false, nil
            }
            return false, err
        }
        return true, nil
    })
    return err
}

// recoverCluster will recover API endpoint of the disable cluster
func recoverCluster(c client.Client, clusterName string, originalAPIEndpoint string) error {
    err := wait.PollImmediate(pollInterval, pollTimeout, func() (done bool, err error) {