From d28caf1422c83ab6eebfead121fc1315eec52cd9 Mon Sep 17 00:00:00 2001
From: xiaozhuang <61875985+xiaozhuang-a@users.noreply.github.com>
Date: Mon, 16 Dec 2024 21:29:16 +0800
Subject: [PATCH] fix: redis-cluster unexpected downscaling (#1167) (#1173)

Signed-off-by: xiaozhuang
Co-authored-by: xiaozhuang
---
 .../rediscluster/rediscluster_controller.go | 10 +++++++++-
 pkg/k8sutils/cluster-scaling.go             | 18 +++++++++++++-----
 2 files changed, 22 insertions(+), 6 deletions(-)

diff --git a/pkg/controllers/rediscluster/rediscluster_controller.go b/pkg/controllers/rediscluster/rediscluster_controller.go
index 0c79a2a56..76d51f2be 100644
--- a/pkg/controllers/rediscluster/rediscluster_controller.go
+++ b/pkg/controllers/rediscluster/rediscluster_controller.go
@@ -73,6 +73,10 @@ func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Resu
 
     // Check if the cluster is downscaled
     if leaderCount := k8sutils.CheckRedisNodeCount(ctx, r.K8sClient, instance, "leader"); leaderReplicas < leaderCount {
+        if !(r.IsStatefulSetReady(ctx, instance.Namespace, instance.Name+"-leader") && r.IsStatefulSetReady(ctx, instance.Namespace, instance.Name+"-follower")) {
+            return intctrlutil.Reconciled()
+        }
+
         logger.Info("Redis cluster is downscaling...", "Current.LeaderReplicas", leaderCount, "Desired.LeaderReplicas", leaderReplicas)
         for shardIdx := leaderCount - 1; shardIdx >= leaderReplicas; shardIdx-- {
             logger.Info("Remove the shard", "Shard.Index", shardIdx)
@@ -83,7 +87,11 @@ func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Resu
                 // lastLeaderPod is slaving right now Make it the master Pod
                 // We have to bring a manual failover here to make it a leaderPod
                 // clusterFailover should also include the clusterReplicate since we have to map the followers to new leader
-                k8sutils.ClusterFailover(ctx, r.K8sClient, instance)
+                logger.Info("Cluster Failover is initiated", "Shard.Index", shardIdx)
+                if err = k8sutils.ClusterFailover(ctx, r.K8sClient, instance); err != nil {
+                    logger.Error(err, "Failed to initiate cluster failover")
+                    return intctrlutil.RequeueWithError(ctx, err, "")
+                }
             }
             // Step 1 Remove the Follower Node
             k8sutils.RemoveRedisFollowerNodesFromCluster(ctx, r.K8sClient, instance)
diff --git a/pkg/k8sutils/cluster-scaling.go b/pkg/k8sutils/cluster-scaling.go
index b2bd5a0da..3e7cd7097 100644
--- a/pkg/k8sutils/cluster-scaling.go
+++ b/pkg/k8sutils/cluster-scaling.go
@@ -391,7 +391,7 @@ func verifyLeaderPodInfo(ctx context.Context, redisClient *redis.Client, podName
     return false
 }
 
-func ClusterFailover(ctx context.Context, client kubernetes.Interface, cr *redisv1beta2.RedisCluster) {
+func ClusterFailover(ctx context.Context, client kubernetes.Interface, cr *redisv1beta2.RedisCluster) error {
     slavePodName := cr.Name + "-leader-" + strconv.Itoa(int(CheckRedisNodeCount(ctx, client, cr, "leader"))-1)
     // cmd = redis-cli cluster failover -a <pass>
     var cmd []string
     pod := RedisDetails{
         PodName:   slavePodName,
         Namespace: cr.Namespace,
     }
 
-    cmd = []string{"redis-cli", "cluster", "failover"}
+    cmd = []string{"redis-cli", "-h"}
 
     if *cr.Spec.ClusterVersion == "v7" {
-        cmd = append(cmd, getRedisHostname(pod, cr, "leader")+fmt.Sprintf(":%d", *cr.Spec.Port))
+        cmd = append(cmd, getRedisHostname(pod, cr, "leader"))
     } else {
-        cmd = append(cmd, getRedisServerAddress(ctx, client, pod, *cr.Spec.Port))
+        cmd = append(cmd, getRedisServerIP(ctx, client, pod))
     }
+    cmd = append(cmd, "-p")
+    cmd = append(cmd, strconv.Itoa(*cr.Spec.Port))
 
     if cr.Spec.KubernetesConfig.ExistingPasswordSecret != nil {
         pass, err := getRedisPassword(ctx, client, cr.Namespace, *cr.Spec.KubernetesConfig.ExistingPasswordSecret.Name, *cr.Spec.KubernetesConfig.ExistingPasswordSecret.Key)
@@ -418,7 +420,13 @@ func ClusterFailover(ctx context.Context, client kubernetes.Interface, cr *redis
     }
 
     cmd = append(cmd, getRedisTLSArgs(cr.Spec.TLS, slavePodName)...)
+    cmd = append(cmd, "cluster", "failover")
 
     log.FromContext(ctx).V(1).Info("Redis cluster failover command is", "Command", cmd)
-    executeCommand(ctx, client, cr, cmd, slavePodName)
+    execOut, err := executeCommand1(ctx, client, cr, cmd, slavePodName)
+    if err != nil {
+        log.FromContext(ctx).Error(err, "Could not execute command", "Command", cmd, "Output", execOut)
+        return err
+    }
+    return nil
 }
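
Reviewer note on the command-line change, which is the core of the fix: the old code put the "cluster failover" subcommand first and appended "host:port" as a trailing positional argument, which redis-cli parses as part of the command rather than as a connection target, and the result of the exec was never checked. The patch builds the connection flags explicitly (-h plus -p), appends the auth/TLS flags, sends the subcommand last, and returns any execution error to the reconciler. Below is a minimal, self-contained Go sketch of that argument order; buildFailoverCmd, the sample hostname, and the hard-coded port are illustrative assumptions, not code from this patch.

package main

import (
	"fmt"
	"strconv"
)

// buildFailoverCmd is a hypothetical helper that mirrors the argument order
// the patched ClusterFailover assembles: connection flags first, optional
// auth and TLS flags next, and the "cluster failover" subcommand last.
func buildFailoverCmd(host string, port int, password string, tlsArgs []string) []string {
	cmd := []string{"redis-cli", "-h", host, "-p", strconv.Itoa(port)}
	if password != "" {
		cmd = append(cmd, "-a", password)
	}
	cmd = append(cmd, tlsArgs...)
	return append(cmd, "cluster", "failover")
}

func main() {
	// Sample values only; the operator derives the host from getRedisHostname
	// (cluster v7) or getRedisServerIP, and the port from cr.Spec.Port.
	fmt.Println(buildFailoverCmd("redis-cluster-leader-2.redis-cluster-leader-headless", 6379, "", nil))
	// Prints: [redis-cli -h redis-cluster-leader-2.redis-cluster-leader-headless -p 6379 cluster failover]
}

Keeping the subcommand as the final append means the password and TLS flags added in between are always parsed as redis-cli options, which is why the patch also moves "cluster", "failover" out of the initial slice.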