[automatic failover] Replace 'CircuitBreaker' with 'Cluster' for 'CircuitBreakerFailoverBase.clusterFailover' (#4275)

atakavci · web-flow · commit e972c219ce4f · 2025-09-11T09:29:00.000+03:00
* - replace CircuitBreaker with Cluster for CircuitBreakerFailoverBase.clusterFailover
- improve thread safety of provider initialization

* - formatting
diff --git a/src/main/java/redis/clients/jedis/mcf/CircuitBreakerCommandExecutor.java b/src/main/java/redis/clients/jedis/mcf/CircuitBreakerCommandExecutor.java
@@ -38,7 +38,7 @@ public <T> T executeCommand(CommandObject<T> commandObject) {
     supplier.withCircuitBreaker(cluster.getCircuitBreaker());
     supplier.withRetry(cluster.getRetry());
     supplier.withFallback(provider.getFallbackExceptionList(),
-      e -> this.handleClusterFailover(commandObject, cluster.getCircuitBreaker()));
+      e -> this.handleClusterFailover(commandObject, cluster));
 
     return supplier.decorate().get();
   }
@@ -73,10 +73,9 @@ private boolean isActiveCluster(Cluster cluster) {
    * Functional interface wrapped in retry and circuit breaker logic to handle open circuit breaker
    * failure scenarios
    */
-  private <T> T handleClusterFailover(CommandObject<T> commandObject,
-      CircuitBreaker circuitBreaker) {
+  private <T> T handleClusterFailover(CommandObject<T> commandObject, Cluster cluster) {
 
-    clusterFailover(circuitBreaker);
+    clusterFailover(cluster);
 
     // Recursive call to the initiating method so the operation can be retried on the next cluster
     // connection
diff --git a/src/main/java/redis/clients/jedis/mcf/CircuitBreakerFailoverBase.java b/src/main/java/redis/clients/jedis/mcf/CircuitBreakerFailoverBase.java
@@ -38,9 +38,10 @@ public void close() {
    * Functional interface wrapped in retry and circuit breaker logic to handle open circuit breaker
    * failure scenarios
    */
-  protected void clusterFailover(CircuitBreaker circuitBreaker) {
+  protected void clusterFailover(Cluster cluster) {
     lock.lock();
 
+    CircuitBreaker circuitBreaker = cluster.getCircuitBreaker();
     try {
       // Check state to handle race conditions since iterateActiveCluster() is
       // non-idempotent
@@ -52,19 +53,17 @@ protected void clusterFailover(CircuitBreaker circuitBreaker) {
 
         Cluster activeCluster = provider.getCluster();
         // This should be possible only if active cluster is switched from by other reasons than
-        // circuit
-        // breaker, just before circuit breaker triggers
-        if (activeCluster.getCircuitBreaker() != circuitBreaker) {
+        // circuit breaker, just before circuit breaker triggers
+        if (activeCluster != cluster) {
           return;
         }
 
-        activeCluster.setGracePeriod();
+        cluster.setGracePeriod();
         circuitBreaker.transitionToForcedOpenState();
 
         // Iterating the active cluster will allow subsequent calls to the executeCommand() to use
         // the next
         // cluster's connection pool - according to the configuration's prioritization/order/weight
-        // int activeMultiClusterIndex = provider.incrementActiveMultiClusterIndex1();
         provider.iterateActiveCluster(SwitchReason.CIRCUIT_BREAKER);
       }
       // this check relies on the fact that many failover attempts can hit with the same CB,
@@ -73,13 +72,12 @@ protected void clusterFailover(CircuitBreaker circuitBreaker) {
       // different than
       // active CB. If its the same one and there are no more clusters to failover to, then throw an
       // exception
-      else if (circuitBreaker == provider.getCluster().getCircuitBreaker()
-          && !provider.canIterateOnceMore()) {
-            throw new JedisConnectionException(
-                "Cluster/database endpoint could not failover since the MultiClusterClientConfig was not "
-                    + "provided with an additional cluster/database endpoint according to its prioritized sequence. "
-                    + "If applicable, consider failing back OR restarting with an available cluster/database endpoint");
-          }
+      else if (cluster == provider.getCluster() && !provider.canIterateOnceMore()) {
+        throw new JedisConnectionException(
+            "Cluster/database endpoint could not failover since the MultiClusterClientConfig was not "
+                + "provided with an additional cluster/database endpoint according to its prioritized sequence. "
+                + "If applicable, consider failing back OR restarting with an available cluster/database endpoint");
+      }
       // Ignore exceptions since we are already in a failure state
     } finally {
       lock.unlock();
diff --git a/src/main/java/redis/clients/jedis/mcf/CircuitBreakerFailoverConnectionProvider.java b/src/main/java/redis/clients/jedis/mcf/CircuitBreakerFailoverConnectionProvider.java
@@ -31,7 +31,7 @@ public Connection getConnection() {
     supplier.withRetry(cluster.getRetry());
     supplier.withCircuitBreaker(cluster.getCircuitBreaker());
     supplier.withFallback(provider.getFallbackExceptionList(),
-      e -> this.handleClusterFailover(cluster.getCircuitBreaker()));
+      e -> this.handleClusterFailover(cluster));
 
     return supplier.decorate().get();
   }
@@ -49,9 +49,9 @@ private Connection handleGetConnection(Cluster cluster) {
    * Functional interface wrapped in retry and circuit breaker logic to handle open circuit breaker
    * failure scenarios
    */
-  private Connection handleClusterFailover(CircuitBreaker circuitBreaker) {
+  private Connection handleClusterFailover(Cluster cluster) {
 
-    clusterFailover(circuitBreaker);
+    clusterFailover(cluster);
 
     // Recursive call to the initiating method so the operation can be retried on the next cluster
     // connection
diff --git a/src/main/java/redis/clients/jedis/providers/MultiClusterPooledConnectionProvider.java b/src/main/java/redis/clients/jedis/providers/MultiClusterPooledConnectionProvider.java
@@ -80,7 +80,7 @@ public class MultiClusterPooledConnectionProvider implements ConnectionProvider
    */
   private volatile Cluster activeCluster;
 
-  private final Lock activeClusterIndexLock = new ReentrantLock(true);
+  private final Lock activeClusterChangeLock = new ReentrantLock(true);
 
   /**
    * Functional interface for listening to cluster switch events. The event args contain the reason
@@ -183,7 +183,13 @@ public MultiClusterPooledConnectionProvider(MultiClusterClientConfig multiCluste
     // Mark initialization as complete - handleHealthStatusChange can now process events
     initializationComplete = true;
     if (!activeCluster.isHealthy()) {
-      activeCluster = waitForInitialHealthyCluster(statusTracker);
+      // Race condition: assigning 'activeCluster' directly here is not thread safe because
+      // 'onHealthStatusChange' may execute concurrently once 'initializationComplete'
+      // is set to true.
+      // Rule: once 'initializationComplete' is true, never assign 'activeCluster' outside of
+      // 'activeClusterChangeLock'.
+      waitForInitialHealthyCluster(statusTracker);
+      iterateActiveCluster(SwitchReason.HEALTH_CHECK);
     }
     this.fallbackExceptionList = multiClusterClientConfig.getFallbackExceptionList();
 
@@ -211,11 +217,11 @@ public void add(ClusterConfig clusterConfig) {
           "Endpoint " + endpoint + " already exists in the provider");
     }
 
-    activeClusterIndexLock.lock();
+    activeClusterChangeLock.lock();
     try {
       addClusterInternal(multiClusterClientConfig, clusterConfig);
     } finally {
-      activeClusterIndexLock.unlock();
+      activeClusterChangeLock.unlock();
     }
   }
 
@@ -240,7 +246,7 @@ public void remove(Endpoint endpoint) {
     }
     log.debug("Removing endpoint {}", endpoint);
 
-    activeClusterIndexLock.lock();
+    activeClusterChangeLock.lock();
     try {
       Cluster clusterToRemove = multiClusterMap.get(endpoint);
       boolean isActiveCluster = (activeCluster == clusterToRemove);
@@ -273,7 +279,7 @@ public void remove(Endpoint endpoint) {
         clusterToRemove.close();
       }
     } finally {
-      activeClusterIndexLock.unlock();
+      activeClusterChangeLock.unlock();
     }
   }
 
@@ -542,7 +548,7 @@ private boolean setActiveCluster(Cluster cluster, boolean validateConnection) {
     // Cluster cluster = clusterEntry.getValue();
     // Field-level synchronization is used to avoid the edge case in which
     // incrementActiveMultiClusterIndex() is called at the same time
-    activeClusterIndexLock.lock();
+    activeClusterChangeLock.lock();
     Cluster oldCluster;
     try {
 
@@ -563,7 +569,7 @@ private boolean setActiveCluster(Cluster cluster, boolean validateConnection) {
       oldCluster = activeCluster;
       activeCluster = cluster;
     } finally {
-      activeClusterIndexLock.unlock();
+      activeClusterChangeLock.unlock();
     }
     boolean switched = oldCluster != cluster;
     if (switched && this.multiClusterClientConfig.isFastFailover()) {