Skip to content

Commit 8f2f195

Browse files
authored
[automatic failover] Implement max number of failover attempts (#4293)
* - implement max failover attempt - add tests * - fix user receive the intended exception * -clean+format * - java doc for exceptions * format * - more tests on exception types in max failover attempts mechanism * format * fix failing timing in test * disable health checks * rename to switchToHealthyCluster * format
1 parent 0d8e184 commit 8f2f195

10 files changed

+549
-58
lines changed

src/main/java/redis/clients/jedis/MultiClusterClientConfig.java

Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -161,6 +161,12 @@ public static interface StrategySupplier {
161161
/** Default grace period in milliseconds to keep clusters disabled after they become unhealthy. */
162162
private static final long GRACE_PERIOD_DEFAULT = 10000;
163163

164+
/** Default maximum number of failover attempts. */
165+
private static final int MAX_NUM_FAILOVER_ATTEMPTS_DEFAULT = 10;
166+
167+
/** Default delay in milliseconds between failover attempts. */
168+
private static final int DELAY_IN_BETWEEN_FAILOVER_ATTEMPTS_DEFAULT = 12000;
169+
164170
/** Array of cluster configurations defining the available Redis endpoints and their settings. */
165171
private final ClusterConfig[] clusterConfigs;
166172

@@ -485,6 +491,34 @@ public static interface StrategySupplier {
485491
*/
486492
private boolean fastFailover;
487493

494+
/**
495+
* Maximum number of failover attempts.
496+
* <p>
497+
* This setting controls how many times the system will attempt to failover to a different cluster
498+
* before giving up. For example, if set to 3, the system will make 1 initial attempt plus 2
499+
* failover attempts for a total of 3 attempts.
500+
* </p>
501+
* <p>
502+
* <strong>Default:</strong> {@value #MAX_NUM_FAILOVER_ATTEMPTS_DEFAULT}
503+
* </p>
504+
* @see #getMaxNumFailoverAttempts()
505+
*/
506+
private int maxNumFailoverAttempts;
507+
508+
/**
509+
* Delay in milliseconds between failover attempts.
510+
* <p>
511+
* This setting controls how long the system will wait before attempting to failover to a
512+
* different cluster. For example, if set to 1000, the system will wait 1 second before attempting
513+
* to failover to a different cluster.
514+
* </p>
515+
* <p>
516+
* <strong>Default:</strong> {@value #DELAY_IN_BETWEEN_FAILOVER_ATTEMPTS_DEFAULT} milliseconds
517+
* </p>
518+
* @see #getDelayInBetweenFailoverAttempts()
519+
*/
520+
private int delayInBetweenFailoverAttempts;
521+
488522
/**
489523
* Constructs a new MultiClusterClientConfig with the specified cluster configurations.
490524
* <p>
@@ -679,6 +713,25 @@ public long getGracePeriod() {
679713
return gracePeriod;
680714
}
681715

716+
/**
717+
* Returns the maximum number of failover attempts.
718+
* @return maximum number of failover attempts
719+
* @see #maxNumFailoverAttempts
720+
*/
721+
public int getMaxNumFailoverAttempts() {
722+
return maxNumFailoverAttempts;
723+
724+
}
725+
726+
/**
727+
* Returns the delay in milliseconds between failover attempts.
728+
* @return delay in milliseconds between failover attempts
729+
* @see #delayInBetweenFailoverAttempts
730+
*/
731+
public int getDelayInBetweenFailoverAttempts() {
732+
return delayInBetweenFailoverAttempts;
733+
}
734+
682735
/**
683736
* Returns whether connections are forcefully terminated during failover.
684737
* @return true if fast failover is enabled, false for graceful failover
@@ -1090,6 +1143,12 @@ public static class Builder {
10901143
/** Whether to forcefully terminate connections during failover. */
10911144
private boolean fastFailover = false;
10921145

1146+
/** Maximum number of failover attempts. */
1147+
private int maxNumFailoverAttempts = MAX_NUM_FAILOVER_ATTEMPTS_DEFAULT;
1148+
1149+
/** Delay in milliseconds between failover attempts. */
1150+
private int delayInBetweenFailoverAttempts = DELAY_IN_BETWEEN_FAILOVER_ATTEMPTS_DEFAULT;
1151+
10931152
/**
10941153
* Constructs a new Builder with the specified cluster configurations.
10951154
* @param clusterConfigs array of cluster configurations defining available Redis endpoints
@@ -1539,6 +1598,42 @@ public Builder fastFailover(boolean fastFailover) {
15391598
return this;
15401599
}
15411600

1601+
/**
1602+
* Sets the maximum number of failover attempts.
1603+
* <p>
1604+
* This setting controls how many times the system will attempt to failover to a different
1605+
* cluster before giving up. For example, if set to 3, the system will make 1 initial attempt
1606+
* plus 2 failover attempts for a total of 3 attempts.
1607+
* </p>
1608+
* <p>
1609+
* <strong>Default:</strong> {@value #MAX_NUM_FAILOVER_ATTEMPTS_DEFAULT}
1610+
* </p>
1611+
* @param maxNumFailoverAttempts maximum number of failover attempts
1612+
* @return this builder instance for method chaining
1613+
*/
1614+
public Builder maxNumFailoverAttempts(int maxNumFailoverAttempts) {
1615+
this.maxNumFailoverAttempts = maxNumFailoverAttempts;
1616+
return this;
1617+
}
1618+
1619+
/**
1620+
* Sets the delay in milliseconds between failover attempts.
1621+
* <p>
1622+
* This setting controls how long the system will wait before attempting to failover to a
1623+
* different cluster. For example, if set to 1000, the system will wait 1 second before
1624+
* attempting to failover to a different cluster.
1625+
* </p>
1626+
* <p>
1627+
* <strong>Default:</strong> {@value #DELAY_IN_BETWEEN_FAILOVER_ATTEMPTS_DEFAULT} milliseconds
1628+
* </p>
1629+
* @param delayInBetweenFailoverAttempts delay in milliseconds between failover attempts
1630+
* @return this builder instance for method chaining
1631+
*/
1632+
public Builder delayInBetweenFailoverAttempts(int delayInBetweenFailoverAttempts) {
1633+
this.delayInBetweenFailoverAttempts = delayInBetweenFailoverAttempts;
1634+
return this;
1635+
}
1636+
15421637
/**
15431638
* Builds and returns a new MultiClusterClientConfig instance with all configured settings.
15441639
* <p>
@@ -1576,6 +1671,8 @@ public MultiClusterClientConfig build() {
15761671
config.failbackCheckInterval = this.failbackCheckInterval;
15771672
config.gracePeriod = this.gracePeriod;
15781673
config.fastFailover = this.fastFailover;
1674+
config.maxNumFailoverAttempts = this.maxNumFailoverAttempts;
1675+
config.delayInBetweenFailoverAttempts = this.delayInBetweenFailoverAttempts;
15791676

15801677
return config;
15811678
}

src/main/java/redis/clients/jedis/mcf/CircuitBreakerCommandExecutor.java

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
import redis.clients.jedis.CommandObject;
88
import redis.clients.jedis.Connection;
99
import redis.clients.jedis.annots.Experimental;
10+
import redis.clients.jedis.exceptions.JedisConnectionException;
1011
import redis.clients.jedis.executors.CommandExecutor;
1112
import redis.clients.jedis.mcf.MultiClusterPooledConnectionProvider.Cluster;
1213

@@ -46,7 +47,14 @@ public <T> T executeCommand(CommandObject<T> commandObject) {
4647
* Functional interface wrapped in retry and circuit breaker logic to handle happy path scenarios
4748
*/
4849
private <T> T handleExecuteCommand(CommandObject<T> commandObject, Cluster cluster) {
49-
try (Connection connection = cluster.getConnection()) {
50+
Connection connection;
51+
try {
52+
connection = cluster.getConnection();
53+
} catch (JedisConnectionException e) {
54+
provider.assertOperability();
55+
throw e;
56+
}
57+
try {
5058
return connection.executeCommand(commandObject);
5159
} catch (Exception e) {
5260
if (cluster.retryOnFailover() && !isActiveCluster(cluster)
@@ -56,6 +64,8 @@ && isCircuitBreakerTrackedException(e, cluster.getCircuitBreaker())) {
5664
}
5765

5866
throw e;
67+
} finally {
68+
connection.close();
5969
}
6070
}
6171

src/main/java/redis/clients/jedis/mcf/CircuitBreakerFailoverBase.java

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -63,19 +63,16 @@ protected void clusterFailover(Cluster cluster) {
6363
// Iterating the active cluster will allow subsequent calls to the executeCommand() to use
6464
// the next
6565
// cluster's connection pool - according to the configuration's prioritization/order/weight
66-
provider.iterateActiveCluster(SwitchReason.CIRCUIT_BREAKER);
66+
provider.switchToHealthyCluster(SwitchReason.CIRCUIT_BREAKER, cluster);
6767
}
6868
// this check relies on the fact that many failover attempts can hit with the same CB,
6969
// only the first one will trigger a failover, and make the CB FORCED_OPEN.
7070
// when the rest reaches here, the active cluster is already the next one, and should be
7171
// different than
7272
// active CB. If it's still the same one, ask the provider to switch to a healthy cluster
7373
// again.
74-
else if (cluster == provider.getCluster() && !provider.canIterateOnceMore()) {
75-
throw new JedisConnectionException(
76-
"Cluster/database endpoint could not failover since the MultiClusterClientConfig was not "
77-
+ "provided with an additional cluster/database endpoint according to its prioritized sequence. "
78-
+ "If applicable, consider failing back OR restarting with an available cluster/database endpoint");
74+
else if (cluster == provider.getCluster()) {
75+
provider.switchToHealthyCluster(SwitchReason.CIRCUIT_BREAKER, cluster);
7976
}
8077
// Ignore exceptions since we are already in a failure state
8178
} finally {
Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
package redis.clients.jedis.mcf;
2+
3+
import redis.clients.jedis.exceptions.JedisConnectionException;
4+
5+
/**
6+
* Exception thrown when a failover attempt fails due to lack of available/healthy clusters.
7+
* <p>
8+
* This exception itself is not thrown; see the child exceptions for more details.
9+
* </p>
10+
* @see JedisFailoverException.JedisPermanentlyNotAvailableException
11+
* @see JedisFailoverException.JedisTemporarilyNotAvailableException
12+
*/
13+
public class JedisFailoverException extends JedisConnectionException {
14+
private static final String MESSAGE = "Cluster/database endpoint could not failover since the MultiClusterClientConfig was not "
15+
+ "provided with an additional cluster/database endpoint according to its prioritized sequence. "
16+
+ "If applicable, consider falling back OR restarting with an available cluster/database endpoint";
17+
18+
public JedisFailoverException(String s) {
19+
super(s);
20+
}
21+
22+
public JedisFailoverException() {
23+
super(MESSAGE);
24+
}
25+
26+
/**
27+
* Exception thrown when a failover attempt fails due to lack of available/healthy clusters, the
28+
* max number of failover attempts has been exceeded, and there is still no healthy cluster.
29+
* <p>
30+
* See the configuration properties
31+
* {@link redis.clients.jedis.MultiClusterClientConfig#maxNumFailoverAttempts} and
32+
* {@link redis.clients.jedis.MultiClusterClientConfig#delayInBetweenFailoverAttempts} for more
33+
* details.
34+
*/
35+
public static class JedisPermanentlyNotAvailableException extends JedisFailoverException {
36+
public JedisPermanentlyNotAvailableException(String s) {
37+
super(s);
38+
}
39+
40+
public JedisPermanentlyNotAvailableException() {
41+
super();
42+
}
43+
}
44+
45+
/**
46+
* Exception thrown when a failover attempt fails due to lack of available/healthy clusters, but
47+
* the max number of failover attempts has not been exceeded yet. Although no cluster is healthy,
48+
* including the selected/current one, the given configuration suggests that this is a temporary
49+
* condition and that a healthy cluster may become available.
50+
* <p>
51+
* See the configuration properties
52+
* {@link redis.clients.jedis.MultiClusterClientConfig#maxNumFailoverAttempts} and
53+
* {@link redis.clients.jedis.MultiClusterClientConfig#delayInBetweenFailoverAttempts} for more
54+
* details.
55+
*/
56+
public static class JedisTemporarilyNotAvailableException extends JedisFailoverException {
57+
58+
public JedisTemporarilyNotAvailableException(String s) {
59+
super(s);
60+
}
61+
62+
public JedisTemporarilyNotAvailableException() {
63+
super();
64+
}
65+
}
66+
}

0 commit comments

Comments
 (0)