From dc946b4e6daef098e3f6f575f2d8f4c62bd2b356 Mon Sep 17 00:00:00 2001
From: Bartosz Sypytkowski
Date: Mon, 13 Nov 2017 13:09:56 +0100
Subject: [PATCH] Split brain resolver (#3180)

* Split Brain Resolver: initial commit

* initial implementation of split brain resolver

* composite approach to split brain strategy

* split brain strategy specs

* configuration spec

* MNTK spec for KeepMajority

* fixed problem with MNTK false negatives

* split brain resolver docs
---
 .../clustering/split-brain-resolver.md        | 231 ++++++++++++
 docs/articles/toc.yml                         |   2 +
 .../CoreAPISpec.ApproveCluster.approved.txt   |   7 +
 .../SplitBrainResolverDowningSpec.cs          | 117 ++++++
 .../SplitBrainResolverConfigSpec.cs           | 122 +++++++
 .../SplitBrainStrategySpec.cs                 | 233 ++++++++++++
 src/core/Akka.Cluster/AutoDown.cs             |  77 ++--
 .../Akka.Cluster/Configuration/Cluster.conf   |  38 ++
 src/core/Akka.Cluster/SplitBrainResolver.cs   | 344 ++++++++++++++++++
 9 files changed, 1123 insertions(+), 48 deletions(-)
 create mode 100644 docs/articles/clustering/split-brain-resolver.md
 create mode 100644 src/core/Akka.Cluster.Tests.MultiNode/SplitBrainResolverDowningSpec.cs
 create mode 100644 src/core/Akka.Cluster.Tests/SplitBrainResolverConfigSpec.cs
 create mode 100644 src/core/Akka.Cluster.Tests/SplitBrainStrategySpec.cs
 create mode 100644 src/core/Akka.Cluster/SplitBrainResolver.cs

diff --git a/docs/articles/clustering/split-brain-resolver.md b/docs/articles/clustering/split-brain-resolver.md
new file mode 100644
index 00000000000..31eab427c06
--- /dev/null
+++ b/docs/articles/clustering/split-brain-resolver.md
@@ -0,0 +1,231 @@
+---
+uid: split-brain-resolver
+title: Split Brain Resolver
+---
+# Split Brain Resolver
+
+> Note: while this feature is based on the [Lightbend Reactive Platform Split Brain Resolver](https://doc.akka.io/docs/akka/rp-16s01p02/scala/split-brain-resolver.html) feature description, its implementation is the result of an independent contribution and interpretation by the Akka.NET team. Lightbend takes no responsibility for its state or correctness.
+
+When working with an Akka.NET cluster, you must consider how to handle [network partitions](https://en.wikipedia.org/wiki/Network_partition) (a.k.a. split brain scenarios) and machine crashes (including .NET CLR/Core and hardware failures). This is crucial for the correct behavior of your cluster, especially if you use Cluster Singleton or Cluster Sharding.
+
+## The Problem
+
+One of the common problems in distributed systems is hardware failure. Things like garbage collection pauses, machine crashes or network partitions happen all the time. Moreover, it is often impossible to distinguish between them, even though different causes call for different reactions from our cluster. A careful balance here is highly desired:
+
+- On the one hand, we want to detect crashed nodes as fast as possible and remove them from the cluster.
+- On the other hand, things like network partitions may be only temporary, so it may be wiser to wait a while for the disconnected nodes, in the hope that they will be able to reconnect soon.
+
+Network partitions also bring a problem of their own - the natural result of such an event is the risk of splitting a single cluster into two or more independent ones, unaware of each other's existence. This comes with certain risks. Even worse, some of the Akka.NET cluster features may be unavailable or malfunction in such a scenario.
+
+To solve this kind of problem, we need to determine a common strategy by which every node comes to the same deterministic conclusion about which nodes should live and which should die, even if it cannot communicate with the others.
+
+Since an Akka.NET cluster works in peer-to-peer mode, there is no single *global* entity able to arbitrarily define the one true state of the cluster. Instead, each node has a so-called failure detector, which tracks the responsiveness and health of the other connected nodes. This allows us to build a *local* node perspective on the overall cluster state.
+
+In the past the only available opt-in strategy was auto-down, in which each node automatically downed others after they had been unreachable for a certain period of time. While this approach was good enough to react to machine crashes, it failed in the face of network partitions: if the cluster was split into two or more parts due to network connectivity issues, each part would simply consider the others down. This would lead to several independent clusters not knowing about each other. Such an outcome is especially disastrous for the Cluster Singleton and Cluster Sharding features, both of which rely on having only one instance of a given actor living in the cluster at any time.
+
+The split brain resolver feature brings the ability to apply different strategies for managing node lifecycle in the face of network issues and machine crashes. It works as a custom downing provider. Therefore, in order to use it, **all of your Akka.NET cluster nodes must define it with the same configuration**. Here's what a minimal configuration looks like:
+
+```hocon
+akka.cluster {
+  downing-provider-class = "Akka.Cluster.SplitBrainResolver, Akka.Cluster"
+  split-brain-resolver {
+    active-strategy = <your-strategy>
+  }
+}
+```
+
+Keep in mind that the split brain resolver will NOT work when `akka.cluster.auto-down-unreachable-after` is used.
+
+
+## Strategies
+
+This section describes the different split brain resolver strategies. Please keep in mind that there's no universal solution: each one of them may fail under specific circumstances.
+
+To decide which strategy to use, you can set `akka.cluster.split-brain-resolver.active-strategy` to one of 4 different options:
+
+- `static-quorum`
+- `keep-majority`
+- `keep-oldest`
+- `keep-referee`
+
+All strategies will be applied only after the cluster state has remained stable for a specified time threshold (no nodes transitioning between different states for some time), specified by the `stable-after` setting. Joining nodes will not affect this threshold, as they won't be promoted to the UP status in the face of unreachable nodes. For the same reason they won't be taken into account when a strategy is applied.
+
+```hocon
+akka.cluster.split-brain-resolver {
+  # Enable one of the available strategies (see descriptions below):
+  # static-quorum, keep-majority, keep-oldest, keep-referee
+  active-strategy = off
+
+  # Decision is taken by the strategy when there has been no membership or
+  # reachability changes for this duration, i.e. the cluster state is stable.
+  stable-after = 20s
+}
+```
+
+There is no simple way to decide the value of `stable-after`: a shorter value gives you a faster reaction time for unreachable nodes, at the cost of a higher risk of false failure detections due to temporary network issues. The rule of thumb for this setting is to set `stable-after` to `log10(maxExpectedNumberOfNodes) * 10`.
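+
+For example, applying this rule of thumb (the numbers below are purely illustrative):
+
+```hocon
+# a cluster of ~10 nodes:  log10(10)  * 10 = 10
+# akka.cluster.split-brain-resolver.stable-after = 10s
+
+# a cluster of ~100 nodes: log10(100) * 10 = 20 (the default)
+akka.cluster.split-brain-resolver.stable-after = 20s
+```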
+
+Remember that when a strategy decides to down a particular node, it won't shut down the related `ActorSystem`. In order to do so, use the cluster removal hook like this:
+
+```csharp
+Cluster.Get(system).RegisterOnMemberRemoved(() => {
+    system.Terminate().Wait();
+});
+```
+
+### Static Quorum
+
+The `static-quorum` strategy works well when you are able to define a minimum required cluster size. It will down the unreachable nodes if the number of reachable ones is greater than or equal to a configured `quorum-size`. Otherwise the reachable ones will be downed.
+
+When to use it? When you have a cluster with a fixed number of nodes, or a fixed number of nodes with a specific role.
+
+Things to keep in mind:
+
+1. If the cluster splits into more than 2 parts, each one smaller than the `quorum-size`, this strategy may bring down the whole cluster.
+2. If the cluster grows to more than 2 times the `quorum-size`, there is still a potential risk of the cluster splitting into two if a network partition occurs.
+3. If some nodes become unreachable during cluster initialization, there is a risk of putting the whole cluster down, since the strategy will be applied before the cluster reaches the quorum size. For this reason it's a good idea to set `akka.cluster.min-nr-of-members` to a higher value than the actual `quorum-size`.
+4. Don't forget to add new nodes back once some of them have been removed.
+
+This strategy can work over a subset of cluster nodes by defining a specific `role`. This is useful when some of your nodes are more important than others and you want to prioritize them during the quorum check. You can also use it to configure a "core" set of nodes, while still being free to grow your cluster over the initial limit. Of course this will leave your cluster more vulnerable in situations where those "core" nodes fail.
+
+Configuration:
+
+```hocon
+akka.cluster.split-brain-resolver {
+  active-strategy = static-quorum
+
+  static-quorum {
+    # minimum number of nodes that the cluster must have
+    quorum-size = undefined
+
+    # if the 'role' is defined the decision is based only on members with that 'role'
+    role = ""
+  }
+}
+```
+
+### Keep Majority
+
+The `keep-majority` strategy will down the part of the cluster which sees the lesser part of the whole cluster. This choice is made based on the latest known state of the cluster. When the cluster splits into two equal parts, the one which contains the node with the lowest address will survive.
+
+When to use it? When your cluster can grow or shrink very dynamically.
+
+Keep in mind that:
+
+1. Two parts of the cluster may make their decisions based on different states of the cluster, as the state is relative to each node. In practice the risk of this is quite small.
+2. If there are more than 2 partitions and none of them has reached a majority, the whole cluster may go down.
+3. If more than half of the cluster nodes go down at once, the remaining ones will also down themselves, as they haven't reached a majority (based on the last known cluster state).
+
+Just like in the case of static quorum, you may decide to base the decision only on the nodes having a configured `role`. The advantages here are similar to those of the static quorum; a short sketch of the decision rule is shown below.
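+
+As a rough sketch of the decision rule described above (this is not the resolver's public API - the actual strategy classes are internal to Akka.Cluster - but a plain, illustrative restatement of the logic, with the address tie-break shown as a simple string comparison):
+
+```csharp
+// Decides whether the side of the partition this node is on survives.
+// Counts are taken over members with the configured role, if one is set.
+static bool ThisSideSurvives(int remainingCount, int unreachableCount,
+    string lowestRemainingAddress, string lowestUnreachableAddress)
+{
+    if (remainingCount > unreachableCount) return true;   // this side holds the majority
+    if (remainingCount < unreachableCount) return false;  // the other side does
+    // equal split: the side containing the lowest address survives
+    return string.CompareOrdinal(lowestRemainingAddress, lowestUnreachableAddress) < 0;
+}
+```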
+
+Configuration:
+
+```hocon
+akka.cluster.split-brain-resolver {
+  active-strategy = keep-majority
+
+  keep-majority {
+    # if the 'role' is defined the decision is based only on members with that 'role'
+    role = ""
+  }
+}
+```
+
+### Keep Oldest
+
+The `keep-oldest` strategy, when a network split has happened, will down the part of the cluster which doesn't contain the oldest node.
+
+When to use it? This approach is particularly good in combination with Cluster Singleton, which usually runs on the oldest cluster member. It's also useful when you have a single starter node configured in `akka.cluster.seed-nodes` for the others, as it will still allow you to add and remove members using its address.
+
+Keep in mind that:
+
+1. When the oldest node gets partitioned from the others, it will down itself and the next oldest node will pick up its role. This is possible thanks to the `down-if-alone` setting.
+2. If the `down-if-alone` option is set to `off`, the whole cluster will be dependent on the availability of this single node.
+3. There is a risk that, if a partition splits the cluster into two unequal parts, e.g. 2 nodes including the oldest one versus the 20 remaining ones, the majority of the cluster will go down.
+4. Since the oldest node is determined based on the latest known state of the cluster, there is a small risk that during a partition two parts of the cluster will both consider the oldest member to be on their side. While this is a very rare situation, you may still end up with two independent clusters after the split occurs.
+
+Just like in the previous cases, a `role` setting can be used to determine the oldest member across all the nodes having the specified role.
+
+Configuration:
+
+```hocon
+akka.cluster.split-brain-resolver {
+  active-strategy = keep-oldest
+
+  keep-oldest {
+    # Enable downing of the oldest node when it is partitioned from all other nodes
+    down-if-alone = on
+
+    # if the 'role' is defined the decision is based only on members with that 'role',
+    # i.e. using the oldest member (singleton) within the nodes with that role
+    role = ""
+  }
+}
+```
+
+### Keep Referee
+
+The `keep-referee` strategy will simply down the part that does not contain the given referee node.
+
+When to use it? When you have a single node running processes crucial to the existence of the entire cluster.
+
+Things to keep in mind:
+
+1. With this strategy, the cluster will never split into two independent ones, under any circumstances.
+2. The referee node is a single point of failure for the cluster.
+
+You can configure the minimum required number of reachable nodes needed to maintain operability by using `down-all-if-less-than-nodes`. If the strategy detects that the number of reachable nodes has dropped below that minimum, it will down the entire partition even when the referee node is reachable.
+
+Configuration:
+
+```hocon
+akka.cluster.split-brain-resolver {
+  active-strategy = keep-referee
+
+  keep-referee {
+    # referee address in the form of "akka.tcp://system@hostname:port"
+    address = ""
+    down-all-if-less-than-nodes = 1
+  }
+}
+```
+
+## Relation to Cluster Singleton and Cluster Sharding
+
+Cluster singleton actors and the sharded entities of cluster sharding have their lifecycle managed automatically. This means that there can be only one instance of a target actor at the same time in the cluster, and when it is detected as dead, it will be resurrected on another node. However, it's important that the old instance of the actor is stopped before the new one is spawned, especially when used together with the Akka.Persistence module. Otherwise this may result in corruption of the actor's persistent state and violate actor state consistency.
+
+Since different nodes may apply their split brain decisions at different points in time, it may be good to configure a time margin to make sure that other nodes get enough time to apply their strategies. This can be done using the `akka.cluster.down-removal-margin` setting. The shorter it is, the faster the reaction time of your cluster will be. However, it will also increase the risk of having multiple singleton/sharded entity instances alive at the same time. It's recommended to set this value equal to the `stable-after` option described above.
+
+### Expected Failover Time
+
+If you're going to use a split brain resolver, you can see that the total failover latency is determined by several values. The defaults are:
+
+- failure detection 5 seconds
+- `akka.cluster.split-brain-resolver.stable-after` 20 seconds
+- `akka.cluster.down-removal-margin` 20 seconds
+
+This would result in a total failover time of 45 seconds. While this value is good for a cluster of 100 nodes, you may decide to lower those values for a smaller one, e.g. a cluster of 20 nodes could work well with timeouts of 13s, which would reduce the total failover time to 31 seconds.
\ No newline at end of file
diff --git a/docs/articles/toc.yml b/docs/articles/toc.yml
index 59668234eda..8efb71b37e6 100644
--- a/docs/articles/toc.yml
+++ b/docs/articles/toc.yml
@@ -162,6 +162,8 @@ href: clustering/cluster-sharding.md
       - name: Distributed Data
         href: clustering/distributed-data.md
+      - name: Split Brain Resolver
+        href: clustering/split-brain-resolver.md
   - name: Utilities
     items:
       - name: Event Bus
diff --git a/src/core/Akka.API.Tests/CoreAPISpec.ApproveCluster.approved.txt b/src/core/Akka.API.Tests/CoreAPISpec.ApproveCluster.approved.txt
index fa378e35f29..5a554f77937 100644
--- a/src/core/Akka.API.Tests/CoreAPISpec.ApproveCluster.approved.txt
+++ b/src/core/Akka.API.Tests/CoreAPISpec.ApproveCluster.approved.txt
@@ -231,6 +231,13 @@ namespace Akka.Cluster
         public Akka.Actor.Props DowningActorProps { get; }
         public System.TimeSpan DownRemovalMargin { get; }
     }
+    public sealed class SplitBrainResolver : Akka.Cluster.IDowningProvider
+    {
+        public SplitBrainResolver(Akka.Actor.ActorSystem system) { }
+        public Akka.Actor.Props DowningActorProps { get; }
+        public System.TimeSpan DownRemovalMargin { get; }
+        public System.TimeSpan StableAfter { get; }
+    }
     public class UniqueAddress : System.IComparable<Akka.Cluster.UniqueAddress>, System.IComparable, System.IEquatable<Akka.Cluster.UniqueAddress>
     {
         public UniqueAddress(Akka.Actor.Address address, int uid) { }
diff --git a/src/core/Akka.Cluster.Tests.MultiNode/SplitBrainResolverDowningSpec.cs b/src/core/Akka.Cluster.Tests.MultiNode/SplitBrainResolverDowningSpec.cs
new file mode 100644
index 00000000000..d8593cfadda
--- /dev/null
+++ b/src/core/Akka.Cluster.Tests.MultiNode/SplitBrainResolverDowningSpec.cs
@@ -0,0 +1,117 @@
+//-----------------------------------------------------------------------
+// 
+// Copyright (C) 2009-2016 Lightbend Inc.
+// Copyright (C) 2013-2016 Akka.NET project +// +//----------------------------------------------------------------------- + +using System; +using System.Collections.Immutable; +using System.Linq; +using Akka.Cluster.TestKit; +using Akka.Configuration; +using Akka.Remote.TestKit; +using Akka.Remote.Transport; +using FluentAssertions; + +namespace Akka.Cluster.Tests.MultiNode +{ + public sealed class SplitBrainDowningSpecConfig : MultiNodeConfig + { + public RoleName First { get; } + public RoleName Second { get; } + public RoleName Third { get; } + public RoleName Fourth { get; } + public RoleName Fifth { get; } + + public SplitBrainDowningSpecConfig() + { + First = Role("first"); + Second = Role("second"); + Third = Role("third"); + Fourth = Role("fourth"); + Fifth = Role("fifth"); + + TestTransport = true; + CommonConfig = DebugConfig(false) + .WithFallback(ConfigurationFactory.ParseString(@" + akka { + cluster { + down-removal-margin = 1s + downing-provider-class = ""Akka.Cluster.SplitBrainResolver, Akka.Cluster"" + split-brain-resolver { + stable-after = 1s + active-strategy = keep-majority + } + } + }")) + .WithFallback(MultiNodeClusterSpec.ClusterConfig()); + } + } + + public class SplitBrainResolverDowningSpec : MultiNodeClusterSpec + { + private readonly SplitBrainDowningSpecConfig _config; + + public SplitBrainResolverDowningSpec() : this(new SplitBrainDowningSpecConfig()) + { + } + + protected SplitBrainResolverDowningSpec(SplitBrainDowningSpecConfig config) : base(config, typeof(SplitBrainResolverDowningSpec)) + { + _config = config; + } + + [MultiNodeFact] + public void SplitBrainKeepMajorityDowningSpec() + { + A_Cluster_of_5_nodes_must_reach_initial_convergence(); + A_Cluster_must_detect_network_partition_and_down_minor_part_of_the_cluster(); + } + + private void A_Cluster_of_5_nodes_must_reach_initial_convergence() + { + AwaitClusterUp(Roles.ToArray()); + EnterBarrier("after-1"); + } + + private void A_Cluster_must_detect_network_partition_and_down_minor_part_of_the_cluster() + { + var majority = new[] { _config.First, _config.Second, _config.Third }; + var minority = new[] { _config.Fourth, _config.Fifth }; + + EnterBarrier("before-split"); + + var downed = false; + + RunOn(() => + { + Cluster.RegisterOnMemberRemoved(() => downed = true); + }, minority); + + RunOn(() => + { + foreach (var a in majority) + foreach (var b in minority) + TestConductor.Blackhole(a, b, ThrottleTransportAdapter.Direction.Both).Wait(); + }, _config.First); + + EnterBarrier("after-split"); + + RunOn(() => + { + // side with majority of the nodes must stay up + AwaitMembersUp(majority.Length, canNotBePartOfMemberRing: minority.Select(GetAddress).ToImmutableHashSet()); + AssertLeader(majority); + }, majority); + + RunOn(() => + { + // side with majority of the nodes must stay up, minority must go down + AwaitAssert(() => downed.Should().BeTrue("cluster node on-removed hook has been triggered")); + }, minority); + + EnterBarrier("after-2"); + } + } +} \ No newline at end of file diff --git a/src/core/Akka.Cluster.Tests/SplitBrainResolverConfigSpec.cs b/src/core/Akka.Cluster.Tests/SplitBrainResolverConfigSpec.cs new file mode 100644 index 00000000000..0c180ba9794 --- /dev/null +++ b/src/core/Akka.Cluster.Tests/SplitBrainResolverConfigSpec.cs @@ -0,0 +1,122 @@ +//----------------------------------------------------------------------- +// +// Copyright (C) 2009-2016 Lightbend Inc. 
+// Copyright (C) 2013-2016 Akka.NET project
+// 
+//-----------------------------------------------------------------------
+
+using System;
+using Akka.Actor;
+using Akka.Configuration;
+using FluentAssertions;
+using Xunit;
+
+namespace Akka.Cluster.Tests
+{
+    public class SplitBrainResolverConfigSpec
+    {
+        [Fact]
+        public void StaticQuorum_must_parse_settings_from_config()
+        {
+            var config = ConfigurationFactory.ParseString(@"
+                quorum-size = 5
+                role = test-role");
+            var strategy = new StaticQuorum(config);
+
+            strategy.QuorumSize.Should().Be(5);
+            strategy.Role.Should().Be("test-role");
+        }
+
+        [Fact]
+        public void KeepMajority_must_parse_settings_from_config()
+        {
+            var config = ConfigurationFactory.ParseString(@"
+                role = test-role");
+            var strategy = new KeepMajority(config);
+
+            strategy.Role.Should().Be("test-role");
+        }
+
+        [Fact]
+        public void KeepOldest_must_parse_settings_from_config()
+        {
+            var config = ConfigurationFactory.ParseString(@"
+                down-if-alone = true
+                role = test-role");
+            var strategy = new KeepOldest(config);
+
+            strategy.Role.Should().Be("test-role");
+            strategy.DownIfAlone.Should().Be(true);
+        }
+
+        [Fact]
+        public void KeepReferee_must_parse_settings_from_config()
+        {
+            var config = ConfigurationFactory.ParseString(@"
+                address = ""akka.tcp://system@localhost:5000/""
+                down-all-if-less-than-nodes = 3");
+            var strategy = new KeepReferee(config);
+
+            strategy.Role.Should().BeNull();
+            strategy.DownAllIfLessThanNodes.Should().Be(3);
+            strategy.Address.Should().Be(Address.Parse("akka.tcp://system@localhost:5000/"));
+        }
+
+        [Fact]
+        public void SplitBrainResolver_should_be_picked_up_if_set_as_downing_provider()
+        {
+            var config = ConfigurationFactory.ParseString(@"
+                akka {
+                    actor.provider = cluster
+                    cluster {
+                        down-removal-margin = 10s
+                        downing-provider-class = ""Akka.Cluster.SplitBrainResolver, Akka.Cluster""
+                        split-brain-resolver {
+                            stable-after = 40s
+                            active-strategy = static-quorum
+                            static-quorum.quorum-size = 3
+                        }
+                    }
+                }");
+
+            using (var system = ActorSystem.Create("system", config))
+            {
+                var cluster = Cluster.Get(system);
+
+                cluster.DowningProvider.Should().BeOfType<SplitBrainResolver>();
+                var provider = (SplitBrainResolver)cluster.DowningProvider;
+
+                provider.Strategy.Should().BeOfType<StaticQuorum>();
+                provider.DownRemovalMargin.Should().Be(TimeSpan.FromSeconds(10));
+                provider.StableAfter.Should().Be(TimeSpan.FromSeconds(40));
+            }
+        }
+
+        [Fact]
+        public void SplitBrainResolver_should_work_with_default_timeouts()
+        {
+            var config = ConfigurationFactory.ParseString(@"
+                akka {
+                    actor.provider = cluster
+                    cluster {
+                        downing-provider-class = ""Akka.Cluster.SplitBrainResolver, Akka.Cluster""
+                        split-brain-resolver {
+                            active-strategy = keep-majority
+                        }
+                    }
+                }");
+
+            using (var system = ActorSystem.Create("system", config))
+            {
+                var cluster = Cluster.Get(system);
+
+                cluster.DowningProvider.Should().BeOfType<SplitBrainResolver>();
+                var provider = (SplitBrainResolver)cluster.DowningProvider;
+
+                provider.Strategy.Should().BeOfType<KeepMajority>();
+                provider.DownRemovalMargin.Should().Be(TimeSpan.Zero); // default is zero
+                provider.StableAfter.Should().Be(TimeSpan.FromSeconds(20)); // default is 20s
+            }
+        }
+    }
+}
\ No newline at end of file
diff --git a/src/core/Akka.Cluster.Tests/SplitBrainStrategySpec.cs b/src/core/Akka.Cluster.Tests/SplitBrainStrategySpec.cs
new file mode 100644
index 00000000000..fe24d042ff6
--- /dev/null
+++ b/src/core/Akka.Cluster.Tests/SplitBrainStrategySpec.cs
@@ -0,0 +1,233 @@
+//-----------------------------------------------------------------------
+// 
+// Copyright (C) 2009-2016 Lightbend Inc.
+// Copyright (C) 2013-2016 Akka.NET project
+// 
+//-----------------------------------------------------------------------
+
+using System;
+using System.Collections.Immutable;
+using Akka.Actor;
+using Akka.TestKit;
+using Akka.Util;
+using FluentAssertions;
+using Xunit;
+
+namespace Akka.Cluster.Tests
+{
+    public class SplitBrainStrategySpec
+    {
+        private static Member Member(Address address, int upNumber = 1, MemberStatus status = MemberStatus.Up, string role = null) =>
+            new Member(new UniqueAddress(address, ThreadLocalRandom.Current.Next()), upNumber, status, role == null ? ImmutableHashSet<string>.Empty : ImmutableHashSet.Create(role));
+
+        private static ImmutableSortedSet<Member> Members(params Member[] members) => ImmutableSortedSet.CreateRange(Akka.Cluster.Member.AgeOrdering, members);
+
+        private static readonly Address a = new Address("akka.tcp", "system", "localhost", 10000);
+        private static readonly Address b = new Address("akka.tcp", "system", "localhost", 10001);
+        private static readonly Address c = new Address("akka.tcp", "system", "localhost", 10002);
+        private static readonly Address d = new Address("akka.tcp", "system", "localhost", 10003);
+        private static readonly Address e = new Address("akka.tcp", "system", "localhost", 10004);
+        private static readonly Address f = new Address("akka.tcp", "system", "localhost", 10005);
+
+        [Fact]
+        public void StaticQuorum_must_down_unreachable_nodes_if_remaining_size_is_equal_quorum_size()
+        {
+            var unreachable = Members(Member(e));
+            var remaining = Members(Member(a), Member(b), Member(c), Member(d));
+
+            var strategy = new StaticQuorum(quorumSize: 4, role: null);
+            strategy.Apply(new NetworkPartitionContext(unreachable, remaining)).Should().Equal(unreachable);
+        }
+
+        [Fact]
+        public void StaticQuorum_must_down_unreachable_nodes_if_remaining_size_is_greater_quorum_size()
+        {
+            var unreachable = Members(Member(e));
+            var remaining = Members(Member(a), Member(b), Member(c), Member(d));
+
+            var strategy = new StaticQuorum(quorumSize: 3, role: null);
+            strategy.Apply(new NetworkPartitionContext(unreachable, remaining)).Should().Equal(unreachable);
+        }
+
+
+        [Fact]
+        public void StaticQuorum_must_down_remaining_nodes_if_remaining_size_is_less_than_quorum_size()
+        {
+            var unreachable = Members(Member(e), Member(d));
+            var remaining = Members(Member(a), Member(b), Member(c));
+
+            var strategy = new StaticQuorum(quorumSize: 4, role: null);
+            strategy.Apply(new NetworkPartitionContext(unreachable, remaining)).Should().Equal(remaining);
+        }
+
+        [Fact]
+        public void StaticQuorum_must_limit_node_counts_to_role_if_provided()
+        {
+            const string role = "test";
+            var unreachable = Members(Member(e), Member(d));
+            var remaining = Members(Member(a, role: role), Member(b, role: role), Member(c));
+
+            var strategy = new StaticQuorum(quorumSize: 3, role: role);
+            // quorum size is 3, but only 2 remaining nodes have the configured role
+            strategy.Apply(new NetworkPartitionContext(unreachable, remaining)).Should().Equal(remaining);
+        }
+
+        [Fact]
+        public void KeepMajority_must_down_unreachable_nodes_if_remaining_nodes_have_majority()
+        {
+            var unreachable = Members(Member(e), Member(d));
+            var remaining = Members(Member(a), Member(b), Member(c));
+
+            var strategy = new KeepMajority();
+            strategy.Apply(new NetworkPartitionContext(unreachable, remaining)).Should().Equal(unreachable);
+        }
+
+        [Fact]
+        public void KeepMajority_must_down_remaining_nodes_if_unreachable_nodes_have_majority()
+        {
+            var unreachable = Members(Member(e), Member(d),
Member(c)); + var remaining = Members(Member(a), Member(b)); + + var strategy = new KeepMajority(); + strategy.Apply(new NetworkPartitionContext(unreachable, remaining)).Should().Equal(remaining); + } + + [Fact] + public void KeepMajority_must_keep_the_part_with_the_lowest_nodes_address_in_case_of_equal_size() + { + var unreachable = Members(Member(e), Member(d)); + var remaining = Members(Member(a), Member(b)); // `a` is the lowest address + + var strategy = new KeepMajority(); + strategy.Apply(new NetworkPartitionContext(unreachable, remaining)).Should().Equal(unreachable); + } + + [Fact] + public void KeepMajority_must_down_unreachable_nodes_if_remaining_nodes_have_majority_role_based() + { + const string role = "test"; + var unreachable = Members(Member(e, role: role), Member(d), Member(c)); + var remaining = Members(Member(a, role: role), Member(b, role: role)); + + var strategy = new KeepMajority(role); + strategy.Apply(new NetworkPartitionContext(unreachable, remaining)).Should().Equal(unreachable); + } + + [Fact] + public void KeepMajority_must_down_remaining_nodes_if_unreachable_nodes_have_majority_role_based() + { + const string role = "test"; + var unreachable = Members(Member(e, role: role), Member(d, role: role)); + var remaining = Members(Member(a, role: role), Member(b), Member(c)); + + var strategy = new KeepMajority(role); + strategy.Apply(new NetworkPartitionContext(unreachable, remaining)).Should().Equal(remaining); + } + + [Fact] + public void KeepMajority_must_keep_the_part_with_the_lowest_nodes_address_in_case_of_equal_size_role_based() + { + const string role = "test"; + var unreachable = Members(Member(e, role: role), Member(d, role: role)); + var remaining = Members(Member(a, role: role), Member(b, role: role), Member(c)); // `a` is the lowest address + + var strategy = new KeepMajority(role); + strategy.Apply(new NetworkPartitionContext(unreachable, remaining)).Should().Equal(unreachable); + } + + [Fact] + public void KeepOldest_must_down_remaining_if_oldest_was_unreachable() + { + var unreachable = Members(Member(e, upNumber: 1), Member(d, upNumber: 5)); + var remaining = Members(Member(a, upNumber: 2), Member(b, upNumber: 3), Member(c, upNumber: 4)); + + var strategy = new KeepOldest(downIfAlone: false); + strategy.Apply(new NetworkPartitionContext(unreachable, remaining)).Should().Equal(remaining); + } + + [Fact] + public void KeepOldest_must_down_unreachable_nodes_if_oldest_was_found_in_remaining() + { + var unreachable = Members(Member(e, upNumber: 2), Member(d, upNumber: 5)); + var remaining = Members(Member(a, upNumber: 1), Member(b, upNumber: 3), Member(c, upNumber: 4)); + + var strategy = new KeepOldest(downIfAlone: false); + strategy.Apply(new NetworkPartitionContext(unreachable, remaining)).Should().Equal(unreachable); + } + + [Fact] + public void KeepOldest_must_down_remaining_if_oldest_was_unreachable_role_based() + { + const string role = "test"; + var unreachable = Members(Member(e, upNumber: 2, role: role), Member(d, upNumber: 5)); + var remaining = Members(Member(a, upNumber: 1), Member(b, upNumber: 3, role: role), Member(c, upNumber: 4, role: role)); + + var strategy = new KeepOldest(downIfAlone: false, role: role); + strategy.Apply(new NetworkPartitionContext(unreachable, remaining)).Should().Equal(remaining); + } + + [Fact] + public void KeepOldest_must_down_unreachable_nodes_if_oldest_was_found_in_remaining_role_based() + { + const string role = "test"; + var unreachable = Members(Member(e, upNumber: 1), Member(d, upNumber: 3, role: role)); + 
var remaining = Members(Member(a, upNumber: 2, role: role), Member(b, upNumber: 5, role: role), Member(c, upNumber: 4)); + + var strategy = new KeepOldest(downIfAlone: false, role: role); + strategy.Apply(new NetworkPartitionContext(unreachable, remaining)).Should().Equal(unreachable); + } + + [Fact] + public void KeepOldest_when_downIfAlone_must_down_oldest_if_it_was_the_only_unreachable_node() + { + var unreachable = Members(Member(e, upNumber: 1)); + var remaining = Members(Member(a, upNumber: 2), Member(b, upNumber: 3), Member(c, upNumber: 4)); + + var strategy = new KeepOldest(downIfAlone: true); + strategy.Apply(new NetworkPartitionContext(unreachable, remaining)).Should().Equal(unreachable); + } + + [Fact] + public void KeepOldest_when_downIfAlone_must_down_oldest_if_it_was_the_only_remaining_node() + { + var unreachable = Members(Member(e, upNumber: 2), Member(b, upNumber: 3), Member(c, upNumber: 4)); + var remaining = Members(Member(a, upNumber: 1)); + + var strategy = new KeepOldest(downIfAlone: true); + strategy.Apply(new NetworkPartitionContext(unreachable, remaining)).Should().Equal(remaining); + } + + [Fact] + public void KeepReferee_must_down_remaining_if_referee_node_was_unreachable() + { + var referee = a; + var unreachable = Members(Member(referee), Member(d)); + var remaining = Members(Member(b), Member(c), Member(e)); + + var strategy = new KeepReferee(address: referee, downAllIfLessThanNodes: 1); + strategy.Apply(new NetworkPartitionContext(unreachable, remaining)).Should().Equal(remaining); + } + + [Fact] + public void KeepReferee_must_down_unreachable_if_referee_node_was_seen_in_remaining() + { + var referee = a; + var unreachable = Members(Member(d), Member(e)); + var remaining = Members(Member(referee), Member(b), Member(c)); + + var strategy = new KeepReferee(address: referee, downAllIfLessThanNodes: 1); + strategy.Apply(new NetworkPartitionContext(unreachable, remaining)).Should().Equal(unreachable); + } + + [Fact] + public void KeepReferee_must_down_all_nodes_if_referee_node_was_in_remaining_but_DownAllIfLessThanNodes_was_not_reached() + { + var referee = a; + var unreachable = Members(Member(d), Member(e), Member(c)); + var remaining = Members(Member(referee), Member(b)); + + var strategy = new KeepReferee(address: referee, downAllIfLessThanNodes: 3); + strategy.Apply(new NetworkPartitionContext(unreachable, remaining)).Should().Equal(unreachable.Union(remaining)); + } + } +} \ No newline at end of file diff --git a/src/core/Akka.Cluster/AutoDown.cs b/src/core/Akka.Cluster/AutoDown.cs index 4840c655e3e..3f1c1d48b33 100644 --- a/src/core/Akka.Cluster/AutoDown.cs +++ b/src/core/Akka.Cluster/AutoDown.cs @@ -192,55 +192,36 @@ protected override void PostStop() /// TBD protected override void OnReceive(object message) { - var state = message as ClusterEvent.CurrentClusterState; - if (state != null) + switch (message) { - _leader = state.Leader != null && state.Leader.Equals(SelfAddress); - foreach (var m in state.Unreachable) UnreachableMember(m); - return; - } - - var unreachableMember = message as ClusterEvent.UnreachableMember; - if (unreachableMember != null) - { - UnreachableMember(unreachableMember.Member); - return; - } - - var reachableMember = message as ClusterEvent.ReachableMember; - if (reachableMember != null) - { - Remove(reachableMember.Member.UniqueAddress); - return; - } - var memberRemoved = message as ClusterEvent.MemberRemoved; - if (memberRemoved != null) - { - Remove(memberRemoved.Member.UniqueAddress); - return; - } - - var leaderChanged = 
message as ClusterEvent.LeaderChanged;
-            if (leaderChanged != null)
-            {
-                _leader = leaderChanged.Leader != null && leaderChanged.Leader.Equals(SelfAddress);
-                if (_leader)
-                {
-                    foreach(var node in _pendingUnreachable) Down(node.Address);
-                    _pendingUnreachable = ImmutableHashSet.Create<UniqueAddress>();
-                }
-                return;
-            }
-
-            var unreachableTimeout = message as AutoDown.UnreachableTimeout;
-            if (unreachableTimeout != null)
-            {
-                if (_scheduledUnreachable.ContainsKey(unreachableTimeout.Node))
-                {
-                    _scheduledUnreachable = _scheduledUnreachable.Remove(unreachableTimeout.Node);
-                    DownOrAddPending(unreachableTimeout.Node);
-                }
-                return;
+                case ClusterEvent.CurrentClusterState state:
+                    _leader = state.Leader != null && state.Leader.Equals(SelfAddress);
+                    foreach (var m in state.Unreachable) UnreachableMember(m);
+                    return;
+                case ClusterEvent.UnreachableMember unreachableMember:
+                    UnreachableMember(unreachableMember.Member);
+                    return;
+                case ClusterEvent.ReachableMember reachableMember:
+                    Remove(reachableMember.Member.UniqueAddress);
+                    return;
+                case ClusterEvent.MemberRemoved memberRemoved:
+                    Remove(memberRemoved.Member.UniqueAddress);
+                    return;
+                case ClusterEvent.LeaderChanged leaderChanged:
+                    _leader = leaderChanged.Leader != null && leaderChanged.Leader.Equals(SelfAddress);
+                    if (_leader)
+                    {
+                        foreach(var node in _pendingUnreachable) Down(node.Address);
+                        _pendingUnreachable = ImmutableHashSet.Create<UniqueAddress>();
+                    }
+                    return;
+                case AutoDown.UnreachableTimeout unreachableTimeout:
+                    if (_scheduledUnreachable.ContainsKey(unreachableTimeout.Node))
+                    {
+                        _scheduledUnreachable = _scheduledUnreachable.Remove(unreachableTimeout.Node);
+                        DownOrAddPending(unreachableTimeout.Node);
+                    }
+                    return;
             }
         }
diff --git a/src/core/Akka.Cluster/Configuration/Cluster.conf b/src/core/Akka.Cluster/Configuration/Cluster.conf
index 770cc8e914a..677b7229b37 100644
--- a/src/core/Akka.Cluster/Configuration/Cluster.conf
+++ b/src/core/Akka.Cluster/Configuration/Cluster.conf
@@ -166,6 +166,44 @@ akka {
       }
     }
 
+    split-brain-resolver {
+      # Enable one of the available strategies (see descriptions below):
+      # static-quorum, keep-majority, keep-oldest, keep-referee
+      active-strategy = off
+
+      # Decision is taken by the strategy when there has been no membership or
+      # reachability changes for this duration, i.e. the cluster state is stable.
+      stable-after = 20s
+
+      static-quorum {
+        # minimum number of nodes that the cluster must have
+        quorum-size = undefined
+
+        # if the 'role' is defined the decision is based only on members with that 'role'
+        role = ""
+      }
+
+      keep-majority {
+        # if the 'role' is defined the decision is based only on members with that 'role'
+        role = ""
+      }
+
+      keep-oldest {
+        # Enable downing of the oldest node when it is partitioned from all other nodes
+        down-if-alone = on
+
+        # if the 'role' is defined the decision is based only on members with that 'role',
+        # i.e. using the oldest member (singleton) within the nodes with that role
+        role = ""
+      }
+
+      keep-referee {
+        # referee address in the form of "akka.tcp://system@hostname:port"
+        address = ""
+        down-all-if-less-than-nodes = 1
+      }
+    }
+
     # If the tick-duration of the default scheduler is longer than the
     # tick-duration configured here a dedicated scheduler will be used for
     # periodic tasks of the cluster, otherwise the default scheduler is used.
diff --git a/src/core/Akka.Cluster/SplitBrainResolver.cs b/src/core/Akka.Cluster/SplitBrainResolver.cs
new file mode 100644
index 00000000000..79783141ddb
--- /dev/null
+++ b/src/core/Akka.Cluster/SplitBrainResolver.cs
@@ -0,0 +1,344 @@
+//-----------------------------------------------------------------------
+// 
+// Copyright (C) 2009-2016 Lightbend Inc.
+// Copyright (C) 2013-2016 Akka.NET project
+// 
+//-----------------------------------------------------------------------
+
+using System;
+using System.Collections.Generic;
+using System.Collections.Immutable;
+using System.Linq;
+using Akka.Actor;
+using Akka.Configuration;
+using Akka.Event;
+
+namespace Akka.Cluster
+{
+    public sealed class SplitBrainResolver : IDowningProvider
+    {
+        private readonly ClusterSettings _clusterSettings;
+
+        public SplitBrainResolver(ActorSystem system)
+        {
+            _clusterSettings = Cluster.Get(system).Settings;
+            var config = system.Settings.Config.GetConfig("akka.cluster.split-brain-resolver");
+
+            StableAfter = config.GetTimeSpan("stable-after");
+            Strategy = ResolveSplitBrainStrategy(config);
+        }
+
+        public TimeSpan DownRemovalMargin => _clusterSettings.DownRemovalMargin;
+        public TimeSpan StableAfter { get; }
+        public Props DowningActorProps => SplitBrainDecider.Props(StableAfter, Strategy);
+
+        internal ISplitBrainStrategy Strategy { get; }
+
+        private ISplitBrainStrategy ResolveSplitBrainStrategy(Config config)
+        {
+            var activeStrategy = config.GetString("active-strategy");
+            switch (activeStrategy)
+            {
+                case "static-quorum": return new StaticQuorum(config.GetConfig("static-quorum"));
+                case "keep-majority": return new KeepMajority(config.GetConfig("keep-majority"));
+                case "keep-oldest": return new KeepOldest(config.GetConfig("keep-oldest"));
+                case "keep-referee": return new KeepReferee(config.GetConfig("keep-referee"));
+                default: throw new ArgumentException($"`akka.cluster.split-brain-resolver.active-strategy` setting not recognized: [{activeStrategy}]. Available options are: static-quorum, keep-majority, keep-oldest, keep-referee.");
+            }
+        }
+    }
+
+    internal sealed class NetworkPartitionContext
+    {
+        /// <summary>
+        /// A set of nodes that have been detected as unreachable since cluster state stability has been reached.
+        /// </summary>
+        public ImmutableSortedSet<Member> Unreachable { get; }
+
+        /// <summary>
+        /// A set of nodes that have remained connected to the current cluster node since the last cluster state
+        /// stability has been reached.
+        /// </summary>
+        public ImmutableSortedSet<Member> Remaining { get; }
+
+        public NetworkPartitionContext(ImmutableSortedSet<Member> unreachable, ImmutableSortedSet<Member> remaining)
+        {
+            Unreachable = unreachable;
+            Remaining = remaining;
+        }
+    }
+
+    /// <summary>
+    /// A split brain resolver strategy used to determine which nodes should be downed when a network partition has been detected.
+    /// </summary>
+    internal interface ISplitBrainStrategy
+    {
+        /// <summary>
+        /// If defined, this strategy is expected to be applied only among nodes with that role.
+        /// </summary>
+        string Role { get; }
+
+        /// <summary>
+        /// Determines the behavior of the current cluster node in the face of a network partition.
+        /// Returns a list of cluster nodes to be downed (by the current node).
+        /// </summary>
+        IEnumerable<Member> Apply(NetworkPartitionContext context);
+    }
+
+    internal sealed class StaticQuorum : ISplitBrainStrategy
+    {
+        public StaticQuorum(Config config) : this(
+            quorumSize: config.GetInt("quorum-size"),
+            role: config.GetString("role"))
+        { }
+
+        public StaticQuorum(int quorumSize, string role)
+        {
+            QuorumSize = quorumSize;
+            Role = role;
+        }
+
+        public int QuorumSize { get; }
+        public string Role { get; }
+
+        public IEnumerable<Member> Apply(NetworkPartitionContext context)
+        {
+            var remainingCount = string.IsNullOrEmpty(Role)
+                ? context.Remaining.Count
+                : context.Remaining.Count(m => m.HasRole(Role));
+
+            return remainingCount < QuorumSize
+                ? context.Remaining
+                : context.Unreachable;
+        }
+        public override string ToString() => $"StaticQuorum(quorumSize: {QuorumSize}, role: '{Role}')";
+    }
+
+    internal sealed class KeepMajority : ISplitBrainStrategy
+    {
+        public KeepMajority(Config config) : this(
+            role: config.GetString("role"))
+        { }
+
+        public KeepMajority(string role = null)
+        {
+            Role = role;
+        }
+
+        public string Role { get; }
+
+        public IEnumerable<Member> Apply(NetworkPartitionContext context)
+        {
+            var remaining = MembersWithRole(context.Remaining);
+            var unreachable = MembersWithRole(context.Unreachable);
+
+            if (remaining.Count < unreachable.Count) return context.Remaining;
+            else if (remaining.Count > unreachable.Count) return context.Unreachable;
+            else
+            {
+                // if the parts are of equal size the part containing the node with the lowest address is kept.
+                var oldest = remaining.Union(unreachable).First();
+                return remaining.Contains(oldest)
+                    ? context.Unreachable
+                    : context.Remaining;
+            }
+        }
+
+        private ImmutableSortedSet<Member> MembersWithRole(ImmutableSortedSet<Member> members) => string.IsNullOrEmpty(Role)
+            ? members
+            : members.Where(m => m.HasRole(Role)).ToImmutableSortedSet();
+
+        public override string ToString() => $"KeepMajority(role: '{Role}')";
+    }
+
+    internal sealed class KeepOldest : ISplitBrainStrategy
+    {
+        public KeepOldest(Config config) : this(
+            downIfAlone: config.GetBoolean("down-if-alone", true),
+            role: config.GetString("role"))
+        { }
+
+        public KeepOldest(bool downIfAlone, string role = null)
+        {
+            DownIfAlone = downIfAlone;
+            Role = role;
+        }
+
+        public string Role { get; }
+        public bool DownIfAlone { get; }
+
+        public IEnumerable<Member> Apply(NetworkPartitionContext context)
+        {
+            var remaining = MembersWithRole(context.Remaining);
+            var unreachable = MembersWithRole(context.Unreachable);
+
+            var oldest = remaining.Union(unreachable).ToImmutableSortedSet(Member.AgeOrdering).First();
+            if (remaining.Contains(oldest))
+            {
+                return DownIfAlone && context.Remaining.Count == 1 // oldest is the current node, and it's alone
+                    ? context.Remaining
+                    : context.Unreachable;
+            }
+            else if (DownIfAlone && context.Unreachable.Count == 1) // oldest is unreachable, but it's alone
+            {
+                return context.Unreachable;
+            }
+            else return context.Remaining;
+        }
+
+        private ImmutableSortedSet<Member> MembersWithRole(ImmutableSortedSet<Member> members) => string.IsNullOrEmpty(Role)
+            ? members
+            : members.Where(m => m.HasRole(Role)).ToImmutableSortedSet();
+
+        public override string ToString() => $"KeepOldest(downIfAlone: {DownIfAlone}, role: '{Role}')";
+    }
+
+    internal sealed class KeepReferee : ISplitBrainStrategy
+    {
+        public KeepReferee(Config config) : this(
+            address: Address.Parse(config.GetString("address")),
+            downAllIfLessThanNodes: config.GetInt("down-all-if-less-than-nodes", 1))
+        { }
+
+        public KeepReferee(Address address, int downAllIfLessThanNodes)
+        {
+            Address = address;
+            DownAllIfLessThanNodes = downAllIfLessThanNodes;
+        }
+
+        public Address Address { get; }
+        public int DownAllIfLessThanNodes { get; }
+        public string Role => null;
+
+        public IEnumerable<Member> Apply(NetworkPartitionContext context)
+        {
+            var isRefereeReachable = context.Remaining.Any(m => m.Address == Address);
+
+            if (!isRefereeReachable) return context.Remaining; // referee is unreachable
+            else if (context.Remaining.Count < DownAllIfLessThanNodes) return context.Remaining.Union(context.Unreachable); // referee is reachable but there are too few remaining nodes
+            else return context.Unreachable;
+        }
+
+        public override string ToString() => $"KeepReferee(refereeAddress: {Address}, downIfLessThanNodes: {DownAllIfLessThanNodes})";
+    }
+
+    internal sealed class SplitBrainDecider : UntypedActor
+    {
+        #region internal classes
+
+        private sealed class StabilityReached
+        {
+            public static readonly StabilityReached Instance = new StabilityReached();
+            private StabilityReached() { }
+        }
+
+        #endregion
+
+        public static Actor.Props Props(TimeSpan stableAfter, ISplitBrainStrategy strategy) =>
+            Actor.Props.Create(() => new SplitBrainDecider(stableAfter, strategy)).WithDeploy(Deploy.Local);
+
+        private readonly Cluster _cluster;
+        private readonly TimeSpan _stabilityTimeout;
+        private readonly ISplitBrainStrategy _strategy;
+
+        private ImmutableSortedSet<Member> _reachable = ImmutableSortedSet<Member>.Empty;
+        private ImmutableSortedSet<Member> _unreachable = ImmutableSortedSet<Member>.Empty;
+        private ICancelable _stabilityTask;
+        private ILoggingAdapter _log;
+
+        public SplitBrainDecider(TimeSpan stableAfter, ISplitBrainStrategy strategy)
+        {
+            if (strategy == null) throw new ArgumentNullException(nameof(strategy));
+
+            _stabilityTimeout = stableAfter;
+            _strategy = strategy;
+            _cluster = Cluster.Get(Context.System);
+        }
+
+        public ILoggingAdapter Log => _log ??
(_log = Context.GetLogger()); + + protected override void PreStart() + { + base.PreStart(); + _cluster.Subscribe(Self, typeof(ClusterEvent.IMemberEvent), typeof(ClusterEvent.IReachabilityEvent)); + } + + protected override void PostStop() + { + _cluster.Unsubscribe(Self); + base.PostStop(); + } + + protected override void OnReceive(object message) + { + switch (message) + { + case ClusterEvent.CurrentClusterState state: + ResetStabilityTimeout(); + _reachable = state.Members.Where(m => m.Status == MemberStatus.Up).ToImmutableSortedSet(Member.AgeOrdering); + _unreachable = state.Unreachable.ToImmutableSortedSet(Member.AgeOrdering); + return; + case ClusterEvent.IMemberEvent memberEvent: + ResetStabilityTimeout(); + switch (memberEvent) + { + case ClusterEvent.MemberUp up: + _reachable = _reachable.Add(up.Member); + break; + case ClusterEvent.MemberRemoved removed: + _reachable = _reachable.Remove(removed.Member); + _unreachable = _unreachable.Remove(removed.Member); + break; + } + return; + case ClusterEvent.IReachabilityEvent reachabilityEvent: + ResetStabilityTimeout(); + switch (reachabilityEvent) + { + case ClusterEvent.ReachableMember reachable: + _reachable = _reachable.Add(reachable.Member); + _unreachable = _unreachable.Remove(reachable.Member); + break; + case ClusterEvent.UnreachableMember unreachable: + _reachable = _reachable.Remove(unreachable.Member); + _unreachable = _unreachable.Add(unreachable.Member); + break; + } + return; + case StabilityReached _: + HandleStabilityReached(); + return; + } + } + + private void HandleStabilityReached() + { + if (Log.IsInfoEnabled) + { + Log.Info("A network partition detected - unreachable nodes: [{0}], remaining: [{1}]", string.Join(", ", _unreachable.Select(m => m.Address)), string.Join(", ", _reachable.Select(m => m.Address))); + } + + var context = new NetworkPartitionContext(_unreachable, _reachable); + var nodesToDown = _strategy.Apply(context).ToImmutableArray(); + + if (nodesToDown.Length > 0) + { + if (Log.IsInfoEnabled) + { + Log.Info("A network partition has been detected. {0} decided to down following nodes: [{1}]", _strategy, string.Join(", ", nodesToDown)); + } + + foreach (var member in nodesToDown) + { + _cluster.Down(member.Address); + } + } + } + + private void ResetStabilityTimeout() + { + _stabilityTask?.Cancel(); + _stabilityTask = Context.System.Scheduler.ScheduleTellOnceCancelable(_stabilityTimeout, Self, StabilityReached.Instance, ActorRefs.NoSender); + } + } +} \ No newline at end of file