Implement adaptive replica selection #26128
Changes from all commits
@@ -29,18 +29,24 @@
import org.elasticsearch.common.util.set.Sets;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.shard.ShardId;
import org.elasticsearch.node.ResponseCollectorService;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Optional;
import java.util.OptionalDouble;
import java.util.Set;
import java.util.stream.Collectors;

import static java.util.Collections.emptyMap;

@@ -261,6 +267,165 @@ public ShardIterator activeInitializingShardsIt(int seed) {
        return new PlainShardIterator(shardId, ordered);
    }

    /**
     * Returns an iterator over active and initializing shards, ordered by the adaptive replica
     * selection formula, making sure that the order is random among active shards with the same
     * (or no) rank, and that initializing shards are the last to iterate through.
     */
    public ShardIterator activeInitializingShardsRankedIt(@Nullable ResponseCollectorService collector,
                                                          @Nullable Map<String, Long> nodeSearchCounts) {
        final int seed = shuffler.nextSeed();
        if (allInitializingShards.isEmpty()) {
            return new PlainShardIterator(shardId,
                rankShardsAndUpdateStats(shuffler.shuffle(activeShards, seed), collector, nodeSearchCounts));
        }

        ArrayList<ShardRouting> ordered = new ArrayList<>(activeShards.size() + allInitializingShards.size());
        List<ShardRouting> rankedActiveShards =
            rankShardsAndUpdateStats(shuffler.shuffle(activeShards, seed), collector, nodeSearchCounts);
        ordered.addAll(rankedActiveShards);
        List<ShardRouting> rankedInitializingShards =
            rankShardsAndUpdateStats(allInitializingShards, collector, nodeSearchCounts);
        ordered.addAll(rankedInitializingShards);
        return new PlainShardIterator(shardId, ordered);
    }
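
For orientation, a hedged sketch of how a caller might invoke this iterator; `shardRoutingTable` (an IndexShardRoutingTable) and `collector` (a ResponseCollectorService) are assumed to be in scope, and the counts are made up:

```java
// Per-node outstanding search counts, captured once per query
// (per the comments in this diff, in TransportSearchAction).
Map<String, Long> nodeSearchCounts = new HashMap<>();
nodeSearchCounts.put("node-1", 3L);
nodeSearchCounts.put("node-2", 1L);

ShardIterator iter = shardRoutingTable.activeInitializingShardsRankedIt(collector, nodeSearchCounts);
ShardRouting preferred = iter.nextOrNull(); // best-ranked copy comes first
```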

    private static Set<String> getAllNodeIds(final List<ShardRouting> shards) {
        final Set<String> nodeIds = new HashSet<>();
        for (ShardRouting shard : shards) {
            nodeIds.add(shard.currentNodeId());
        }
        return nodeIds;
    }

    private static Map<String, Optional<ResponseCollectorService.ComputedNodeStats>>
            getNodeStats(final Set<String> nodeIds, final ResponseCollectorService collector) {

        final Map<String, Optional<ResponseCollectorService.ComputedNodeStats>> nodeStats = new HashMap<>(nodeIds.size());
        for (String nodeId : nodeIds) {
            nodeStats.put(nodeId, collector.getNodeStatistics(nodeId));
        }
        return nodeStats;
    }

    private static Map<String, Double> rankNodes(final Map<String, Optional<ResponseCollectorService.ComputedNodeStats>> nodeStats,
                                                 final Map<String, Long> nodeSearchCounts) {
        final Map<String, Double> nodeRanks = new HashMap<>(nodeStats.size());
        for (Map.Entry<String, Optional<ResponseCollectorService.ComputedNodeStats>> entry : nodeStats.entrySet()) {
            Optional<ResponseCollectorService.ComputedNodeStats> maybeStats = entry.getValue();
            maybeStats.ifPresent(stats -> {
                final String nodeId = entry.getKey();
                nodeRanks.put(nodeId, stats.rank(nodeSearchCounts.getOrDefault(nodeId, 1L)));
            });
        }
        return nodeRanks;
    }
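
The rank itself comes from ResponseCollectorService.ComputedNodeStats.rank(), which is not part of this hunk. As a hedged sketch of the idea, a C3-style rank (from the paper this feature draws on) combines EWMAs of response time, service time, and queue size roughly as follows; the names and scaling below are illustrative assumptions, not the exact Elasticsearch implementation:

```java
// Illustrative C3-style rank: lower is better.
static double rank(double ewmaResponseTime, double ewmaServiceTime,
                   double ewmaQueueSize, long outstandingRequests, int numClients) {
    // Estimated queue size, compensated for this client's in-flight
    // requests scaled by the number of clients in the system.
    double qHat = 1 + outstandingRequests * numClients + ewmaQueueSize;
    double muBarInv = 1.0 / ewmaServiceTime;
    // Cubing the queue term penalizes long queues superlinearly.
    return ewmaResponseTime - muBarInv + Math.pow(qHat, 3) * muBarInv;
}
```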

    /**
     * Adjusts the collected stats for all other (non-winning) nodes. In the original ranking paper there is no need
     * to adjust other nodes' stats because Cassandra sends occasional requests to all copies of the data, so their
     * stats are updated during that broadcast phase. Elasticsearch, however, has no such broadcast-to-all behavior.
     * To prevent a node that once received a high (poor) score from never being sent requests again, we must ensure
     * that its score eventually returns to a more normal value so it can again be a candidate for serving requests.
     *
     * This adjustment takes the "winning" node's statistics and records, for each non-winning node, the average of
     * the two. Say the winning node had a queue size of 10 and a non-winning node a queue size of 18: the average is
     * (10 + 18) / 2 = 14, so the non-winning node gets a sample with queue size 14 added to its statistics. The same
     * is done for the response time and service time.
     */
    private static void adjustStats(final ResponseCollectorService collector,
                                    final Map<String, Optional<ResponseCollectorService.ComputedNodeStats>> nodeStats,
                                    final String minNodeId,
                                    final ResponseCollectorService.ComputedNodeStats minStats) {
        if (minNodeId != null) {
            for (Map.Entry<String, Optional<ResponseCollectorService.ComputedNodeStats>> entry : nodeStats.entrySet()) {
                final String nodeId = entry.getKey();
                final Optional<ResponseCollectorService.ComputedNodeStats> maybeStats = entry.getValue();
                if (nodeId.equals(minNodeId) == false && maybeStats.isPresent()) {
                    final ResponseCollectorService.ComputedNodeStats stats = maybeStats.get();
                    final int updatedQueue = (minStats.queueSize + stats.queueSize) / 2;
                    final long updatedResponse = (long) (minStats.responseTime + stats.responseTime) / 2;
                    final long updatedService = (long) (minStats.serviceTime + stats.serviceTime) / 2;

Review thread on the casts above:
Contributor: nit: the casts should not be necessary?
Member (Author): They are required, without them you get
Contributor: oh, I had not realized we stored those times as doubles
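
To illustrate the point under discussion: the response and service times are stored as doubles, so an explicit narrowing cast is needed before the averaged value can be used as a long. A minimal standalone sketch (variable names are made up):

```java
public class CastDemo {
    public static void main(String[] args) {
        double winnerResponse = 100.0;
        double otherResponse = 300.0;
        // The cast binds to the parenthesized sum; the division that
        // follows is then long division. Without the cast the whole
        // expression stays a double and cannot be assigned to a long
        // ("incompatible types: possible lossy conversion from double to long").
        long updatedResponse = (long) (winnerResponse + otherResponse) / 2;
        System.out.println(updatedResponse); // 200
    }
}
```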

                    collector.addNodeStatistics(nodeId, updatedQueue, updatedResponse, updatedService);
                }
            }
        }
    }

    private static List<ShardRouting> rankShardsAndUpdateStats(List<ShardRouting> shards, final ResponseCollectorService collector,
                                                               final Map<String, Long> nodeSearchCounts) {
        if (collector == null || nodeSearchCounts == null || shards.size() <= 1) {
            return shards;
        }

        // Retrieve which nodes we can potentially send the query to
        final Set<String> nodeIds = getAllNodeIds(shards);
        final int nodeCount = nodeIds.size();

        final Map<String, Optional<ResponseCollectorService.ComputedNodeStats>> nodeStats = getNodeStats(nodeIds, collector);

        // Compute a rank for every node that has collected stats
        final Map<String, Double> nodeRanks = rankNodes(nodeStats, nodeSearchCounts);

        // sort all shards based on the node rank
        ArrayList<ShardRouting> sortedShards = new ArrayList<>(shards);
        Collections.sort(sortedShards, new NodeRankComparator(nodeRanks));

        // adjust the non-winner nodes' stats so they will get a chance to receive queries
        if (sortedShards.size() > 1) {
            ShardRouting minShard = sortedShards.get(0);
            // If the winning shard is not started we are ranking initializing
            // shards, don't bother to do adjustments
            if (minShard.started()) {
                String minNodeId = minShard.currentNodeId();
                Optional<ResponseCollectorService.ComputedNodeStats> maybeMinStats = nodeStats.get(minNodeId);
                if (maybeMinStats.isPresent()) {
                    adjustStats(collector, nodeStats, minNodeId, maybeMinStats.get());
                    // Increase the number of searches for the "winning" node by one.
                    // Note that this doesn't affect the "real" counts; it only
                    // affects the node search counts, which are captured once
                    // per query in TransportSearchAction
                    nodeSearchCounts.compute(minNodeId, (id, conns) -> conns == null ? 1 : conns + 1);
                }
            }
        }

        return sortedShards;
    }

    private static class NodeRankComparator implements Comparator<ShardRouting> {
        private final Map<String, Double> nodeRanks;

        NodeRankComparator(Map<String, Double> nodeRanks) {
            this.nodeRanks = nodeRanks;
        }

        @Override
        public int compare(ShardRouting s1, ShardRouting s2) {
            if (s1.currentNodeId().equals(s2.currentNodeId())) {
                // these shards are on the same node
                return 0;
            }
            Double shard1rank = nodeRanks.get(s1.currentNodeId());
            Double shard2rank = nodeRanks.get(s2.currentNodeId());
            if (shard1rank != null) {
                if (shard2rank != null) {
                    return shard1rank.compareTo(shard2rank);
                } else {
                    // place non-null values after null values
                    return 1;
                }
            } else {
                if (shard2rank != null) {
                    // place null values before non-null values
                    return -1;
                } else {
                    // neither node has stats, so they compare as equal
                    return 0;
                }
            }
        }
    }
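
To make the null handling concrete: nodes with no collected stats sort before ranked nodes, so unmeasured nodes are tried first, while measured nodes follow in ascending rank order. A standalone sketch of the same ordering, operating on node IDs instead of ShardRouting (the IDs and ranks are made up):

```java
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class RankOrderingDemo {
    public static void main(String[] args) {
        Map<String, Double> ranks = new HashMap<>();
        ranks.put("node-a", 2.5); // measured, better (lower) rank
        ranks.put("node-b", 7.0); // measured, worse rank
        // "node-c" has no stats yet.

        List<String> nodes = new ArrayList<>(Arrays.asList("node-b", "node-a", "node-c"));
        Comparator<String> byRank =
            Comparator.comparing(ranks::get, Comparator.nullsFirst(Comparator.naturalOrder()));
        nodes.sort(byRank);
        // The unmeasured node sorts first, then measured nodes by ascending rank.
        System.out.println(nodes); // [node-c, node-a, node-b]
    }
}
```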

    /**
     * Returns true if no primaries are active or initializing for this shard
     */

Review comment:
maybe mention that it is important to remove entries that have a value of zero to avoid memory leaks
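
A hedged sketch of the kind of cleanup the reviewer is suggesting: when a per-node count is decremented, returning null from the remapping function removes the entry, so the map does not accumulate dead keys. The class and method names here are illustrative, not from this PR:

```java
import java.util.HashMap;
import java.util.Map;

class SearchCounts {
    private final Map<String, Long> counts = new HashMap<>();

    void increment(String nodeId) {
        counts.merge(nodeId, 1L, Long::sum);
    }

    void decrement(String nodeId) {
        // Returning null from compute removes the mapping entirely, so
        // nodes whose count drops to zero do not leak map entries.
        counts.compute(nodeId, (id, count) ->
            (count == null || count <= 1) ? null : count - 1);
    }
}
```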