Skip to content

Commit

Permalink
[Bugfix] Fix NPE in ReplicaShardAllocator (opensearch-project#13993)
Browse files Browse the repository at this point in the history
Signed-off-by: Daniil Roman <daniilroman.cv@gmail.com>
  • Loading branch information
DaniilRoman committed Jun 18, 2024
1 parent 0d38d14 commit f96ce27
Show file tree
Hide file tree
Showing 2 changed files with 58 additions and 1 deletion.
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,10 @@ protected Runnable cancelExistingRecoveryForBetterMatch(
Metadata metadata = allocation.metadata();
RoutingNodes routingNodes = allocation.routingNodes();
ShardRouting primaryShard = allocation.routingNodes().activePrimary(shard.shardId());
assert primaryShard != null : "the replica shard can be allocated on at least one node, so there must be an active primary";
if (primaryShard == null) {
logger.trace("{}: no active primary shard found or allocated, letting actual allocation figure it out", shard);
return null;
}
assert primaryShard.currentNodeId() != null;
final DiscoveryNode primaryNode = allocation.nodes().get(primaryShard.currentNodeId());

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -644,6 +644,25 @@ public void testDoNotCancelForBrokenNode() {
assertThat(allocation.routingNodes().shardsWithState(ShardRoutingState.UNASSIGNED), empty());
}

/**
 * Checks that processing existing recoveries changes nothing when the primary on node1 is not
 * active: afterwards the routing nodes must be reported as unchanged and no shard may be in the
 * UNASSIGNED state.
 */
public void testDoNotCancelForInactivePrimaryNode() {
    final RoutingAllocation routingAllocation = oneInactivePrimaryOnNode1And1ReplicaRecovering(yesAllocationDeciders(), null);

    // Both nodes report a "file1" entry carrying the same checksum.
    final StoreFileMetadata node1File = new StoreFileMetadata("file1", 10, "MATCH_CHECKSUM", MIN_SUPPORTED_LUCENE_VERSION);
    final StoreFileMetadata node2File = new StoreFileMetadata("file1", 10, "MATCH_CHECKSUM", MIN_SUPPORTED_LUCENE_VERSION);
    testBatchAllocator.addData(node1, null, "MATCH", null, node1File)
        .addData(node2, randomSyncId(), null, node2File);

    // Hand every currently INITIALIZING shard to the allocator as one single batch.
    final List<ShardRouting> initializingShards = new ArrayList<>(
        routingAllocation.routingNodes().shardsWithState(ShardRoutingState.INITIALIZING)
    );
    testBatchAllocator.processExistingRecoveries(routingAllocation, Collections.singletonList(initializingShards));

    assertThat(routingAllocation.routingNodesChanged(), equalTo(false));
    assertThat(routingAllocation.routingNodes().shardsWithState(ShardRoutingState.UNASSIGNED), empty());
}

public void testAllocateUnassignedBatchThrottlingAllocationDeciderIsHonoured() throws InterruptedException {
ClusterSettings clusterSettings = new ClusterSettings(Settings.EMPTY, ClusterSettings.BUILT_IN_CLUSTER_SETTINGS);
AllocationDeciders allocationDeciders = randomAllocationDeciders(
Expand Down Expand Up @@ -872,6 +891,41 @@ private RoutingAllocation onePrimaryOnNode1And1ReplicaRecovering(AllocationDecid
);
}

/**
 * Builds a {@link RoutingAllocation} over a two-node cluster state (node1, node2) in which the
 * primary copy of {@code shardId} is INITIALIZING on node1 and one replica copy is INITIALIZING
 * on node2.
 *
 * @param deciders       allocation deciders handed to the resulting allocation
 * @param unassignedInfo unassigned info attached to the replica shard routing; the caller may pass {@code null}
 * @return an allocation created with empty cluster info and empty snapshot shard size info
 */
private RoutingAllocation oneInactivePrimaryOnNode1And1ReplicaRecovering(AllocationDeciders deciders, UnassignedInfo unassignedInfo) {
    // The primary is created first and the replica second, mirroring the order of the shard table below.
    final ShardRouting initializingPrimary = TestShardRouting.newShardRouting(shardId, node1.getId(), true, ShardRoutingState.INITIALIZING);
    final ShardRouting initializingReplica = TestShardRouting.newShardRouting(
        shardId,
        node2.getId(),
        null,
        false,
        ShardRoutingState.INITIALIZING,
        unassignedInfo
    );

    final IndexShardRoutingTable shardTable = new IndexShardRoutingTable.Builder(shardId).addShard(initializingPrimary)
        .addShard(initializingReplica)
        .build();
    final IndexRoutingTable.Builder indexTable = IndexRoutingTable.builder(shardId.getIndex()).addIndexShard(shardTable);
    final RoutingTable table = RoutingTable.builder().add(indexTable).build();

    final ClusterState clusterState = ClusterState.builder(
        org.opensearch.cluster.ClusterName.CLUSTER_NAME_SETTING.getDefault(Settings.EMPTY)
    ).routingTable(table).nodes(DiscoveryNodes.builder().add(node1).add(node2)).build();

    return new RoutingAllocation(
        deciders,
        new RoutingNodes(clusterState, false),
        clusterState,
        ClusterInfo.EMPTY,
        SnapshotShardSizeInfo.EMPTY,
        System.nanoTime()
    );
}

/**
 * Convenience overload that delegates to the two-argument variant, supplying an
 * {@link UnassignedInfo} with reason {@code CLUSTER_RECOVERED} and a {@code null} message.
 *
 * @param deciders allocation deciders forwarded to the two-argument variant
 * @return whatever the two-argument variant returns for these arguments
 */
private RoutingAllocation onePrimaryOnNode1And1ReplicaRecovering(AllocationDeciders deciders) {
    final UnassignedInfo clusterRecovered = new UnassignedInfo(UnassignedInfo.Reason.CLUSTER_RECOVERED, null);
    return onePrimaryOnNode1And1ReplicaRecovering(deciders, clusterRecovered);
}
Expand Down

0 comments on commit f96ce27

Please sign in to comment.