@@ -1521,6 +1521,8 @@ public void updateGlobalCheckpointOnPrimary() {
15211521 */
15221522 public void updateGlobalCheckpointOnReplica (final long globalCheckpoint ) {
15231523 verifyReplicationTarget ();
1524+ // we sample the recovery stage before sampling the local checkpoint or we are subject to a race condition in the below assertion
1525+ final RecoveryState .Stage stage = recoveryState ().getStage ();
15241526 final SequenceNumbersService seqNoService = getEngine ().seqNoService ();
15251527 final long localCheckpoint = seqNoService .getLocalCheckpoint ();
15261528 if (globalCheckpoint > localCheckpoint ) {
@@ -1530,10 +1532,10 @@ public void updateGlobalCheckpointOnReplica(final long globalCheckpoint) {
15301532 * case that the global checkpoint update from the primary is ahead of the local checkpoint on this shard. In this case, we
15311533 * ignore the global checkpoint update. This can happen if we are in the translog stage of recovery. Prior to this, the engine
15321534 * is not opened and this shard will not receive global checkpoint updates, and after this the shard will be contributing to
1533- * calculations of the the global checkpoint. However, we can not assert that we are in the translog stage of recovery here as
1534- * while the global checkpoint update may have emanated from the primary when we were in that state, we could subsequently move
1535- * to recovery finalization, or even finished recovery before the update arrives here.
1535+ * calculations of the global checkpoint.
15361536 */
1537+ assert stage == RecoveryState .Stage .TRANSLOG
1538+ : "expected recovery stage [" + RecoveryState .Stage .TRANSLOG + "] but was [" + stage + "]" ;
15371539 return ;
15381540 }
15391541 seqNoService .updateGlobalCheckpointOnReplica (globalCheckpoint );
0 commit comments