Sync global checkpoint on pending in-sync shards (#43526)

ywelsch · web-flow · commit 0cfc9ff77594 · 2019-06-24T18:35:20.000+02:00
At the end of a peer recovery, the primary wants to mark the replica as in-sync. For that, the persisted local checkpoint of the replica needs to have caught up with the global checkpoint on the primary. If translog durability is set to ASYNC, this means that information about the persisted local checkpoint can lag on the primary and might need to be explicitly fetched through a global checkpoint sync action. Unfortunately, that action will only be triggered after 30 seconds, and, even worse, will only run based on what the in-sync shard copies say (see IndexShard.maybeSyncGlobalCheckpoint). As the replica has not been marked as in-sync yet, it is not taken into consideration, and the primary might have its global checkpoint equal to the max seq no, so it thinks nothing needs to be done. This change makes IndexShard.maybeSyncGlobalCheckpoint, under async durability, also consider a sync necessary when ReplicationTracker.pendingInSync() reports a shard that is pending in-sync. Closes #43486
diff --git a/server/src/main/java/org/elasticsearch/index/seqno/ReplicationTracker.java b/server/src/main/java/org/elasticsearch/index/seqno/ReplicationTracker.java
@@ -1069,7 +1069,7 @@ private Runnable getMasterUpdateOperationFromCurrentState() {
     }
 
     /**
-     * Whether the are shards blocking global checkpoint advancement. Used by tests.
+     * Whether the are shards blocking global checkpoint advancement.
      */
     public synchronized boolean pendingInSync() {
         assert primaryMode;
diff --git a/server/src/main/java/org/elasticsearch/index/shard/IndexShard.java b/server/src/main/java/org/elasticsearch/index/shard/IndexShard.java
@@ -2134,9 +2134,11 @@ public void maybeSyncGlobalCheckpoint(final String reason) {
             final long globalCheckpoint = replicationTracker.getGlobalCheckpoint();
             // async durability means that the local checkpoint might lag (as it is only advanced on fsync)
             // periodically ask for the newest local checkpoint by syncing the global checkpoint, so that ultimately the global
-            // checkpoint can be synced
+            // checkpoint can be synced. Also take into account that a shard might be pending sync, which means that it isn't
+            // in the in-sync set just yet but might be blocked on waiting for its persisted local checkpoint to catch up to
+            // the global checkpoint.
             final boolean syncNeeded =
-                (asyncDurability && stats.getGlobalCheckpoint() < stats.getMaxSeqNo())
+                (asyncDurability && (stats.getGlobalCheckpoint() < stats.getMaxSeqNo() || replicationTracker.pendingInSync()))
                     // check if the persisted global checkpoint
                     || StreamSupport
                             .stream(globalCheckpoints.values().spliterator(), false)
diff --git a/test/framework/src/main/java/org/elasticsearch/test/InternalTestCluster.java b/test/framework/src/main/java/org/elasticsearch/test/InternalTestCluster.java
@@ -1183,7 +1183,7 @@ private void assertNoPendingIndexOperations() throws Exception {
                     }
                 }
             }
-        });
+        }, 60, TimeUnit.SECONDS);
     }
 
     private void assertOpenTranslogReferences() throws Exception {

Original file line number	Diff line number	Diff line change
`@@ -1069,7 +1069,7 @@ private Runnable getMasterUpdateOperationFromCurrentState() {`
`1069`	`1069`	`}`
`1070`	`1070`
`1071`	`1071`	`/**`
`1072`		`- * Whether the are shards blocking global checkpoint advancement. Used by tests.`
	`1072`	`+ * Whether the are shards blocking global checkpoint advancement.`
`1073`	`1073`	`*/`
`1074`	`1074`	`public synchronized boolean pendingInSync() {`
`1075`	`1075`	`assert primaryMode;`
Original file line number	Diff line number	Diff line change
`@@ -1183,7 +1183,7 @@ private void assertNoPendingIndexOperations() throws Exception {`
`1183`	`1183`	`}`
`1184`	`1184`	`}`
`1185`	`1185`	`}`
`1186`		`- });`
	`1186`	`+ }, 60, TimeUnit.SECONDS);`
`1187`	`1187`	`}`
`1188`	`1188`
`1189`	`1189`	`private void assertOpenTranslogReferences() throws Exception {`