Replicate write actions before fsyncing them (#49746)

This commit fixes a number of issues with data replication: - Local and global checkpoints are not updated after the new operations have been fsynced, but might capture a state before the fsync. The reason why this probably went undetected for so long is that AsyncIOProcessor is synchronous if you index one item at a time, and hence working as intended unless you have a high enough level of concurrent indexing. As we rely in other places on the assumption that we have an up-to-date local checkpoint in case of synchronous translog durability, there's a risk for the local and global checkpoints not to be up-to-date after replication completes, and that this won't be corrected by the periodic global checkpoint sync. - AsyncIOProcessor also has another "bad" side effect here: if you index one bulk at a time, the bulk is always first fsynced on the primary before being sent to the replica. Further, if one thread is tasked by AsyncIOProcessor to drain the processing queue and fsync, other threads can easily pile more bulk requests on top of that thread. Things are not very fair here, and the thread might continue doing a lot more fsyncs before returning (as the other threads pile more and more on top), which blocks it from returning as a replication request (e.g. if this thread is on the primary, it blocks the replication requests to the replicas from going out, and delaying checkpoint advancement). This commit fixes all these issues, and also simplifies the code that coordinates all the after write actions.
elastic · Dec 3, 2019 · fbb92f5 · fbb92f5
1 parent 6e751f5
commit fbb92f5
Show file tree

Hide file tree

Showing 13 changed files with 240 additions and 238 deletions.
diff --git a/server/src/main/java/org/elasticsearch/action/support/replication/ReplicationOperation.java b/server/src/main/java/org/elasticsearch/action/support/replication/ReplicationOperation.java
@@ -46,6 +46,7 @@
 import java.util.Locale;
 import java.util.concurrent.atomic.AtomicBoolean;
 import java.util.concurrent.atomic.AtomicInteger;
+import java.util.function.LongSupplier;
 
 public class ReplicationOperation<
             Request extends ReplicationRequest<Request>,
@@ -110,8 +111,6 @@ public void execute() throws Exception {
 
     private void handlePrimaryResult(final PrimaryResultT primaryResult) {
         this.primaryResult = primaryResult;
-        primary.updateLocalCheckpointForShard(primary.routingEntry().allocationId().getId(), primary.localCheckpoint());
-        primary.updateGlobalCheckpointForShard(primary.routingEntry().allocationId().getId(), primary.globalCheckpoint());
         final ReplicaRequest replicaRequest = primaryResult.replicaRequest();
         if (replicaRequest != null) {
             if (logger.isTraceEnabled()) {
@@ -134,8 +133,26 @@ private void handlePrimaryResult(final PrimaryResultT primaryResult) {
             markUnavailableShardsAsStale(replicaRequest, replicationGroup);
             performOnReplicas(replicaRequest, globalCheckpoint, maxSeqNoOfUpdatesOrDeletes, replicationGroup);
         }
-        successfulShards.incrementAndGet();  // mark primary as successful
-        decPendingAndFinishIfNeeded();
+        primaryResult.runPostReplicationActions(new ActionListener<Void>() {
+
+            @Override
+            public void onResponse(Void aVoid) {
+                successfulShards.incrementAndGet();
+                try {
+                    updateCheckPoints(primary.routingEntry(), primary::localCheckpoint, primary::globalCheckpoint);
+                } finally {
+                    decPendingAndFinishIfNeeded();
+                }
+            }
+
+            @Override
+            public void onFailure(Exception e) {
+                logger.trace("[{}] op [{}] post replication actions failed for [{}]", primary.routingEntry().shardId(), opType, request);
+                // TODO: fail shard? This will otherwise have the local / global checkpoint info lagging, or possibly have replicas
+                // go out of sync with the primary
+                finishAsFailed(e);
+            }
+        });
     }
 
     private void markUnavailableShardsAsStale(ReplicaRequest replicaRequest, ReplicationGroup replicationGroup) {
@@ -176,16 +193,10 @@ private void performOnReplica(final ShardRouting shard, final ReplicaRequest rep
                 public void onResponse(ReplicaResponse response) {
                     successfulShards.incrementAndGet();
                     try {
-                        primary.updateLocalCheckpointForShard(shard.allocationId().getId(), response.localCheckpoint());
-                        primary.updateGlobalCheckpointForShard(shard.allocationId().getId(), response.globalCheckpoint());
-                    } catch (final AlreadyClosedException e) {
-                        // the index was deleted or this shard was never activated after a relocation; fall through and finish normally
-                    } catch (final Exception e) {
-                        // fail the primary but fall through and let the rest of operation processing complete
-                        final String message = String.format(Locale.ROOT, "primary failed updating local checkpoint for replica %s", shard);
-                        primary.failShard(message, e);
+                        updateCheckPoints(shard, response::localCheckpoint, response::globalCheckpoint);
+                    } finally {
+                        decPendingAndFinishIfNeeded();
                     }
-                    decPendingAndFinishIfNeeded();
                 }
 
                 @Override
@@ -211,6 +222,19 @@ public String toString() {
             });
     }
 
+    private void updateCheckPoints(ShardRouting shard, LongSupplier localCheckpointSupplier, LongSupplier globalCheckpointSupplier) {
+        try {
+            primary.updateLocalCheckpointForShard(shard.allocationId().getId(), localCheckpointSupplier.getAsLong());
+            primary.updateGlobalCheckpointForShard(shard.allocationId().getId(), globalCheckpointSupplier.getAsLong());
+        } catch (final AlreadyClosedException e) {
+            // the index was deleted or this shard was never activated after a relocation; fall through and finish normally
+        } catch (final Exception e) {
+            // fail the primary but fall through and let the rest of operation processing complete
+            final String message = String.format(Locale.ROOT, "primary failed updating local checkpoint for replica %s", shard);
+            primary.failShard(message, e);
+        }
+    }
+
     private void onNoLongerPrimary(Exception failure) {
         final Throwable cause = ExceptionsHelper.unwrapCause(failure);
         final boolean nodeIsClosing = cause instanceof NodeClosedException;
@@ -464,5 +488,11 @@ public interface PrimaryResult<RequestT extends ReplicationRequest<RequestT>> {
         @Nullable RequestT replicaRequest();
 
         void setShardInfo(ReplicationResponse.ShardInfo shardInfo);
+
+        /**
+         * Run actions to be triggered post replication
+         * @param listener calllback that is invoked after post replication actions have completed
+         * */
+        void runPostReplicationActions(ActionListener<Void> listener);
     }
 }
diff --git a/...rc/main/java/org/elasticsearch/action/support/replication/TransportReplicationAction.java b/...rc/main/java/org/elasticsearch/action/support/replication/TransportReplicationAction.java
@@ -74,8 +74,6 @@
 import org.elasticsearch.transport.TransportException;
 import org.elasticsearch.transport.TransportRequest;
 import org.elasticsearch.transport.TransportRequestOptions;
-import org.elasticsearch.transport.TransportResponse;
-import org.elasticsearch.transport.TransportResponse.Empty;
 import org.elasticsearch.transport.TransportResponseHandler;
 import org.elasticsearch.transport.TransportService;
 
@@ -347,33 +345,32 @@ public void handleException(TransportException exp) {
                 } else {
                     setPhase(replicationTask, "primary");
 
-                    final ActionListener<Response> referenceClosingListener = ActionListener.wrap(response -> {
-                        primaryShardReference.close(); // release shard operation lock before responding to caller
-                        setPhase(replicationTask, "finished");
-                        onCompletionListener.onResponse(response);
-                    }, e -> handleException(primaryShardReference, e));
+                    final ActionListener<Response> responseListener = ActionListener.wrap(response -> {
+                        adaptResponse(response, primaryShardReference.indexShard);
 
-                    final ActionListener<Response> globalCheckpointSyncingListener = ActionListener.wrap(response -> {
                         if (syncGlobalCheckpointAfterOperation) {
-                            final IndexShard shard = primaryShardReference.indexShard;
                             try {
-                                shard.maybeSyncGlobalCheckpoint("post-operation");
+                                primaryShardReference.indexShard.maybeSyncGlobalCheckpoint("post-operation");
                             } catch (final Exception e) {
                                 // only log non-closed exceptions
                                 if (ExceptionsHelper.unwrap(
                                     e, AlreadyClosedException.class, IndexShardClosedException.class) == null) {
                                     // intentionally swallow, a missed global checkpoint sync should not fail this operation
                                     logger.info(
                                         new ParameterizedMessage(
-                                            "{} failed to execute post-operation global checkpoint sync", shard.shardId()), e);
+                                            "{} failed to execute post-operation global checkpoint sync",
+                                            primaryShardReference.indexShard.shardId()), e);
                                 }
                             }
                         }
-                        referenceClosingListener.onResponse(response);
-                    }, referenceClosingListener::onFailure);
+
+                        primaryShardReference.close(); // release shard operation lock before responding to caller
+                        setPhase(replicationTask, "finished");
+                        onCompletionListener.onResponse(response);
+                    }, e -> handleException(primaryShardReference, e));
 
                     new ReplicationOperation<>(primaryRequest.getRequest(), primaryShardReference,
-                        ActionListener.wrap(result -> result.respond(globalCheckpointSyncingListener), referenceClosingListener::onFailure),
+                        ActionListener.map(responseListener, result -> result.finalResponseIfSuccessful),
                         newReplicasProxy(), logger, actionName, primaryRequest.getPrimaryTerm()).execute();
                 }
             } catch (Exception e) {
@@ -394,10 +391,19 @@ public void onFailure(Exception e) {
 
     }
 
+    // allows subclasses to adapt the response
+    protected void adaptResponse(Response response, IndexShard indexShard) {
+
+    }
+
+    protected ActionListener<Response> wrapResponseActionListener(ActionListener<Response> listener, IndexShard shard) {
+        return listener;
+    }
+
     public static class PrimaryResult<ReplicaRequest extends ReplicationRequest<ReplicaRequest>,
             Response extends ReplicationResponse>
             implements ReplicationOperation.PrimaryResult<ReplicaRequest> {
-        final ReplicaRequest replicaRequest;
+        protected final ReplicaRequest replicaRequest;
         public final Response finalResponseIfSuccessful;
         public final Exception finalFailure;
 
@@ -430,11 +436,12 @@ public void setShardInfo(ReplicationResponse.ShardInfo shardInfo) {
             }
         }
 
-        public void respond(ActionListener<Response> listener) {
-            if (finalResponseIfSuccessful != null) {
-                listener.onResponse(finalResponseIfSuccessful);
-            } else {
+        @Override
+        public void runPostReplicationActions(ActionListener<Void> listener) {
+            if (finalFailure != null) {
                 listener.onFailure(finalFailure);
+            } else {
+                listener.onResponse(null);
             }
         }
     }
@@ -450,11 +457,11 @@ public ReplicaResult() {
             this(null);
         }
 
-        public void respond(ActionListener<TransportResponse.Empty> listener) {
-            if (finalFailure == null) {
-                listener.onResponse(TransportResponse.Empty.INSTANCE);
-            } else {
+        public void runPostReplicaActions(ActionListener<Void> listener) {
+            if (finalFailure != null) {
                 listener.onFailure(finalFailure);
+            } else {
+                listener.onResponse(null);
             }
         }
     }
@@ -504,10 +511,23 @@ public void onResponse(Releasable releasable) {
             try {
                 assert replica.getActiveOperationsCount() != 0 : "must perform shard operation under a permit";
                 final ReplicaResult replicaResult = shardOperationOnReplica(replicaRequest.getRequest(), replica);
-                releasable.close(); // release shard operation lock before responding to caller
-                final TransportReplicationAction.ReplicaResponse response =
-                        new ReplicaResponse(replica.getLocalCheckpoint(), replica.getLastSyncedGlobalCheckpoint());
-                replicaResult.respond(new ResponseListener(response));
+                replicaResult.runPostReplicaActions(
+                    ActionListener.wrap(r -> {
+                        final TransportReplicationAction.ReplicaResponse response =
+                            new ReplicaResponse(replica.getLocalCheckpoint(), replica.getLastSyncedGlobalCheckpoint());
+                        releasable.close(); // release shard operation lock before responding to caller
+                        if (logger.isTraceEnabled()) {
+                            logger.trace("action [{}] completed on shard [{}] for request [{}]", transportReplicaAction,
+                                replicaRequest.getRequest().shardId(),
+                                replicaRequest.getRequest());
+                        }
+                        setPhase(task, "finished");
+                        onCompletionListener.onResponse(response);
+                    }, e -> {
+                        Releasables.closeWhileHandlingException(releasable); // release shard operation lock before responding to caller
+                        this.responseWithFailure(e);
+                    })
+                );
             } catch (final Exception e) {
                 Releasables.closeWhileHandlingException(releasable); // release shard operation lock before responding to caller
                 AsyncReplicaAction.this.onFailure(e);
@@ -565,33 +585,6 @@ protected void doRun() throws Exception {
             acquireReplicaOperationPermit(replica, replicaRequest.getRequest(), this, replicaRequest.getPrimaryTerm(),
                 replicaRequest.getGlobalCheckpoint(), replicaRequest.getMaxSeqNoOfUpdatesOrDeletes());
         }
-
-        /**
-         * Listens for the response on the replica and sends the response back to the primary.
-         */
-        private class ResponseListener implements ActionListener<TransportResponse.Empty> {
-            private final ReplicaResponse replicaResponse;
-
-            ResponseListener(ReplicaResponse replicaResponse) {
-                this.replicaResponse = replicaResponse;
-            }
-
-            @Override
-            public void onResponse(Empty response) {
-                if (logger.isTraceEnabled()) {
-                    logger.trace("action [{}] completed on shard [{}] for request [{}]", transportReplicaAction,
-                        replicaRequest.getRequest().shardId(),
-                        replicaRequest.getRequest());
-                }
-                setPhase(task, "finished");
-                onCompletionListener.onResponse(replicaResponse);
-            }
-
-            @Override
-            public void onFailure(Exception e) {
-                responseWithFailure(e);
-            }
-        }
     }
 
     private IndexShard getIndexShard(final ShardId shardId) {