Skip to content

Commit

Permalink
Replicate write actions before fsyncing them (#49746)
Browse files Browse the repository at this point in the history
This commit fixes a number of issues with data replication:

- Local and global checkpoints are not updated after the new operations have been fsynced, but
might capture a state before the fsync. The reason why this probably went undetected for so
long is that AsyncIOProcessor is synchronous if you index one item at a time, and hence working
as intended unless you have a high enough level of concurrent indexing. As we rely in other
places on the assumption that we have an up-to-date local checkpoint in case of synchronous
translog durability, there's a risk for the local and global checkpoints not to be up-to-date after
replication completes, and that this won't be corrected by the periodic global checkpoint sync.
- AsyncIOProcessor also has another "bad" side effect here: if you index one bulk at a time, the
bulk is always first fsynced on the primary before being sent to the replica. Further, if one thread
is tasked by AsyncIOProcessor to drain the processing queue and fsync, other threads can
easily pile more bulk requests on top of that thread. Things are not very fair here, and the thread
might continue doing a lot more fsyncs before returning (as the other threads pile more and
more on top), which blocks it from returning as a replication request (e.g. if this thread is on the
primary, it blocks the replication requests to the replicas from going out, and delaying
checkpoint advancement).

This commit fixes all these issues, and also simplifies the code that coordinates all the after
write actions.
  • Loading branch information
ywelsch committed Dec 3, 2019
1 parent 6e751f5 commit fbb92f5
Show file tree
Hide file tree
Showing 13 changed files with 240 additions and 238 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@
import java.util.Locale;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.function.LongSupplier;

public class ReplicationOperation<
Request extends ReplicationRequest<Request>,
Expand Down Expand Up @@ -110,8 +111,6 @@ public void execute() throws Exception {

private void handlePrimaryResult(final PrimaryResultT primaryResult) {
this.primaryResult = primaryResult;
primary.updateLocalCheckpointForShard(primary.routingEntry().allocationId().getId(), primary.localCheckpoint());
primary.updateGlobalCheckpointForShard(primary.routingEntry().allocationId().getId(), primary.globalCheckpoint());
final ReplicaRequest replicaRequest = primaryResult.replicaRequest();
if (replicaRequest != null) {
if (logger.isTraceEnabled()) {
Expand All @@ -134,8 +133,26 @@ private void handlePrimaryResult(final PrimaryResultT primaryResult) {
markUnavailableShardsAsStale(replicaRequest, replicationGroup);
performOnReplicas(replicaRequest, globalCheckpoint, maxSeqNoOfUpdatesOrDeletes, replicationGroup);
}
successfulShards.incrementAndGet(); // mark primary as successful
decPendingAndFinishIfNeeded();
primaryResult.runPostReplicationActions(new ActionListener<Void>() {

@Override
public void onResponse(Void aVoid) {
successfulShards.incrementAndGet();
try {
updateCheckPoints(primary.routingEntry(), primary::localCheckpoint, primary::globalCheckpoint);
} finally {
decPendingAndFinishIfNeeded();
}
}

@Override
public void onFailure(Exception e) {
logger.trace("[{}] op [{}] post replication actions failed for [{}]", primary.routingEntry().shardId(), opType, request);
// TODO: fail shard? This will otherwise have the local / global checkpoint info lagging, or possibly have replicas
// go out of sync with the primary
finishAsFailed(e);
}
});
}

private void markUnavailableShardsAsStale(ReplicaRequest replicaRequest, ReplicationGroup replicationGroup) {
Expand Down Expand Up @@ -176,16 +193,10 @@ private void performOnReplica(final ShardRouting shard, final ReplicaRequest rep
public void onResponse(ReplicaResponse response) {
successfulShards.incrementAndGet();
try {
primary.updateLocalCheckpointForShard(shard.allocationId().getId(), response.localCheckpoint());
primary.updateGlobalCheckpointForShard(shard.allocationId().getId(), response.globalCheckpoint());
} catch (final AlreadyClosedException e) {
// the index was deleted or this shard was never activated after a relocation; fall through and finish normally
} catch (final Exception e) {
// fail the primary but fall through and let the rest of operation processing complete
final String message = String.format(Locale.ROOT, "primary failed updating local checkpoint for replica %s", shard);
primary.failShard(message, e);
updateCheckPoints(shard, response::localCheckpoint, response::globalCheckpoint);
} finally {
decPendingAndFinishIfNeeded();
}
decPendingAndFinishIfNeeded();
}

@Override
Expand All @@ -211,6 +222,19 @@ public String toString() {
});
}

private void updateCheckPoints(ShardRouting shard, LongSupplier localCheckpointSupplier, LongSupplier globalCheckpointSupplier) {
try {
primary.updateLocalCheckpointForShard(shard.allocationId().getId(), localCheckpointSupplier.getAsLong());
primary.updateGlobalCheckpointForShard(shard.allocationId().getId(), globalCheckpointSupplier.getAsLong());
} catch (final AlreadyClosedException e) {
// the index was deleted or this shard was never activated after a relocation; fall through and finish normally
} catch (final Exception e) {
// fail the primary but fall through and let the rest of operation processing complete
final String message = String.format(Locale.ROOT, "primary failed updating local checkpoint for replica %s", shard);
primary.failShard(message, e);
}
}

private void onNoLongerPrimary(Exception failure) {
final Throwable cause = ExceptionsHelper.unwrapCause(failure);
final boolean nodeIsClosing = cause instanceof NodeClosedException;
Expand Down Expand Up @@ -464,5 +488,11 @@ public interface PrimaryResult<RequestT extends ReplicationRequest<RequestT>> {
@Nullable RequestT replicaRequest();

void setShardInfo(ReplicationResponse.ShardInfo shardInfo);

/**
* Run actions to be triggered post replication
* @param listener calllback that is invoked after post replication actions have completed
* */
void runPostReplicationActions(ActionListener<Void> listener);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -74,8 +74,6 @@
import org.elasticsearch.transport.TransportException;
import org.elasticsearch.transport.TransportRequest;
import org.elasticsearch.transport.TransportRequestOptions;
import org.elasticsearch.transport.TransportResponse;
import org.elasticsearch.transport.TransportResponse.Empty;
import org.elasticsearch.transport.TransportResponseHandler;
import org.elasticsearch.transport.TransportService;

Expand Down Expand Up @@ -347,33 +345,32 @@ public void handleException(TransportException exp) {
} else {
setPhase(replicationTask, "primary");

final ActionListener<Response> referenceClosingListener = ActionListener.wrap(response -> {
primaryShardReference.close(); // release shard operation lock before responding to caller
setPhase(replicationTask, "finished");
onCompletionListener.onResponse(response);
}, e -> handleException(primaryShardReference, e));
final ActionListener<Response> responseListener = ActionListener.wrap(response -> {
adaptResponse(response, primaryShardReference.indexShard);

final ActionListener<Response> globalCheckpointSyncingListener = ActionListener.wrap(response -> {
if (syncGlobalCheckpointAfterOperation) {
final IndexShard shard = primaryShardReference.indexShard;
try {
shard.maybeSyncGlobalCheckpoint("post-operation");
primaryShardReference.indexShard.maybeSyncGlobalCheckpoint("post-operation");
} catch (final Exception e) {
// only log non-closed exceptions
if (ExceptionsHelper.unwrap(
e, AlreadyClosedException.class, IndexShardClosedException.class) == null) {
// intentionally swallow, a missed global checkpoint sync should not fail this operation
logger.info(
new ParameterizedMessage(
"{} failed to execute post-operation global checkpoint sync", shard.shardId()), e);
"{} failed to execute post-operation global checkpoint sync",
primaryShardReference.indexShard.shardId()), e);
}
}
}
referenceClosingListener.onResponse(response);
}, referenceClosingListener::onFailure);

primaryShardReference.close(); // release shard operation lock before responding to caller
setPhase(replicationTask, "finished");
onCompletionListener.onResponse(response);
}, e -> handleException(primaryShardReference, e));

new ReplicationOperation<>(primaryRequest.getRequest(), primaryShardReference,
ActionListener.wrap(result -> result.respond(globalCheckpointSyncingListener), referenceClosingListener::onFailure),
ActionListener.map(responseListener, result -> result.finalResponseIfSuccessful),
newReplicasProxy(), logger, actionName, primaryRequest.getPrimaryTerm()).execute();
}
} catch (Exception e) {
Expand All @@ -394,10 +391,19 @@ public void onFailure(Exception e) {

}

// allows subclasses to adapt the response
protected void adaptResponse(Response response, IndexShard indexShard) {

}

protected ActionListener<Response> wrapResponseActionListener(ActionListener<Response> listener, IndexShard shard) {
return listener;
}

public static class PrimaryResult<ReplicaRequest extends ReplicationRequest<ReplicaRequest>,
Response extends ReplicationResponse>
implements ReplicationOperation.PrimaryResult<ReplicaRequest> {
final ReplicaRequest replicaRequest;
protected final ReplicaRequest replicaRequest;
public final Response finalResponseIfSuccessful;
public final Exception finalFailure;

Expand Down Expand Up @@ -430,11 +436,12 @@ public void setShardInfo(ReplicationResponse.ShardInfo shardInfo) {
}
}

public void respond(ActionListener<Response> listener) {
if (finalResponseIfSuccessful != null) {
listener.onResponse(finalResponseIfSuccessful);
} else {
@Override
public void runPostReplicationActions(ActionListener<Void> listener) {
if (finalFailure != null) {
listener.onFailure(finalFailure);
} else {
listener.onResponse(null);
}
}
}
Expand All @@ -450,11 +457,11 @@ public ReplicaResult() {
this(null);
}

public void respond(ActionListener<TransportResponse.Empty> listener) {
if (finalFailure == null) {
listener.onResponse(TransportResponse.Empty.INSTANCE);
} else {
public void runPostReplicaActions(ActionListener<Void> listener) {
if (finalFailure != null) {
listener.onFailure(finalFailure);
} else {
listener.onResponse(null);
}
}
}
Expand Down Expand Up @@ -504,10 +511,23 @@ public void onResponse(Releasable releasable) {
try {
assert replica.getActiveOperationsCount() != 0 : "must perform shard operation under a permit";
final ReplicaResult replicaResult = shardOperationOnReplica(replicaRequest.getRequest(), replica);
releasable.close(); // release shard operation lock before responding to caller
final TransportReplicationAction.ReplicaResponse response =
new ReplicaResponse(replica.getLocalCheckpoint(), replica.getLastSyncedGlobalCheckpoint());
replicaResult.respond(new ResponseListener(response));
replicaResult.runPostReplicaActions(
ActionListener.wrap(r -> {
final TransportReplicationAction.ReplicaResponse response =
new ReplicaResponse(replica.getLocalCheckpoint(), replica.getLastSyncedGlobalCheckpoint());
releasable.close(); // release shard operation lock before responding to caller
if (logger.isTraceEnabled()) {
logger.trace("action [{}] completed on shard [{}] for request [{}]", transportReplicaAction,
replicaRequest.getRequest().shardId(),
replicaRequest.getRequest());
}
setPhase(task, "finished");
onCompletionListener.onResponse(response);
}, e -> {
Releasables.closeWhileHandlingException(releasable); // release shard operation lock before responding to caller
this.responseWithFailure(e);
})
);
} catch (final Exception e) {
Releasables.closeWhileHandlingException(releasable); // release shard operation lock before responding to caller
AsyncReplicaAction.this.onFailure(e);
Expand Down Expand Up @@ -565,33 +585,6 @@ protected void doRun() throws Exception {
acquireReplicaOperationPermit(replica, replicaRequest.getRequest(), this, replicaRequest.getPrimaryTerm(),
replicaRequest.getGlobalCheckpoint(), replicaRequest.getMaxSeqNoOfUpdatesOrDeletes());
}

/**
* Listens for the response on the replica and sends the response back to the primary.
*/
private class ResponseListener implements ActionListener<TransportResponse.Empty> {
private final ReplicaResponse replicaResponse;

ResponseListener(ReplicaResponse replicaResponse) {
this.replicaResponse = replicaResponse;
}

@Override
public void onResponse(Empty response) {
if (logger.isTraceEnabled()) {
logger.trace("action [{}] completed on shard [{}] for request [{}]", transportReplicaAction,
replicaRequest.getRequest().shardId(),
replicaRequest.getRequest());
}
setPhase(task, "finished");
onCompletionListener.onResponse(replicaResponse);
}

@Override
public void onFailure(Exception e) {
responseWithFailure(e);
}
}
}

private IndexShard getIndexShard(final ShardId shardId) {
Expand Down
Loading

0 comments on commit fbb92f5

Please sign in to comment.