
Commit

Reduce recovery time with compressed or secure transport.
kovrus committed Sep 12, 2019
1 parent a3f58e2 commit 3c910e9
Showing 18 changed files with 732 additions and 151 deletions.
4 changes: 4 additions & 0 deletions app/src/main/dist/config/crate.yml
@@ -387,6 +387,10 @@ auth:
# that are expected to take a long time.
#indices.recovery.retry_internal_long_action_timeout: 30m

# Controls the number of file chunk requests that can be sent
# in parallel per recovery.
# indices.recovery.max_concurrent_file_chunks: 2


################################# Discovery ##################################

5 changes: 5 additions & 0 deletions blackbox/docs/appendices/release-notes/unreleased.rst
@@ -56,6 +56,11 @@ Breaking Changes
Changes
=======

- Reduced recovery time by sending file chunks concurrently. This applies
  only when transport communication is secured or compressed. The number of
  concurrent chunk requests is controlled by the
  :ref:`indices.recovery.max_concurrent_file_chunks
  <indices.recovery.max_concurrent_file_chunks>` setting.

- Allow user to control how table data is stored and accessed on a disk
via the :ref:`store.type <table_parameter.store_type>` table parameter and
:ref:`node.store.allow_mmap <node.store_allow_mmap>` node setting.
16 changes: 16 additions & 0 deletions blackbox/docs/config/cluster.rst
@@ -879,6 +879,22 @@ Recovery
fail. Defaults to :ref:`internal_action_long_timeout
<indices.recovery.internal_action_long_timeout>`.

.. _indices.recovery.max_concurrent_file_chunks:

**indices.recovery.max_concurrent_file_chunks**
| *Default:* ``2``
| *Runtime:* ``yes``
Controls the number of file chunk requests that can be sent in parallel
per recovery. Note that multiple recoveries already run in parallel, as
controlled by :ref:`cluster.routing.allocation.node_concurrent_recoveries
<cluster.routing.allocation.node_concurrent_recoveries>`. Increasing this
expert-level setting only helps when peer recovery of a single shard does
not reach the total inbound and outbound peer recovery traffic configured
by :ref:`indices.recovery.max_bytes_per_sec
<indices.recovery.max_bytes_per_sec>` but is CPU-bound instead, which is
typically the case when transport-level security or compression is enabled.

Query circuit breaker
---------------------

9 changes: 8 additions & 1 deletion blob/src/main/java/io/crate/blob/BlobService.java
@@ -37,9 +37,12 @@
import org.elasticsearch.cluster.service.ClusterService;
import org.elasticsearch.common.component.AbstractLifecycleComponent;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.indices.recovery.PeerRecoverySourceService;
import org.elasticsearch.transport.TransportService;

import static org.elasticsearch.indices.recovery.RecoverySettings.INDICES_RECOVERY_MAX_CONCURRENT_FILE_CHUNKS_SETTING;

public class BlobService extends AbstractLifecycleComponent {

private final BlobIndicesService blobIndicesService;
@@ -50,6 +53,7 @@ public class BlobService extends AbstractLifecycleComponent {
private final BlobTransferTarget blobTransferTarget;
private final Client client;
private final PipelineRegistry pipelineRegistry;
private final Settings settings;

@Inject
public BlobService(ClusterService clusterService,
@@ -59,7 +63,8 @@ public BlobService(ClusterService clusterService,
TransportService transportService,
BlobTransferTarget blobTransferTarget,
Client client,
PipelineRegistry pipelineRegistry) {
PipelineRegistry pipelineRegistry,
Settings settings) {
this.clusterService = clusterService;
this.blobIndicesService = blobIndicesService;
this.blobHeadRequestHandler = blobHeadRequestHandler;
@@ -68,6 +73,7 @@ public BlobService(ClusterService clusterService,
this.blobTransferTarget = blobTransferTarget;
this.client = client;
this.pipelineRegistry = pipelineRegistry;
this.settings = settings;
}

public RemoteDigestBlob newBlob(String index, String digest) {
@@ -92,6 +98,7 @@ protected void doStart() throws ElasticsearchException {
recoveryTarget,
request,
fileChunkSizeInBytes,
INDICES_RECOVERY_MAX_CONCURRENT_FILE_CHUNKS_SETTING.get(settings),
transportService,
blobTransferTarget,
blobIndicesService
@@ -79,10 +79,11 @@ public BlobRecoveryHandler(IndexShard shard,
RecoveryTargetHandler recoveryTarget,
StartRecoveryRequest request,
int fileChunkSizeInBytes,
int maxConcurrentFileChunks,
final TransportService transportService,
BlobTransferTarget blobTransferTarget,
BlobIndicesService blobIndicesService) {
super(shard, recoveryTarget, request, fileChunkSizeInBytes);
super(shard, recoveryTarget, request, fileChunkSizeInBytes, maxConcurrentFileChunks);
assert BlobIndex.isBlobIndex(shard.shardId().getIndexName()) : "Shard must belong to a blob index";
this.blobShard = blobIndicesService.blobShardSafe(request.shardId());
this.request = request;
18 changes: 18 additions & 0 deletions es/es-server/src/main/java/org/elasticsearch/common/Numbers.java
@@ -42,6 +42,24 @@ public static long bytesToLong(BytesRef bytes) {
return (((long) high) << 32) | (low & 0x0ffffffffL);
}

/**
* Converts a long to a byte array.
*
* @param val The long to convert to a byte array
* @return The converted byte array (8 bytes, big-endian)
*/
public static byte[] longToBytes(long val) {
byte[] arr = new byte[8];
arr[0] = (byte) (val >>> 56);
arr[1] = (byte) (val >>> 48);
arr[2] = (byte) (val >>> 40);
arr[3] = (byte) (val >>> 32);
arr[4] = (byte) (val >>> 24);
arr[5] = (byte) (val >>> 16);
arr[6] = (byte) (val >>> 8);
arr[7] = (byte) (val);
return arr;
}

/** Return the long that {@code n} stores, or throws an exception if the
* stored value cannot be converted to a long that stores the exact same
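A short usage sketch (not part of the commit) for the new helper: longToBytes writes the long big-endian, so it round-trips through the existing bytesToLong(BytesRef) shown at the top of this hunk. The class name below is made up for illustration.

    import org.apache.lucene.util.BytesRef;
    import org.elasticsearch.common.Numbers;

    // Usage sketch only: longToBytes is the inverse of bytesToLong(BytesRef).
    public class LongBytesRoundTrip {
        public static void main(String[] args) {
            long value = 0x1122334455667788L;
            byte[] encoded = Numbers.longToBytes(value);               // always 8 bytes, most significant byte first
            long decoded = Numbers.bytesToLong(new BytesRef(encoded)); // reads them back big-endian
            assert decoded == value;
        }
    }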
@@ -194,6 +194,7 @@ public void apply(Settings value, Settings current, Settings previous) {
RecoverySettings.INDICES_RECOVERY_ACTIVITY_TIMEOUT_SETTING,
RecoverySettings.INDICES_RECOVERY_INTERNAL_ACTION_TIMEOUT_SETTING,
RecoverySettings.INDICES_RECOVERY_INTERNAL_LONG_ACTION_TIMEOUT_SETTING,
RecoverySettings.INDICES_RECOVERY_MAX_CONCURRENT_FILE_CHUNKS_SETTING,
ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_INITIAL_PRIMARIES_RECOVERIES_SETTING,
ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_CONCURRENT_INCOMING_RECOVERIES_SETTING,
ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_CONCURRENT_OUTGOING_RECOVERIES_SETTING,
@@ -176,9 +176,13 @@ synchronized RecoverySourceHandler addNewRecovery(StartRecoveryRequest request,

private RecoverySourceHandler createRecoverySourceHandler(StartRecoveryRequest request, IndexShard shard) {
RecoverySourceHandler handler;
final RemoteRecoveryTargetHandler recoveryTarget =
new RemoteRecoveryTargetHandler(request.recoveryId(), request.shardId(), transportService,
request.targetNode(), recoverySettings, throttleTime -> shard.recoveryStats().addThrottleTime(throttleTime));
final RemoteRecoveryTargetHandler recoveryTarget = new RemoteRecoveryTargetHandler(
request.recoveryId(),
request.shardId(),
transportService,
request.targetNode(),
recoverySettings,
throttleTime -> shard.recoveryStats().addThrottleTime(throttleTime));

// CRATE_PATCH: used to inject BlobRecoveryHandler
int recoveryChunkSizeInBytes = recoverySettings.getChunkSize().bytesAsInt();
@@ -191,17 +195,23 @@ private RecoverySourceHandler createRecoverySourceHandler(StartRecoveryRequest r

if (handler != null){
return handler;
} else {
return new RecoverySourceHandler(
shard,
recoveryTarget,
request,
recoveryChunkSizeInBytes,
recoverySettings.getMaxConcurrentFileChunks());
}
return new RecoverySourceHandler(shard, recoveryTarget, request, recoveryChunkSizeInBytes);
}
}
}

@Nullable
RecoverySourceHandler getCustomRecoverySourceHandler(IndexShard shard,
RemoteRecoveryTargetHandler recoveryTarget,
StartRecoveryRequest request,
int recoveryChunkSizeInBytes) {
private RecoverySourceHandler getCustomRecoverySourceHandler(IndexShard shard,
RemoteRecoveryTargetHandler recoveryTarget,
StartRecoveryRequest request,
int recoveryChunkSizeInBytes) {
for (RecoverySourceHandlerProvider recoverySourceHandlerProvider : recoverySourceHandlerProviders) {
RecoverySourceHandler handler = recoverySourceHandlerProvider.get(
shard, request, recoveryTarget, recoveryChunkSizeInBytes);
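The recovery source handler constructed above now receives recoverySettings.getMaxConcurrentFileChunks(), which caps how many file-chunk requests it keeps in flight per recovery. The actual sending logic lives in RecoverySourceHandler and is not shown in this view; the self-contained sketch below only illustrates one common way such a cap can be enforced (a semaphore bounding in-flight sends), with made-up names throughout.

    import java.util.List;
    import java.util.concurrent.CompletableFuture;
    import java.util.concurrent.Semaphore;

    // Illustrative sketch only: never allow more than maxConcurrentFileChunks
    // chunk sends to be outstanding at the same time.
    final class BoundedChunkSender {
        private final Semaphore inFlight;

        BoundedChunkSender(int maxConcurrentFileChunks) {
            this.inFlight = new Semaphore(maxConcurrentFileChunks);
        }

        void sendAll(List<byte[]> chunks) throws InterruptedException {
            for (byte[] chunk : chunks) {
                inFlight.acquire();                                          // blocks once the cap is reached
                sendAsync(chunk).whenComplete((r, e) -> inFlight.release()); // free a slot when the send finishes
            }
        }

        private CompletableFuture<Void> sendAsync(byte[] chunk) {
            // stand-in for the real transport call that ships a chunk to the target node
            return CompletableFuture.runAsync(() -> { });
        }
    }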
@@ -29,6 +29,8 @@
import org.elasticsearch.ElasticsearchException;
import org.elasticsearch.ElasticsearchTimeoutException;
import org.elasticsearch.ExceptionsHelper;
import org.elasticsearch.action.ActionListener;
import org.elasticsearch.action.support.HandledTransportAction;
import org.elasticsearch.action.support.PlainActionFuture;
import org.elasticsearch.cluster.ClusterState;
import org.elasticsearch.cluster.ClusterStateObserver;
@@ -571,8 +573,7 @@ class FileChunkTransportRequestHandler implements TransportRequestHandler<Recove

@Override
public void messageReceived(final RecoveryFileChunkRequest request, TransportChannel channel, Task task) throws Exception {
try (RecoveryRef recoveryRef = onGoingRecoveries.getRecoverySafe(request.recoveryId(), request.shardId()
)) {
try (RecoveryRef recoveryRef = onGoingRecoveries.getRecoverySafe(request.recoveryId(), request.shardId())) {
final RecoveryTarget recoveryTarget = recoveryRef.target();
final RecoveryState.Index indexState = recoveryTarget.state().getIndex();
if (request.sourceThrottleTimeInNanos() != RecoveryState.Index.UNKNOWN) {
@@ -591,8 +592,17 @@ public void messageReceived(final RecoveryFileChunkRequest request, TransportCha
}
}

recoveryTarget.writeFileChunk(request.metadata(), request.position(), request.content(),
request.lastChunk(), request.totalTranslogOps()
final ActionListener<TransportResponse> listener =
new HandledTransportAction.ChannelActionListener<>(channel, Actions.FILE_CHUNK, request);
recoveryTarget.writeFileChunk(
request.metadata(),
request.position(),
request.content(),
request.lastChunk(),
request.totalTranslogOps(),
ActionListener.wrap(
nullVal -> listener.onResponse(TransportResponse.Empty.INSTANCE),
listener::onFailure)
);
}
channel.sendResponse(TransportResponse.Empty.INSTANCE);
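With this change the file-chunk handler above no longer answers with a synchronous channel.sendResponse(...); it wraps the channel in a HandledTransportAction.ChannelActionListener and acknowledges only once the asynchronous writeFileChunk completes (or fails). A minimal, self-contained sketch of that callback shape, using made-up types rather than the Elasticsearch/CrateDB classes:

    import java.util.concurrent.CompletableFuture;

    // Illustrative sketch only: acknowledge a file chunk after its write completes.
    final class AsyncChunkHandler {
        interface Channel {
            void sendAck();
            void sendError(Throwable t);
        }

        void onChunkReceived(byte[] chunk, Channel channel) {
            writeAsync(chunk).whenComplete((ignored, failure) -> {
                if (failure == null) {
                    channel.sendAck();          // the "empty response" sent on success
                } else {
                    channel.sendError(failure); // corresponds to the listener's onFailure path
                }
            });
            // note: no synchronous acknowledgement here any more
        }

        private CompletableFuture<Void> writeAsync(byte[] chunk) {
            return CompletableFuture.runAsync(() -> { /* write the chunk to the store */ });
        }
    }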
@@ -39,6 +39,12 @@ public class RecoverySettings {
Setting.byteSizeSetting("indices.recovery.max_bytes_per_sec", new ByteSizeValue(40, ByteSizeUnit.MB),
Property.Dynamic, Property.NodeScope);

/**
* Controls the maximum number of file chunk requests that can be sent concurrently from the source node to the target node.
*/
public static final Setting<Integer> INDICES_RECOVERY_MAX_CONCURRENT_FILE_CHUNKS_SETTING =
Setting.intSetting("indices.recovery.max_concurrent_file_chunks", 2, 1, 5, Property.Dynamic, Property.NodeScope);

/**
* how long to wait before retrying after issues cause by cluster state syncing between nodes
* i.e., local node is not yet known on remote node, remote shard not yet started etc.
@@ -78,6 +84,7 @@ public class RecoverySettings {
public static final ByteSizeValue DEFAULT_CHUNK_SIZE = new ByteSizeValue(512, ByteSizeUnit.KB);

private volatile ByteSizeValue maxBytesPerSec;
private volatile int maxConcurrentFileChunks;
private volatile SimpleRateLimiter rateLimiter;
private volatile TimeValue retryDelayStateSync;
private volatile TimeValue retryDelayNetwork;
@@ -92,6 +99,7 @@ public RecoverySettings(Settings settings, ClusterSettings clusterSettings) {
this.retryDelayStateSync = INDICES_RECOVERY_RETRY_DELAY_STATE_SYNC_SETTING.get(settings);
// doesn't have to be fast as nodes are reconnected every 10s by default (see InternalClusterService.ReconnectToNodes)
// and we want to give the master time to remove a faulty node
this.maxConcurrentFileChunks = INDICES_RECOVERY_MAX_CONCURRENT_FILE_CHUNKS_SETTING.get(settings);
this.retryDelayNetwork = INDICES_RECOVERY_RETRY_DELAY_NETWORK_SETTING.get(settings);

this.internalActionTimeout = INDICES_RECOVERY_INTERNAL_ACTION_TIMEOUT_SETTING.get(settings);
@@ -109,6 +117,7 @@ public RecoverySettings(Settings settings, ClusterSettings clusterSettings) {
logger.debug("using max_bytes_per_sec[{}]", maxBytesPerSec);

clusterSettings.addSettingsUpdateConsumer(INDICES_RECOVERY_MAX_BYTES_PER_SEC_SETTING, this::setMaxBytesPerSec);
clusterSettings.addSettingsUpdateConsumer(INDICES_RECOVERY_MAX_CONCURRENT_FILE_CHUNKS_SETTING, this::setMaxConcurrentFileChunks);
clusterSettings.addSettingsUpdateConsumer(INDICES_RECOVERY_RETRY_DELAY_STATE_SYNC_SETTING, this::setRetryDelayStateSync);
clusterSettings.addSettingsUpdateConsumer(INDICES_RECOVERY_RETRY_DELAY_NETWORK_SETTING, this::setRetryDelayNetwork);
clusterSettings.addSettingsUpdateConsumer(INDICES_RECOVERY_INTERNAL_ACTION_TIMEOUT_SETTING, this::setInternalActionTimeout);
@@ -180,4 +189,12 @@ private void setMaxBytesPerSec(ByteSizeValue maxBytesPerSec) {
rateLimiter = new SimpleRateLimiter(maxBytesPerSec.getMbFrac());
}
}

public int getMaxConcurrentFileChunks() {
return maxConcurrentFileChunks;
}

private void setMaxConcurrentFileChunks(int maxConcurrentFileChunks) {
this.maxConcurrentFileChunks = maxConcurrentFileChunks;
}
}
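The setting defined above is dynamic and bounded to the range 1-5 by the intSetting(...) declaration. A small sketch of reading it from a Settings instance, mirroring what BlobService does with INDICES_RECOVERY_MAX_CONCURRENT_FILE_CHUNKS_SETTING.get(settings); the class name below is made up for illustration.

    import org.elasticsearch.common.settings.Settings;

    import static org.elasticsearch.indices.recovery.RecoverySettings.INDICES_RECOVERY_MAX_CONCURRENT_FILE_CHUNKS_SETTING;

    public class MaxConcurrentFileChunksExample {
        public static void main(String[] args) {
            Settings settings = Settings.builder()
                .put("indices.recovery.max_concurrent_file_chunks", 4)
                .build();
            // Falls back to the default of 2 when unset; values outside 1..5 fail validation.
            int maxChunks = INDICES_RECOVERY_MAX_CONCURRENT_FILE_CHUNKS_SETTING.get(settings);
            System.out.println(maxChunks); // prints 4
        }
    }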
