apache · adoroszlai · Jan 6, 2024 · Jun 20, 2023 · Jun 28, 2023 · Jun 28, 2023
diff --git a/hadoop-hdds/common/src/main/java/org/apache/hadoop/ozone/OzoneConfigKeys.java b/hadoop-hdds/common/src/main/java/org/apache/hadoop/ozone/OzoneConfigKeys.java
@@ -257,7 +257,17 @@ public final class OzoneConfigKeys {
   public static final String OZONE_BLOCK_DELETING_SERVICE_TIMEOUT =
       "ozone.block.deleting.service.timeout";
   public static final String OZONE_BLOCK_DELETING_SERVICE_TIMEOUT_DEFAULT
-      = "300s"; // 300s for default
+      = "300s"; // default timeout: 300s
+
+  /**
+   * A limit to restrict the total number of delete block commands queued on a
+   * datanode. Note this is intended to be a temporary config until we have a
+   * more dynamic way of limiting load.
+   */
+  public static final String OZONE_BLOCK_DELETING_PENDING_COMMAND_LIMIT =
+      "ozone.block.deleting.pending.command.limit";
+  public static final int OZONE_BLOCK_DELETING_PENDING_COMMAND_LIMIT_DEFAULT
+      = 5; // same as hdds.datanode.block.delete.queue.limit
 
   public static final String OZONE_SNAPSHOT_SST_FILTERING_SERVICE_TIMEOUT =
       "ozone.sst.filtering.service.timeout";

diff --git a/...ds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/block/SCMBlockDeletingService.java b/...ds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/block/SCMBlockDeletingService.java
@@ -41,6 +41,7 @@
 import org.apache.hadoop.hdds.scm.ha.SCMServiceManager;
 import org.apache.hadoop.hdds.scm.node.NodeManager;
 import org.apache.hadoop.hdds.scm.node.NodeStatus;
+import org.apache.hadoop.hdds.scm.node.states.NodeNotFoundException;
 import org.apache.hadoop.hdds.server.events.EventPublisher;
 import org.apache.hadoop.hdds.utils.BackgroundService;
 import org.apache.hadoop.hdds.utils.BackgroundTask;
@@ -57,6 +58,8 @@
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import static org.apache.hadoop.ozone.OzoneConfigKeys.OZONE_BLOCK_DELETING_PENDING_COMMAND_LIMIT;
+import static org.apache.hadoop.ozone.OzoneConfigKeys.OZONE_BLOCK_DELETING_PENDING_COMMAND_LIMIT_DEFAULT;
 import static org.apache.hadoop.ozone.OzoneConfigKeys.OZONE_BLOCK_DELETING_SERVICE_TIMEOUT;
 import static org.apache.hadoop.ozone.OzoneConfigKeys.OZONE_BLOCK_DELETING_SERVICE_TIMEOUT_DEFAULT;
 
@@ -90,6 +93,7 @@ public class SCMBlockDeletingService extends BackgroundService
 
   private long safemodeExitMillis = 0;
   private final long safemodeExitRunDelayMillis;
+  private final long deleteBlocksPendingCommandLimit;
   private final Clock clock;
 
   @SuppressWarnings("parameternumber")
@@ -110,6 +114,9 @@ public SCMBlockDeletingService(DeletedBlockLog deletedBlockLog,
         HddsConfigKeys.HDDS_SCM_WAIT_TIME_AFTER_SAFE_MODE_EXIT,
         HddsConfigKeys.HDDS_SCM_WAIT_TIME_AFTER_SAFE_MODE_EXIT_DEFAULT,
         TimeUnit.MILLISECONDS);
+    this.deleteBlocksPendingCommandLimit = conf.getInt(
+        OZONE_BLOCK_DELETING_PENDING_COMMAND_LIMIT,
+        OZONE_BLOCK_DELETING_PENDING_COMMAND_LIMIT_DEFAULT);
     this.clock = clock;
     this.deletedBlockLog = deletedBlockLog;
     this.nodeManager = nodeManager;
@@ -156,13 +163,12 @@ public EmptyTaskResult call() throws Exception {
       List<DatanodeDetails> datanodes =
           nodeManager.getNodes(NodeStatus.inServiceHealthy());
       if (datanodes != null) {
-        // When DN node is healthy and in-service, and previous commands 
-        // are handled for deleteBlocks Type, then it will be considered
-        // in this iteration
-        final Set<DatanodeDetails> included = datanodes.stream().filter(
-            dn -> nodeManager.getCommandQueueCount(dn.getUuid(),
-                Type.deleteBlocksCommand) == 0).collect(Collectors.toSet());
         try {
+          // Only datanodes that are healthy, in-service and whose number of
+          // 'deleteBlocks' type commands is below the limit
+          // are considered in this iteration.
+          final Set<DatanodeDetails> included =
+              getDatanodesWithinCommandLimit(datanodes);
           DatanodeDeletedBlockTransactions transactions =
               deletedBlockLog.getTransactions(blockDeleteLimitSize, included);
 
@@ -203,15 +209,15 @@ public EmptyTaskResult call() throws Exception {
           deletedBlockLog.incrementCount(new ArrayList<>(processedTxIDs));
         } catch (NotLeaderException nle) {
           LOG.warn("Skip current run, since not leader any more.", nle);
-          return EmptyTaskResult.newResult();
+        } catch (NodeNotFoundException e) {
+          LOG.error("Datanode not found in NodeManager. Should not happen", e);
         } catch (IOException e) {
           // We may tolerate a number of failures for sometime
           // but if it continues to fail, at some point we need to raise
           // an exception and probably fail the SCM ? At present, it simply
           // continues to retry the scanning.
           LOG.error("Failed to get block deletion transactions from delTX log",
               e);
-          return EmptyTaskResult.newResult();
         }
       }
 
@@ -273,4 +279,24 @@ public void stop() {
   public ScmBlockDeletingServiceMetrics getMetrics() {
     return this.metrics;
   }
+
+  /**
+   * Selects datanodes whose deleteBlocks command count is below the limit.
+   *
+   * @param datanodes a list of DatanodeDetails
+   * @return those with a count below deleteBlocksPendingCommandLimit
+   * @throws NodeNotFoundException presumably when nodeManager lacks a node
+   */
+  @VisibleForTesting
+  protected Set<DatanodeDetails> getDatanodesWithinCommandLimit(
+      List<DatanodeDetails> datanodes) throws NodeNotFoundException {
+    final Set<DatanodeDetails> included = new HashSet<>();
+    for (DatanodeDetails dn : datanodes) {
+      if (nodeManager.getTotalDatanodeCommandCount(dn,
+          Type.deleteBlocksCommand) < deleteBlocksPendingCommandLimit) {
+        included.add(dn);
+      }
+    }
+    return included;
+  }
 }
diff --git a/...erver-scm/src/test/java/org/apache/hadoop/hdds/scm/block/TestSCMBlockDeletingService.java b/...erver-scm/src/test/java/org/apache/hadoop/hdds/scm/block/TestSCMBlockDeletingService.java
@@ -0,0 +1,159 @@
+package org.apache.hadoop.hdds.scm.block;
+
+import org.apache.hadoop.hdds.conf.OzoneConfiguration;
+import org.apache.hadoop.hdds.protocol.DatanodeDetails;
+import org.apache.hadoop.hdds.protocol.MockDatanodeDetails;
+import org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos.SCMCommandProto.Type;
+import org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos.DeletedBlocksTransaction;
+import org.apache.hadoop.hdds.scm.events.SCMEvents;
+import org.apache.hadoop.hdds.scm.ha.SCMContext;
+import org.apache.hadoop.hdds.scm.ha.SCMServiceManager;
+import org.apache.hadoop.hdds.scm.node.NodeManager;
+import org.apache.hadoop.hdds.scm.node.NodeStatus;
+import org.apache.hadoop.hdds.server.events.EventPublisher;
+import org.apache.hadoop.ozone.protocol.commands.CommandForDatanode;
+import org.junit.jupiter.api.AfterEach;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+import org.mockito.ArgumentCaptor;
+
+import java.time.Clock;
+import java.time.ZoneOffset;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+import java.util.UUID;
+import java.util.stream.Collectors;
+
+import static org.apache.hadoop.ozone.OzoneConfigKeys.OZONE_BLOCK_DELETING_PENDING_COMMAND_LIMIT;
+import static org.apache.hadoop.ozone.OzoneConfigKeys.OZONE_BLOCK_DELETING_PENDING_COMMAND_LIMIT_DEFAULT;
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.mockito.ArgumentMatchers.any;
+import static org.mockito.ArgumentMatchers.anyInt;
+import static org.mockito.ArgumentMatchers.anySet;
+import static org.mockito.ArgumentMatchers.eq;
+import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.spy;
+import static org.mockito.Mockito.times;
+import static org.mockito.Mockito.verify;
+import static org.mockito.Mockito.when;
+
+/**
+ * Tests for {@link SCMBlockDeletingService}.
+ */
+public class TestSCMBlockDeletingService {
+  private SCMBlockDeletingService service;
+  private EventPublisher eventPublisher;
+  private List<DatanodeDetails> datanodeDetails;
+  private OzoneConfiguration conf;
+  private NodeManager nodeManager;
+  private ScmBlockDeletingServiceMetrics metrics;
+
+  @BeforeEach
+  public void setup() throws Exception {
+    nodeManager = mock(NodeManager.class);
+    eventPublisher = mock(EventPublisher.class);
+    conf = new OzoneConfiguration();
+    metrics = ScmBlockDeletingServiceMetrics.create();
+    when(nodeManager.getTotalDatanodeCommandCount(any(),
+        any())).thenReturn(0);
+    SCMServiceManager scmServiceManager = mock(SCMServiceManager.class);
+    SCMContext scmContext = mock(SCMContext.class);
+
+    DatanodeDeletedBlockTransactions ddbt =
+        new DatanodeDeletedBlockTransactions();
+    DatanodeDetails datanode1 = MockDatanodeDetails.randomDatanodeDetails();
+    DatanodeDetails datanode2 = MockDatanodeDetails.randomDatanodeDetails();
+    DatanodeDetails datanode3 = MockDatanodeDetails.randomDatanodeDetails();
+    datanodeDetails = Arrays.asList(datanode1, datanode2, datanode3);
+    when(nodeManager.getNodes(NodeStatus.inServiceHealthy())).thenReturn(
+        datanodeDetails);
+    DeletedBlocksTransaction tx1 = createTestDeleteTxn(1, Arrays.asList(1L), 1);
+    ddbt.addTransactionToDN(datanode1.getUuid(), tx1);
+    ddbt.addTransactionToDN(datanode2.getUuid(), tx1);
+    ddbt.addTransactionToDN(datanode3.getUuid(), tx1);
+    DeletedBlockLog mockDeletedBlockLog = mock(DeletedBlockLog.class);
+    when(mockDeletedBlockLog.getTransactions(
+        anyInt(), anySet())).thenReturn(ddbt);
+
+    service = spy(new SCMBlockDeletingService(
+        mockDeletedBlockLog, nodeManager, eventPublisher, scmContext,
+        scmServiceManager, conf, metrics, Clock.system(
+        ZoneOffset.UTC)));
+    when(service.shouldRun()).thenReturn(true);
+  }
+
+  @AfterEach
+  public void stop() {
+    service.stop();
+    ScmBlockDeletingServiceMetrics.unRegister();
+  }
+
+  @Test
+  public void testCall() throws Exception {
+    callDeletedBlockTransactionScanner();
+
+    ArgumentCaptor<CommandForDatanode> argumentCaptor =
+        ArgumentCaptor.forClass(CommandForDatanode.class);
+
+    // All three Datanodes are healthy and in-service with empty command
+    // queues, so the transaction is sent to all three Datanodes.
+    verify(eventPublisher, times(3)).fireEvent(
+        eq(SCMEvents.DATANODE_COMMAND), argumentCaptor.capture());
+    List<CommandForDatanode> actualCommands = argumentCaptor.getAllValues();
+    List<UUID> actualDnIds = actualCommands.stream()
+        .map(CommandForDatanode::getDatanodeId)
+        .collect(Collectors.toList());
+    Set<UUID> expectedDnIdsSet = datanodeDetails.stream()
+        .map(DatanodeDetails::getUuid).collect(Collectors.toSet());
+
+    assertEquals(expectedDnIdsSet, new HashSet<>(actualDnIds));
+    assertEquals(datanodeDetails.size(),
+        metrics.getNumBlockDeletionCommandSent());
+    // Each Command has one Transaction
+    assertEquals(datanodeDetails.size() * 1,
+        metrics.getNumBlockDeletionTransactionSent());
+  }
+
+  private void callDeletedBlockTransactionScanner() throws Exception {
+    service.getTasks().poll().call();
+  }
+
+  @Test
+  public void testLimitCommandSending() throws Exception {
+    int pendingCommandLimit = conf.getInt(
+        OZONE_BLOCK_DELETING_PENDING_COMMAND_LIMIT,
+        OZONE_BLOCK_DELETING_PENDING_COMMAND_LIMIT_DEFAULT);
+
+
+    // The number of commands pending on all Datanodes has reached the limit.
+    when(nodeManager.getTotalDatanodeCommandCount(any(),
+        any())).thenReturn(pendingCommandLimit);
+    assertEquals(0,
+        service.getDatanodesWithinCommandLimit(datanodeDetails).size());
+
+    // The number of commands pending on all Datanodes is 0.
+    when(nodeManager.getTotalDatanodeCommandCount(any(),
+        any())).thenReturn(0);
+    assertEquals(datanodeDetails.size(),
+        service.getDatanodesWithinCommandLimit(datanodeDetails).size());
+
+    // Only the first Datanode has reached the pending command limit.
+    DatanodeDetails fullDatanode = datanodeDetails.get(0);
+    when(nodeManager.getTotalDatanodeCommandCount(fullDatanode,
+        Type.deleteBlocksCommand)).thenReturn(pendingCommandLimit);
+    Set<DatanodeDetails> includeNodes =
+        service.getDatanodesWithinCommandLimit(datanodeDetails);
+    assertEquals(datanodeDetails.size() - 1,
+        includeNodes.size());
+    assertFalse(includeNodes.contains(fullDatanode));
+  }
+
+  private DeletedBlocksTransaction createTestDeleteTxn(
+      long txnID, List<Long> blocks, long containerID) {
+    return DeletedBlocksTransaction.newBuilder().setTxID(txnID)
+        .setContainerID(containerID).addAllLocalID(blocks).setCount(0).build();
+  }
+}