From efeb98a17f493767062d266138298adc113e32d1 Mon Sep 17 00:00:00 2001 From: tejaskriya Date: Tue, 10 Dec 2024 14:04:47 +0530 Subject: [PATCH 1/3] HDDS-11779. Add DN metrics to show deletion progress --- .../helpers/BlockDeletingServiceMetrics.java | 66 +++++++++++++++++-- .../DeleteBlocksCommandHandler.java | 4 ++ .../TestDeleteBlocksCommandHandler.java | 4 ++ 3 files changed, 69 insertions(+), 5 deletions(-) diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/helpers/BlockDeletingServiceMetrics.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/helpers/BlockDeletingServiceMetrics.java index 7487f757fe5..ceb8b88d189 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/helpers/BlockDeletingServiceMetrics.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/helpers/BlockDeletingServiceMetrics.java @@ -46,6 +46,9 @@ public final class BlockDeletingServiceMetrics { @Metric(about = "The number of failed delete blocks.") private MutableCounterLong failureCount; + @Metric(about = "The number of delete block transaction processed.") + private MutableCounterLong processedTransactionCount; + @Metric(about = "The number of out of order delete block transaction.") private MutableCounterLong outOfOrderDeleteBlockTransactionCount; @@ -55,8 +58,7 @@ public final class BlockDeletingServiceMetrics { @Metric(about = "The total number of DeleteBlockTransaction received") private MutableCounterLong receivedTransactionCount; - @Metric(about = "The total number of DeleteBlockTransaction" + - " that is a retry Transaction") + @Metric(about = "The total number of DeleteBlockTransaction that is a retry Transaction") private MutableCounterLong receivedRetryTransactionCount; @Metric(about = "The total number of Container received to be processed") @@ -74,10 +76,20 @@ public final class BlockDeletingServiceMetrics { @Metric(about = "The total number of Container chosen to be deleted.") private MutableGaugeLong totalContainerChosenCount; - @Metric(about = "The total number of transactions which failed due" + - " to container lock wait timeout.") + @Metric(about = "The total number of transactions which failed due to container lock wait timeout.") private MutableGaugeLong totalLockTimeoutTransactionCount; + @Metric(about = "The total number of deletion commands received.") + private MutableGaugeLong totalCommandsReceived; + + @Metric(about = "The total number of deletion commands that were discarded " + + "due to the queue being full.") + private MutableGaugeLong totalCommandsDiscarded; + + @Metric(about = "The total number of deletion transactions that were discarded " + + "due to the transaction being a duplicate.") + private MutableGaugeLong totalTransactionsDiscarded; + private BlockDeletingServiceMetrics() { } @@ -112,6 +124,10 @@ public void incrFailureCount() { this.failureCount.incr(); } + public void incrProcessedTransactionCount(long count) { + processedTransactionCount.incr(count); + } + public void incrReceivedTransactionCount(long count) { receivedTransactionCount.incr(count); } @@ -148,6 +164,18 @@ public void incrTotalLockTimeoutTransactionCount() { totalLockTimeoutTransactionCount.incr(); } + public void incrTotalCommandsReceived(long delta) { + this.totalCommandsReceived.incr(delta); + } + + public void incrTotalCommandsDiscarded(long delta) { + this.totalCommandsDiscarded.incr(delta); + } + + public void incrTotalTransactionsDiscarded(long delta) { + this.totalTransactionsDiscarded.incr(delta); + } + public long getSuccessCount() { return successCount.value(); } @@ -184,6 +212,26 @@ public long getTotalLockTimeoutTransactionCount() { return totalLockTimeoutTransactionCount.value(); } + public long getProcessedTransactionCount() { + return processedTransactionCount.value(); + } + + public long getReceivedTransactionCount() { + return receivedTransactionCount.value(); + } + + public long getTotalCommandsReceived() { + return totalCommandsReceived.value(); + } + + public long getTotalCommandsDiscarded() { + return totalCommandsDiscarded.value(); + } + + public long getTotalTransactionsDiscarded() { + return totalTransactionsDiscarded.value(); + } + @Override public String toString() { StringBuffer buffer = new StringBuffer(); @@ -202,6 +250,8 @@ public String toString() { + receivedTransactionCount.value()).append("\t") .append("receivedRetryTransactionCount = " + receivedRetryTransactionCount.value()).append("\t") + .append("processedTransactionCount = " + + processedTransactionCount.value()).append("\t") .append("receivedContainerCount = " + receivedContainerCount.value()).append("\t") .append("receivedBlockCount = " @@ -209,7 +259,13 @@ public String toString() { .append("markedBlockCount = " + markedBlockCount.value()).append("\t") .append("totalLockTimeoutTransactionCount = " - + totalLockTimeoutTransactionCount.value()).append("\t"); + + totalLockTimeoutTransactionCount.value()).append("\t") + .append("totalCommandsReceived = " + + totalCommandsReceived.value()).append("\t") + .append("totalCommandsDiscarded = " + + totalCommandsDiscarded.value()).append("\t") + .append("totalTransactionsDiscarded = " + + totalTransactionsDiscarded.value()).append("\t"); return buffer.toString(); } } diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/DeleteBlocksCommandHandler.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/DeleteBlocksCommandHandler.java index 136c5805821..ef0787db7c4 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/DeleteBlocksCommandHandler.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/DeleteBlocksCommandHandler.java @@ -144,6 +144,7 @@ public void handle(SCMCommand command, OzoneContainer container, container, context, connectionManager); try { deleteCommandQueues.add(cmd); + blockDeleteMetrics.incrTotalCommandsReceived(1); } catch (IllegalStateException e) { String dnId = context.getParent().getDatanodeDetails().getUuidString(); Consumer updateFailure = (cmdStatus) -> { @@ -157,6 +158,7 @@ public void handle(SCMCommand command, OzoneContainer container, }; updateCommandStatus(cmd.getContext(), cmd.getCmd(), updateFailure, LOG); LOG.warn("Command is discarded because of the command queue is full"); + blockDeleteMetrics.incrTotalCommandsDiscarded(1); } } @@ -462,6 +464,7 @@ public List> submitTasks( Future future = executor.submit(new ProcessTransactionTask(tx)); futures.add(future); + blockDeleteMetrics.incrProcessedTransactionCount(1); } return futures; } @@ -650,6 +653,7 @@ public static boolean isDuplicateTransaction(long containerId, KeyValueContainer containerData.getDeleteTransactionId())); } else if (delTX.getTxID() == containerData.getDeleteTransactionId()) { duplicate = true; + metrics.incrTotalTransactionsDiscarded(1); LOG.info(String.format("Delete blocks with txID %d for containerId: %d" + " is retried.", delTX.getTxID(), containerId)); } else { diff --git a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/TestDeleteBlocksCommandHandler.java b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/TestDeleteBlocksCommandHandler.java index dcabad46ac5..bad6409a802 100644 --- a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/TestDeleteBlocksCommandHandler.java +++ b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/TestDeleteBlocksCommandHandler.java @@ -331,6 +331,9 @@ public void testDeleteBlockCommandHandleWhenDeleteCommandQueuesFull() assertEquals(cmdStatus.getProtoBufMessage().getBlockDeletionAck().getResultsCount(), 0); } } + blockDeleteMetrics = handler.getBlockDeleteMetrics(); + assertEquals(5, blockDeleteMetrics.getTotalCommandsReceived()); + assertEquals(2, blockDeleteMetrics.getTotalCommandsDiscarded()); } @ContainerTestVersionInfo.ContainerTest @@ -367,6 +370,7 @@ public void testDuplicateDeleteBlocksCommand( assertTrue(results3.get(0).getSuccess()); assertEquals(0, blockDeleteMetrics.getTotalLockTimeoutTransactionCount()); + assertEquals(1, blockDeleteMetrics.getTotalTransactionsDiscarded()); // Duplicate cmd content will not be persisted. assertEquals(2, ((KeyValueContainerData) container.getContainerData()).getNumPendingDeletionBlocks()); From 9105eb6b23a1fb4ed8e38cfd4075fddd66608afc Mon Sep 17 00:00:00 2001 From: tejaskriya Date: Mon, 16 Dec 2024 14:00:10 +0530 Subject: [PATCH 2/3] Remove command metrics --- .../helpers/BlockDeletingServiceMetrics.java | 66 ++++++------------- .../DeleteBlocksCommandHandler.java | 12 ++-- .../TestDeleteBlocksCommandHandler.java | 3 - 3 files changed, 26 insertions(+), 55 deletions(-) diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/helpers/BlockDeletingServiceMetrics.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/helpers/BlockDeletingServiceMetrics.java index ceb8b88d189..3d4ca64cbc1 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/helpers/BlockDeletingServiceMetrics.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/helpers/BlockDeletingServiceMetrics.java @@ -46,9 +46,6 @@ public final class BlockDeletingServiceMetrics { @Metric(about = "The number of failed delete blocks.") private MutableCounterLong failureCount; - @Metric(about = "The number of delete block transaction processed.") - private MutableCounterLong processedTransactionCount; - @Metric(about = "The number of out of order delete block transaction.") private MutableCounterLong outOfOrderDeleteBlockTransactionCount; @@ -79,16 +76,11 @@ public final class BlockDeletingServiceMetrics { @Metric(about = "The total number of transactions which failed due to container lock wait timeout.") private MutableGaugeLong totalLockTimeoutTransactionCount; - @Metric(about = "The total number of deletion commands received.") - private MutableGaugeLong totalCommandsReceived; - - @Metric(about = "The total number of deletion commands that were discarded " + - "due to the queue being full.") - private MutableGaugeLong totalCommandsDiscarded; + @Metric(about = "The number of delete block transactions successful.") + private MutableCounterLong processedTransactionSuccessCount; - @Metric(about = "The total number of deletion transactions that were discarded " + - "due to the transaction being a duplicate.") - private MutableGaugeLong totalTransactionsDiscarded; + @Metric(about = "The number of delete block transactions failed.") + private MutableGaugeLong processedTransactionFailCount; private BlockDeletingServiceMetrics() { } @@ -124,8 +116,12 @@ public void incrFailureCount() { this.failureCount.incr(); } - public void incrProcessedTransactionCount(long count) { - processedTransactionCount.incr(count); + public void incrProcessedTransactionSuccessCount(long count) { + processedTransactionSuccessCount.incr(count); + } + + public void incrProcessedTransactionFailCount(long count) { + processedTransactionFailCount.incr(count); } public void incrReceivedTransactionCount(long count) { @@ -164,18 +160,6 @@ public void incrTotalLockTimeoutTransactionCount() { totalLockTimeoutTransactionCount.incr(); } - public void incrTotalCommandsReceived(long delta) { - this.totalCommandsReceived.incr(delta); - } - - public void incrTotalCommandsDiscarded(long delta) { - this.totalCommandsDiscarded.incr(delta); - } - - public void incrTotalTransactionsDiscarded(long delta) { - this.totalTransactionsDiscarded.incr(delta); - } - public long getSuccessCount() { return successCount.value(); } @@ -212,24 +196,16 @@ public long getTotalLockTimeoutTransactionCount() { return totalLockTimeoutTransactionCount.value(); } - public long getProcessedTransactionCount() { - return processedTransactionCount.value(); - } - public long getReceivedTransactionCount() { return receivedTransactionCount.value(); } - public long getTotalCommandsReceived() { - return totalCommandsReceived.value(); - } - - public long getTotalCommandsDiscarded() { - return totalCommandsDiscarded.value(); + public long getProcessedTransactionSuccessCount() { + return processedTransactionSuccessCount.value(); } - public long getTotalTransactionsDiscarded() { - return totalTransactionsDiscarded.value(); + public long getProcessedTransactionFailCount() { + return processedTransactionFailCount.value(); } @Override @@ -250,8 +226,10 @@ public String toString() { + receivedTransactionCount.value()).append("\t") .append("receivedRetryTransactionCount = " + receivedRetryTransactionCount.value()).append("\t") - .append("processedTransactionCount = " - + processedTransactionCount.value()).append("\t") + .append("processedTransactionSuccessCount = " + + processedTransactionSuccessCount.value()).append("\t") + .append("processedTransactionFailCount = " + + processedTransactionFailCount.value()).append("\t") .append("receivedContainerCount = " + receivedContainerCount.value()).append("\t") .append("receivedBlockCount = " @@ -259,13 +237,7 @@ public String toString() { .append("markedBlockCount = " + markedBlockCount.value()).append("\t") .append("totalLockTimeoutTransactionCount = " - + totalLockTimeoutTransactionCount.value()).append("\t") - .append("totalCommandsReceived = " - + totalCommandsReceived.value()).append("\t") - .append("totalCommandsDiscarded = " - + totalCommandsDiscarded.value()).append("\t") - .append("totalTransactionsDiscarded = " - + totalTransactionsDiscarded.value()).append("\t"); + + totalLockTimeoutTransactionCount.value()).append("\t"); return buffer.toString(); } } diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/DeleteBlocksCommandHandler.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/DeleteBlocksCommandHandler.java index ef0787db7c4..6a158f51023 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/DeleteBlocksCommandHandler.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/DeleteBlocksCommandHandler.java @@ -144,7 +144,6 @@ public void handle(SCMCommand command, OzoneContainer container, container, context, connectionManager); try { deleteCommandQueues.add(cmd); - blockDeleteMetrics.incrTotalCommandsReceived(1); } catch (IllegalStateException e) { String dnId = context.getParent().getDatanodeDetails().getUuidString(); Consumer updateFailure = (cmdStatus) -> { @@ -158,7 +157,6 @@ public void handle(SCMCommand command, OzoneContainer container, }; updateCommandStatus(cmd.getContext(), cmd.getCmd(), updateFailure, LOG); LOG.warn("Command is discarded because of the command queue is full"); - blockDeleteMetrics.incrTotalCommandsDiscarded(1); } } @@ -392,8 +390,14 @@ private void processCmd(DeleteCmdInfo cmd) { LOG.debug("Sending following block deletion ACK to SCM"); for (DeleteBlockTransactionResult result : blockDeletionACK .getResultsList()) { + boolean success = result.getSuccess(); LOG.debug("TxId = {} : ContainerId = {} : {}", - result.getTxID(), result.getContainerID(), result.getSuccess()); + result.getTxID(), result.getContainerID(), success); + if (success) { + blockDeleteMetrics.incrProcessedTransactionSuccessCount(1); + } else { + blockDeleteMetrics.incrProcessedTransactionFailCount(1); + } } } } @@ -464,7 +468,6 @@ public List> submitTasks( Future future = executor.submit(new ProcessTransactionTask(tx)); futures.add(future); - blockDeleteMetrics.incrProcessedTransactionCount(1); } return futures; } @@ -653,7 +656,6 @@ public static boolean isDuplicateTransaction(long containerId, KeyValueContainer containerData.getDeleteTransactionId())); } else if (delTX.getTxID() == containerData.getDeleteTransactionId()) { duplicate = true; - metrics.incrTotalTransactionsDiscarded(1); LOG.info(String.format("Delete blocks with txID %d for containerId: %d" + " is retried.", delTX.getTxID(), containerId)); } else { diff --git a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/TestDeleteBlocksCommandHandler.java b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/TestDeleteBlocksCommandHandler.java index bad6409a802..3049bbb4e1e 100644 --- a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/TestDeleteBlocksCommandHandler.java +++ b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/TestDeleteBlocksCommandHandler.java @@ -332,8 +332,6 @@ public void testDeleteBlockCommandHandleWhenDeleteCommandQueuesFull() } } blockDeleteMetrics = handler.getBlockDeleteMetrics(); - assertEquals(5, blockDeleteMetrics.getTotalCommandsReceived()); - assertEquals(2, blockDeleteMetrics.getTotalCommandsDiscarded()); } @ContainerTestVersionInfo.ContainerTest @@ -370,7 +368,6 @@ public void testDuplicateDeleteBlocksCommand( assertTrue(results3.get(0).getSuccess()); assertEquals(0, blockDeleteMetrics.getTotalLockTimeoutTransactionCount()); - assertEquals(1, blockDeleteMetrics.getTotalTransactionsDiscarded()); // Duplicate cmd content will not be persisted. assertEquals(2, ((KeyValueContainerData) container.getContainerData()).getNumPendingDeletionBlocks()); From 1a544b49a95e2020005c49a0eb9434cbde46081b Mon Sep 17 00:00:00 2001 From: tejaskriya Date: Mon, 16 Dec 2024 14:10:32 +0530 Subject: [PATCH 3/3] Clean up unused code --- .../container/common/helpers/BlockDeletingServiceMetrics.java | 4 ---- .../commandhandler/TestDeleteBlocksCommandHandler.java | 1 - 2 files changed, 5 deletions(-) diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/helpers/BlockDeletingServiceMetrics.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/helpers/BlockDeletingServiceMetrics.java index 3d4ca64cbc1..80c390f3b83 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/helpers/BlockDeletingServiceMetrics.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/helpers/BlockDeletingServiceMetrics.java @@ -196,10 +196,6 @@ public long getTotalLockTimeoutTransactionCount() { return totalLockTimeoutTransactionCount.value(); } - public long getReceivedTransactionCount() { - return receivedTransactionCount.value(); - } - public long getProcessedTransactionSuccessCount() { return processedTransactionSuccessCount.value(); } diff --git a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/TestDeleteBlocksCommandHandler.java b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/TestDeleteBlocksCommandHandler.java index 3049bbb4e1e..dcabad46ac5 100644 --- a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/TestDeleteBlocksCommandHandler.java +++ b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/TestDeleteBlocksCommandHandler.java @@ -331,7 +331,6 @@ public void testDeleteBlockCommandHandleWhenDeleteCommandQueuesFull() assertEquals(cmdStatus.getProtoBufMessage().getBlockDeletionAck().getResultsCount(), 0); } } - blockDeleteMetrics = handler.getBlockDeleteMetrics(); } @ContainerTestVersionInfo.ContainerTest