From a6f60ebed2de7b75f806c55d251baeb9f9f19bdc Mon Sep 17 00:00:00 2001 From: Shashikant Banerjee Date: Mon, 5 Aug 2019 13:50:47 +0530 Subject: [PATCH 1/5] HDDS-1610. applyTransaction failure should not be lost on restart. --- .../server/ratis/ContainerStateMachine.java | 50 +++++++++--- .../server/ratis/XceiverServerRatis.java | 14 ++++ .../TestContainerStateMachineFailures.java | 80 ++++++++++++++++++- 3 files changed, 130 insertions(+), 14 deletions(-) diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/ContainerStateMachine.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/ContainerStateMachine.java index f4d4744d5a688..e2fbc7c60a67b 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/ContainerStateMachine.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/ContainerStateMachine.java @@ -674,30 +674,54 @@ public CompletableFuture applyTransaction(TransactionContext trx) { if (cmdType == Type.WriteChunk || cmdType ==Type.PutSmallFile) { builder.setCreateContainerSet(createContainerSet); } + CompletableFuture applyTransactionFuture = + new CompletableFuture<>(); // Ensure the command gets executed in a separate thread than // stateMachineUpdater thread which is calling applyTransaction here. - CompletableFuture future = CompletableFuture - .supplyAsync(() -> runCommand(requestProto, builder.build()), + CompletableFuture future = + CompletableFuture.supplyAsync( + () -> runCommandGetResponse(requestProto, builder.build()), getCommandExecutor(requestProto)); - - future.thenAccept(m -> { + future.thenApply(r -> { if (trx.getServerRole() == RaftPeerRole.LEADER) { long startTime = (long) trx.getStateMachineContext(); metrics.incPipelineLatency(cmdType, Time.monotonicNowNanos() - startTime); } - - final Long previous = - applyTransactionCompletionMap - .put(index, trx.getLogEntry().getTerm()); - Preconditions.checkState(previous == null); - if (cmdType == Type.WriteChunk || cmdType == Type.PutSmallFile) { - metrics.incNumBytesCommittedCount( + if (r.getResult() != ContainerProtos.Result.SUCCESS) { + StorageContainerException sce = + new StorageContainerException(r.getMessage(), r.getResult()); + LOG.error(gid + ": ApplyTransaction failed: cmd " + r.getCmdType() + + " logIndex " + index + " Error message: " + r.getMessage() + + " Container Result: " + r.getResult()); + metrics.incNumApplyTransactionsFails(); + ratisServer.handleApplyTransactionFailure(gid, trx.getServerRole()); + // Since the applyTransaction now is completed exceptionally, + // before any further snapshot is taken , the exception will be + // caught in stateMachineUpdater in Ratis and ratis server will + // shutdown. 
+ applyTransactionFuture.completeExceptionally(sce); + } else { + metrics.incNumBytesWrittenCount( requestProto.getWriteChunk().getChunkData().getLen()); + LOG.debug(gid + ": ApplyTransaction completed: cmd " + r.getCmdType() + + " logIndex " + index + " Error message: " + r.getMessage() + + " Container Result: " + r.getResult()); + applyTransactionFuture.complete(r::toByteString); + if (cmdType == Type.WriteChunk || cmdType == Type.PutSmallFile) { + metrics.incNumBytesCommittedCount( + requestProto.getWriteChunk().getChunkData().getLen()); + } } + + final Long previous = applyTransactionCompletionMap + .put(index, trx.getLogEntry().getTerm()); + Preconditions.checkState(previous == null); updateLastApplied(); - }).whenComplete((r, t) -> applyTransactionSemaphore.release()); - return future; + applyTransactionSemaphore.release(); + return applyTransactionFuture; + }); + return applyTransactionFuture; } catch (IOException | InterruptedException e) { metrics.incNumApplyTransactionsFails(); return completeExceptionally(e); diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/XceiverServerRatis.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/XceiverServerRatis.java index 3a8b79b5bc124..eaba440e7e222 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/XceiverServerRatis.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/XceiverServerRatis.java @@ -609,6 +609,16 @@ void handleNoLeader(RaftGroupId groupId, RoleInfoProto roleInfoProto) { handlePipelineFailure(groupId, roleInfoProto); } + void handleApplyTransactionFailure(RaftGroupId groupId, + RaftProtos.RaftPeerRole role) { + UUID dnId = RatisHelper.toDatanodeId(getServer().getId()); + String msg = + "Ratis Transaction failure in datanode" + dnId + " with role " + role + + " Triggering pipeline close action."; + triggerPipelineClose(groupId, msg, ClosePipelineInfo.Reason.PIPELINE_FAILED, + false); + stop(); + } /** * The fact that the snapshot contents cannot be used to actually catch up * the follower, it is the reason to initiate close pipeline and @@ -630,6 +640,10 @@ void handleInstallSnapshotFromLeader(RaftGroupId groupId, handlePipelineFailure(groupId, roleInfoProto); } + @VisibleForTesting + public boolean isClosed() { + return !isStarted; + } /** * Notify the Datanode Ratis endpoint of Ratis log failure. 
* Expected to be invoked from the Container StateMachine diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/client/rpc/TestContainerStateMachineFailures.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/client/rpc/TestContainerStateMachineFailures.java index 469eeb0adee9d..5a3bcf0b6517c 100644 --- a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/client/rpc/TestContainerStateMachineFailures.java +++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/client/rpc/TestContainerStateMachineFailures.java @@ -23,6 +23,11 @@ import org.apache.hadoop.hdds.conf.OzoneConfiguration; import org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos; import org.apache.hadoop.hdds.protocol.proto.HddsProtos; +import org.apache.hadoop.hdds.scm.XceiverClientManager; +import org.apache.hadoop.hdds.scm.XceiverClientSpi; +import org.apache.hadoop.hdds.scm.container.common.helpers.StorageContainerException; +import org.apache.hadoop.hdds.scm.pipeline.Pipeline; +import org.apache.hadoop.hdds.scm.pipeline.PipelineNotFoundException; import org.apache.hadoop.ozone.MiniOzoneCluster; import org.apache.hadoop.ozone.OzoneConsts; import org.apache.hadoop.ozone.client.ObjectStore; @@ -34,11 +39,14 @@ import org.apache.hadoop.ozone.container.common.impl.ContainerData; import org.apache.hadoop.ozone.container.common.impl.ContainerDataYaml; import org.apache.hadoop.ozone.container.common.impl.HddsDispatcher; +import org.apache.hadoop.ozone.container.common.transport.server.ratis.XceiverServerRatis; import org.apache.hadoop.ozone.container.keyvalue.KeyValueContainerData; import org.apache.hadoop.ozone.container.ozoneimpl.OzoneContainer; import org.apache.hadoop.ozone.om.helpers.OmKeyArgs; import org.apache.hadoop.ozone.om.helpers.OmKeyLocationInfo; import org.apache.hadoop.test.GenericTestUtils; +import org.apache.ratis.protocol.StateMachineException; +import org.apache.ratis.util.LifeCycle; import org.junit.AfterClass; import org.junit.Assert; import org.junit.BeforeClass; @@ -77,7 +85,7 @@ public class TestContainerStateMachineFailures { private static String volumeName; private static String bucketName; private static String path; - private static int chunkSize; + private static XceiverClientManager xceiverClientManager; /** * Create a MiniDFSCluster for testing. @@ -109,6 +117,7 @@ public static void init() throws Exception { //the easiest way to create an open container is creating a key client = OzoneClientFactory.getClient(conf); objectStore = client.getObjectStore(); + xceiverClientManager = new XceiverClientManager(conf); volumeName = "testcontainerstatemachinefailures"; bucketName = volumeName; objectStore.createVolume(volumeName); @@ -270,4 +279,73 @@ public void testUnhealthyContainer() throws Exception { Assert.assertEquals(ContainerProtos.Result.CONTAINER_UNHEALTHY, dispatcher.dispatch(request.build(), null).getResult()); } + + @Test + public void testAppyTransactionFailure() throws Exception { + OzoneOutputStream key = + objectStore.getVolume(volumeName).getBucket(bucketName) + .createKey("ratis", 1024, ReplicationType.RATIS, + ReplicationFactor.ONE, new HashMap<>()); + // First write and flush creates a container in the datanode + key.write("ratis".getBytes()); + key.flush(); + key.write("ratis".getBytes()); + + //get the name of a valid container + OmKeyArgs keyArgs = new OmKeyArgs.Builder().setVolumeName(volumeName). 
+ setBucketName(bucketName).setType(HddsProtos.ReplicationType.RATIS) + .setFactor(HddsProtos.ReplicationFactor.ONE).setKeyName("ratis") + .build(); + KeyOutputStream groupOutputStream = (KeyOutputStream) key.getOutputStream(); + List locationInfoList = + groupOutputStream.getLocationInfoList(); + Assert.assertEquals(1, locationInfoList.size()); + OmKeyLocationInfo omKeyLocationInfo = locationInfoList.get(0); + ContainerData containerData = + cluster.getHddsDatanodes().get(0).getDatanodeStateMachine() + .getContainer().getContainerSet() + .getContainer(omKeyLocationInfo.getContainerID()) + .getContainerData(); + Assert.assertTrue(containerData instanceof KeyValueContainerData); + KeyValueContainerData keyValueContainerData = + (KeyValueContainerData) containerData; + key.close(); + + long containerID = omKeyLocationInfo.getContainerID(); + // delete the container db file + FileUtil.fullyDelete(new File(keyValueContainerData.getContainerPath())); + Pipeline pipeline = cluster.getStorageContainerLocationClient() + .getContainerWithPipeline(containerID).getPipeline(); + XceiverClientSpi client = xceiverClientManager.acquireClient(pipeline); + ContainerProtos.ContainerCommandRequestProto.Builder request = + ContainerProtos.ContainerCommandRequestProto.newBuilder(); + request.setDatanodeUuid(pipeline.getFirstNode().getUuidString()); + request.setCmdType(ContainerProtos.Type.CloseContainer); + request.setContainerID(containerID); + request.setCloseContainer( + ContainerProtos.CloseContainerRequestProto.getDefaultInstance()); + // close container transaction will fail over Ratis and will cause the raft + try { + client.sendCommand(request.build()); + Assert.fail("Expected exception not thrown"); + } catch (IOException e) { + } + + // Make sure the container is marked unhealthy + Assert.assertTrue( + cluster.getHddsDatanodes().get(0).getDatanodeStateMachine() + .getContainer().getContainerSet().getContainer(containerID) + .getContainerState() + == ContainerProtos.ContainerDataProto.State.UNHEALTHY); + XceiverServerRatis raftServer = (XceiverServerRatis) + cluster.getHddsDatanodes().get(0).getDatanodeStateMachine() + .getContainer().getWriteChannel(); + Assert.assertTrue(raftServer.isClosed()); + try { + cluster.getStorageContainerManager().getPipelineManager() + .getPipeline(pipeline.getId()); + Assert.fail("Expected exception not thrown"); + } catch(PipelineNotFoundException e) { + } + } } \ No newline at end of file From 529a05bba903dc3ede7214bfe96a11d60e0474d3 Mon Sep 17 00:00:00 2001 From: Shashikant Banerjee Date: Tue, 13 Aug 2019 15:54:59 +0530 Subject: [PATCH 2/5] Addressed review comments. 
--- .../server/ratis/ContainerStateMachine.java | 63 +++++++++++-------- .../server/ratis/XceiverServerRatis.java | 13 ++-- .../StorageContainerDatanodeProtocol.proto | 1 + .../TestContainerStateMachineFailures.java | 56 +++++++++++------ .../ozone/container/ContainerTestHelper.java | 16 +++++ .../TestFreonWithDatanodeFastRestart.java | 10 +-- 6 files changed, 99 insertions(+), 60 deletions(-) diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/ContainerStateMachine.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/ContainerStateMachine.java index e2fbc7c60a67b..bc3d56c7a2da6 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/ContainerStateMachine.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/ContainerStateMachine.java @@ -34,6 +34,7 @@ import org.apache.hadoop.util.Time; import org.apache.ratis.proto.RaftProtos.RaftPeerRole; import org.apache.ratis.protocol.RaftGroupId; +import org.apache.ratis.protocol.StateMachineException; import org.apache.ratis.server.RaftServer; import org.apache.ratis.server.impl.RaftServerProxy; import org.apache.ratis.server.protocol.TermIndex; @@ -83,6 +84,7 @@ import java.util.concurrent.Semaphore; import java.util.concurrent.TimeUnit; import java.util.concurrent.ExecutionException; +import java.util.concurrent.atomic.AtomicBoolean; import java.util.stream.Collectors; import java.util.Set; import java.util.concurrent.ConcurrentSkipListSet; @@ -147,6 +149,7 @@ public class ContainerStateMachine extends BaseStateMachine { private final Cache stateMachineDataCache; private final boolean isBlockTokenEnabled; private final TokenVerifier tokenVerifier; + private final AtomicBoolean isStateMachineHealthy; private final Semaphore applyTransactionSemaphore; /** @@ -184,6 +187,7 @@ public ContainerStateMachine(RaftGroupId gid, ContainerDispatcher dispatcher, ScmConfigKeys. DFS_CONTAINER_RATIS_STATEMACHINE_MAX_PENDING_APPLY_TXNS_DEFAULT); applyTransactionSemaphore = new Semaphore(maxPendingApplyTransactions); + isStateMachineHealthy = new AtomicBoolean(true); this.executors = new ExecutorService[numContainerOpExecutors]; for (int i = 0; i < numContainerOpExecutors; i++) { final int index = i; @@ -265,6 +269,13 @@ public void persistContainerSet(OutputStream out) throws IOException { public long takeSnapshot() throws IOException { TermIndex ti = getLastAppliedTermIndex(); long startTime = Time.monotonicNow(); + if (!isStateMachineHealthy.get()) { + String msg = + "Failed to take snapshot " + " for " + gid + " as the stateMachine" + + " is unhealthy. 
The last applied index is at " + ti; + StateMachineException sme = new StateMachineException(msg); + throw sme; + } if (ti != null && ti.getIndex() != RaftLog.INVALID_LOG_INDEX) { final File snapshotFile = storage.getSnapshotFile(ti.getTerm(), ti.getIndex()); @@ -275,12 +286,12 @@ public long takeSnapshot() throws IOException { // make sure the snapshot file is synced fos.getFD().sync(); } catch (IOException ioe) { - LOG.info("{}: Failed to write snapshot at:{} file {}", gid, ti, + LOG.error("{}: Failed to write snapshot at:{} file {}", gid, ti, snapshotFile); throw ioe; } - LOG.info("{}: Finished taking a snapshot at:{} file:{} time:{}", - gid, ti, snapshotFile, (Time.monotonicNow() - startTime)); + LOG.info("{}: Finished taking a snapshot at:{} file:{} time:{}", gid, ti, + snapshotFile, (Time.monotonicNow() - startTime)); return ti.getIndex(); } return -1; @@ -385,17 +396,12 @@ private ContainerCommandResponseProto dispatchCommand( return response; } - private ContainerCommandResponseProto runCommandGetResponse( + private ContainerCommandResponseProto runCommand( ContainerCommandRequestProto requestProto, DispatcherContext context) { return dispatchCommand(requestProto, context); } - private Message runCommand(ContainerCommandRequestProto requestProto, - DispatcherContext context) { - return runCommandGetResponse(requestProto, context)::toByteString; - } - private ExecutorService getCommandExecutor( ContainerCommandRequestProto requestProto) { int executorId = (int)(requestProto.getContainerID() % executors.length); @@ -425,7 +431,7 @@ private CompletableFuture handleWriteChunk( // thread. CompletableFuture writeChunkFuture = CompletableFuture.supplyAsync(() -> - runCommandGetResponse(requestProto, context), chunkExecutor); + runCommand(requestProto, context), chunkExecutor); CompletableFuture raftFuture = new CompletableFuture<>(); @@ -502,7 +508,8 @@ public CompletableFuture query(Message request) { metrics.incNumQueryStateMachineOps(); final ContainerCommandRequestProto requestProto = getContainerCommandRequestProto(request.getContent()); - return CompletableFuture.completedFuture(runCommand(requestProto, null)); + return CompletableFuture + .completedFuture(runCommand(requestProto, null)::toByteString); } catch (IOException e) { metrics.incNumQueryStateMachineFails(); return completeExceptionally(e); @@ -680,7 +687,7 @@ public CompletableFuture applyTransaction(TransactionContext trx) { // stateMachineUpdater thread which is calling applyTransaction here. CompletableFuture future = CompletableFuture.supplyAsync( - () -> runCommandGetResponse(requestProto, builder.build()), + () -> runCommand(requestProto, builder.build()), getCommandExecutor(requestProto)); future.thenApply(r -> { if (trx.getServerRole() == RaftPeerRole.LEADER) { @@ -691,9 +698,10 @@ public CompletableFuture applyTransaction(TransactionContext trx) { if (r.getResult() != ContainerProtos.Result.SUCCESS) { StorageContainerException sce = new StorageContainerException(r.getMessage(), r.getResult()); - LOG.error(gid + ": ApplyTransaction failed: cmd " + r.getCmdType() - + " logIndex " + index + " Error message: " + r.getMessage() - + " Container Result: " + r.getResult()); + LOG.error( + "gid {} : ApplyTransaction failed. 
cmd {} logIndex {} msg : " + + "{} Container Result: {}", gid, r.getCmdType(), index, + r.getMessage(), r.getResult()); metrics.incNumApplyTransactionsFails(); ratisServer.handleApplyTransactionFailure(gid, trx.getServerRole()); // Since the applyTransaction now is completed exceptionally, @@ -701,26 +709,31 @@ public CompletableFuture applyTransaction(TransactionContext trx) { // caught in stateMachineUpdater in Ratis and ratis server will // shutdown. applyTransactionFuture.completeExceptionally(sce); + isStateMachineHealthy.compareAndSet(true, false); } else { metrics.incNumBytesWrittenCount( requestProto.getWriteChunk().getChunkData().getLen()); - LOG.debug(gid + ": ApplyTransaction completed: cmd " + r.getCmdType() - + " logIndex " + index + " Error message: " + r.getMessage() - + " Container Result: " + r.getResult()); + LOG.debug( + "gid {} : ApplyTransaction completed. cmd {} logIndex {} msg : " + + "{} Container Result: {}", gid, r.getCmdType(), index, + r.getMessage(), r.getResult()); applyTransactionFuture.complete(r::toByteString); if (cmdType == Type.WriteChunk || cmdType == Type.PutSmallFile) { metrics.incNumBytesCommittedCount( requestProto.getWriteChunk().getChunkData().getLen()); } + // add the entry to the applyTransactionCompletionMap only if the + // stateMachine is healthy i.e, there has been no applyTransaction + // failures before. + if (isStateMachineHealthy.get()) { + final Long previous = applyTransactionCompletionMap + .put(index, trx.getLogEntry().getTerm()); + Preconditions.checkState(previous == null); + updateLastApplied(); + } } - - final Long previous = applyTransactionCompletionMap - .put(index, trx.getLogEntry().getTerm()); - Preconditions.checkState(previous == null); - updateLastApplied(); - applyTransactionSemaphore.release(); return applyTransactionFuture; - }); + }).whenComplete((r, t) -> applyTransactionSemaphore.release()); return applyTransactionFuture; } catch (IOException | InterruptedException e) { metrics.incNumApplyTransactionsFails(); diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/XceiverServerRatis.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/XceiverServerRatis.java index eaba440e7e222..30257a7496889 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/XceiverServerRatis.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/XceiverServerRatis.java @@ -613,11 +613,10 @@ void handleApplyTransactionFailure(RaftGroupId groupId, RaftProtos.RaftPeerRole role) { UUID dnId = RatisHelper.toDatanodeId(getServer().getId()); String msg = - "Ratis Transaction failure in datanode" + dnId + " with role " + role - + " Triggering pipeline close action."; - triggerPipelineClose(groupId, msg, ClosePipelineInfo.Reason.PIPELINE_FAILED, - false); - stop(); + "Ratis Transaction failure in datanode " + dnId + " with role " + role + + " .Triggering pipeline close action."; + triggerPipelineClose(groupId, msg, + ClosePipelineInfo.Reason.STATEMACHINE_TRANSACTION_FAILED, true); } /** * The fact that the snapshot contents cannot be used to actually catch up @@ -640,10 +639,6 @@ void handleInstallSnapshotFromLeader(RaftGroupId groupId, handlePipelineFailure(groupId, roleInfoProto); } - @VisibleForTesting - public boolean isClosed() { - return !isStarted; - } /** * Notify the Datanode Ratis endpoint 
of Ratis log failure. * Expected to be invoked from the Container StateMachine diff --git a/hadoop-hdds/container-service/src/main/proto/StorageContainerDatanodeProtocol.proto b/hadoop-hdds/container-service/src/main/proto/StorageContainerDatanodeProtocol.proto index 500735a35ce42..1d09dfa902bc3 100644 --- a/hadoop-hdds/container-service/src/main/proto/StorageContainerDatanodeProtocol.proto +++ b/hadoop-hdds/container-service/src/main/proto/StorageContainerDatanodeProtocol.proto @@ -214,6 +214,7 @@ message ClosePipelineInfo { enum Reason { PIPELINE_FAILED = 1; PIPELINE_LOG_FAILED = 2; + STATEMACHINE_TRANSACTION_FAILED = 3; } required PipelineID pipelineID = 1; optional Reason reason = 3; diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/client/rpc/TestContainerStateMachineFailures.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/client/rpc/TestContainerStateMachineFailures.java index 5a3bcf0b6517c..9a0da3ef1fed5 100644 --- a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/client/rpc/TestContainerStateMachineFailures.java +++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/client/rpc/TestContainerStateMachineFailures.java @@ -25,10 +25,10 @@ import org.apache.hadoop.hdds.protocol.proto.HddsProtos; import org.apache.hadoop.hdds.scm.XceiverClientManager; import org.apache.hadoop.hdds.scm.XceiverClientSpi; -import org.apache.hadoop.hdds.scm.container.common.helpers.StorageContainerException; +import org.apache.hadoop.hdds.scm.client.HddsClientUtils; import org.apache.hadoop.hdds.scm.pipeline.Pipeline; -import org.apache.hadoop.hdds.scm.pipeline.PipelineNotFoundException; import org.apache.hadoop.ozone.MiniOzoneCluster; +import org.apache.hadoop.ozone.OzoneConfigKeys; import org.apache.hadoop.ozone.OzoneConsts; import org.apache.hadoop.ozone.client.ObjectStore; import org.apache.hadoop.ozone.client.OzoneClient; @@ -36,17 +36,20 @@ import org.apache.hadoop.ozone.client.OzoneKeyDetails; import org.apache.hadoop.ozone.client.io.KeyOutputStream; import org.apache.hadoop.ozone.client.io.OzoneOutputStream; +import org.apache.hadoop.ozone.container.ContainerTestHelper; import org.apache.hadoop.ozone.container.common.impl.ContainerData; import org.apache.hadoop.ozone.container.common.impl.ContainerDataYaml; import org.apache.hadoop.ozone.container.common.impl.HddsDispatcher; -import org.apache.hadoop.ozone.container.common.transport.server.ratis.XceiverServerRatis; +import org.apache.hadoop.ozone.container.common.transport.server.ratis.ContainerStateMachine; import org.apache.hadoop.ozone.container.keyvalue.KeyValueContainerData; import org.apache.hadoop.ozone.container.ozoneimpl.OzoneContainer; import org.apache.hadoop.ozone.om.helpers.OmKeyArgs; import org.apache.hadoop.ozone.om.helpers.OmKeyLocationInfo; import org.apache.hadoop.test.GenericTestUtils; +import org.apache.ratis.protocol.RaftRetryFailureException; import org.apache.ratis.protocol.StateMachineException; -import org.apache.ratis.util.LifeCycle; +import org.apache.ratis.server.storage.FileInfo; +import org.apache.ratis.statemachine.impl.SimpleStateMachineStorage; import org.junit.AfterClass; import org.junit.Assert; import org.junit.BeforeClass; @@ -54,6 +57,7 @@ import java.io.File; import java.io.IOException; +import java.nio.file.Path; import java.util.HashMap; import java.util.List; import java.util.concurrent.TimeUnit; @@ -110,6 +114,7 @@ public static void init() throws Exception { 
conf.setTimeDuration(OZONE_SCM_PIPELINE_DESTROY_TIMEOUT, 10, TimeUnit.SECONDS); conf.setQuietMode(false); + conf.setLong(OzoneConfigKeys.DFS_RATIS_SNAPSHOT_THRESHOLD_KEY , 1); cluster = MiniOzoneCluster.newBuilder(conf).setNumDatanodes(1).setHbInterval(200) .build(); @@ -281,7 +286,7 @@ public void testUnhealthyContainer() throws Exception { } @Test - public void testAppyTransactionFailure() throws Exception { + public void testApplyTransactionFailure() throws Exception { OzoneOutputStream key = objectStore.getVolume(volumeName).getBucket(bucketName) .createKey("ratis", 1024, ReplicationType.RATIS, @@ -310,13 +315,22 @@ public void testAppyTransactionFailure() throws Exception { KeyValueContainerData keyValueContainerData = (KeyValueContainerData) containerData; key.close(); - + ContainerStateMachine stateMachine = + (ContainerStateMachine)ContainerTestHelper.getStateMachine(cluster); + SimpleStateMachineStorage storage = + (SimpleStateMachineStorage) stateMachine.getStateMachineStorage(); + Path path = storage.findLatestSnapshot().getFile().getPath(); + // Since the snapshot threshold is set to 1, since there are + // applyTransactions, we should see snapshots + Assert.assertTrue(path.getParent().toFile().listFiles().length > 0); + FileInfo snapshot = storage.findLatestSnapshot().getFile(); + Assert.assertNotNull(snapshot); long containerID = omKeyLocationInfo.getContainerID(); // delete the container db file FileUtil.fullyDelete(new File(keyValueContainerData.getContainerPath())); Pipeline pipeline = cluster.getStorageContainerLocationClient() .getContainerWithPipeline(containerID).getPipeline(); - XceiverClientSpi client = xceiverClientManager.acquireClient(pipeline); + XceiverClientSpi xceiverClient = xceiverClientManager.acquireClient(pipeline); ContainerProtos.ContainerCommandRequestProto.Builder request = ContainerProtos.ContainerCommandRequestProto.newBuilder(); request.setDatanodeUuid(pipeline.getFirstNode().getUuidString()); @@ -324,28 +338,34 @@ public void testAppyTransactionFailure() throws Exception { request.setContainerID(containerID); request.setCloseContainer( ContainerProtos.CloseContainerRequestProto.getDefaultInstance()); - // close container transaction will fail over Ratis and will cause the raft + // close container transaction will fail over Ratis and will initiate + // a pipeline close action + + // Since the applyTransaction failure is propagated to Ratis, + // stateMachineUpdater will it exception while taking the next snapshot + // and should shutdown the RaftServerImpl. The client request will fail + // with RaftRetryFailureException. 
try { - client.sendCommand(request.build()); + xceiverClient.sendCommand(request.build()); Assert.fail("Expected exception not thrown"); } catch (IOException e) { + Assert.assertTrue(HddsClientUtils + .checkForException(e) instanceof RaftRetryFailureException); } - // Make sure the container is marked unhealthy Assert.assertTrue( cluster.getHddsDatanodes().get(0).getDatanodeStateMachine() .getContainer().getContainerSet().getContainer(containerID) .getContainerState() == ContainerProtos.ContainerDataProto.State.UNHEALTHY); - XceiverServerRatis raftServer = (XceiverServerRatis) - cluster.getHddsDatanodes().get(0).getDatanodeStateMachine() - .getContainer().getWriteChannel(); - Assert.assertTrue(raftServer.isClosed()); try { - cluster.getStorageContainerManager().getPipelineManager() - .getPipeline(pipeline.getId()); - Assert.fail("Expected exception not thrown"); - } catch(PipelineNotFoundException e) { + // try to take a new snapshot, ideally it should just fail + stateMachine.takeSnapshot(); + } catch(IOException ioe) { + Assert.assertTrue(ioe instanceof StateMachineException); } + // Make sure the latest snapshot is same as the previous one + FileInfo latestSnapshot = storage.findLatestSnapshot().getFile(); + Assert.assertTrue(snapshot.getPath().equals(latestSnapshot.getPath())); } } \ No newline at end of file diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/container/ContainerTestHelper.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/container/ContainerTestHelper.java index 4da190762b01d..82d34d74e1440 100644 --- a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/container/ContainerTestHelper.java +++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/container/ContainerTestHelper.java @@ -73,6 +73,10 @@ import com.google.common.base.Preconditions; import org.apache.hadoop.test.GenericTestUtils; +import org.apache.ratis.protocol.RaftGroupId; +import org.apache.ratis.server.impl.RaftServerImpl; +import org.apache.ratis.server.impl.RaftServerProxy; +import org.apache.ratis.statemachine.StateMachine; import org.apache.ratis.thirdparty.com.google.protobuf.ByteString; import org.junit.Assert; import org.slf4j.Logger; @@ -866,4 +870,16 @@ public static void waitForContainerClose(MiniOzoneCluster cluster, index++; } } + + public static StateMachine getStateMachine(MiniOzoneCluster cluster) + throws Exception { + XceiverServerSpi server = + cluster.getHddsDatanodes().get(0).getDatanodeStateMachine(). 
+ getContainer().getWriteChannel(); + RaftServerProxy proxy = + (RaftServerProxy) (((XceiverServerRatis) server).getServer()); + RaftGroupId groupId = proxy.getGroupIds().iterator().next(); + RaftServerImpl impl = proxy.getImpl(groupId); + return impl.getStateMachine(); + } } diff --git a/hadoop-ozone/tools/src/test/java/org/apache/hadoop/ozone/freon/TestFreonWithDatanodeFastRestart.java b/hadoop-ozone/tools/src/test/java/org/apache/hadoop/ozone/freon/TestFreonWithDatanodeFastRestart.java index 8ed3960287130..2b4002e118a60 100644 --- a/hadoop-ozone/tools/src/test/java/org/apache/hadoop/ozone/freon/TestFreonWithDatanodeFastRestart.java +++ b/hadoop-ozone/tools/src/test/java/org/apache/hadoop/ozone/freon/TestFreonWithDatanodeFastRestart.java @@ -22,6 +22,7 @@ import org.apache.hadoop.hdds.client.ReplicationType; import org.apache.hadoop.hdds.conf.OzoneConfiguration; import org.apache.hadoop.ozone.MiniOzoneCluster; +import org.apache.hadoop.ozone.container.ContainerTestHelper; import org.apache.hadoop.ozone.container.common.transport .server.XceiverServerSpi; import org.apache.hadoop.ozone.container.common.transport.server.ratis @@ -127,13 +128,6 @@ private void startFreon() throws Exception { } private StateMachine getStateMachine() throws Exception { - XceiverServerSpi server = - cluster.getHddsDatanodes().get(0).getDatanodeStateMachine(). - getContainer().getWriteChannel(); - RaftServerProxy proxy = - (RaftServerProxy)(((XceiverServerRatis)server).getServer()); - RaftGroupId groupId = proxy.getGroupIds().iterator().next(); - RaftServerImpl impl = proxy.getImpl(groupId); - return impl.getStateMachine(); + return ContainerTestHelper.getStateMachine(cluster); } } From 412a7ff88b47590b2e3c3bd7de68b9a4e8e4e7fb Mon Sep 17 00:00:00 2001 From: Shashikant Banerjee Date: Wed, 14 Aug 2019 14:45:52 +0530 Subject: [PATCH 3/5] Addressed review comments and checkstyle issues. --- .../transport/server/ratis/ContainerStateMachine.java | 3 ++- .../client/rpc/TestContainerStateMachineFailures.java | 9 +++++---- .../ozone/freon/TestFreonWithDatanodeFastRestart.java | 7 ------- 3 files changed, 7 insertions(+), 12 deletions(-) diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/ContainerStateMachine.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/ContainerStateMachine.java index bc3d56c7a2da6..e4e704edac7b2 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/ContainerStateMachine.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/ContainerStateMachine.java @@ -274,6 +274,7 @@ public long takeSnapshot() throws IOException { "Failed to take snapshot " + " for " + gid + " as the stateMachine" + " is unhealthy. 
The last applied index is at " + ti; StateMachineException sme = new StateMachineException(msg); + LOG.error(msg); throw sme; } if (ti != null && ti.getIndex() != RaftLog.INVALID_LOG_INDEX) { @@ -703,13 +704,13 @@ public CompletableFuture applyTransaction(TransactionContext trx) { + "{} Container Result: {}", gid, r.getCmdType(), index, r.getMessage(), r.getResult()); metrics.incNumApplyTransactionsFails(); - ratisServer.handleApplyTransactionFailure(gid, trx.getServerRole()); // Since the applyTransaction now is completed exceptionally, // before any further snapshot is taken , the exception will be // caught in stateMachineUpdater in Ratis and ratis server will // shutdown. applyTransactionFuture.completeExceptionally(sce); isStateMachineHealthy.compareAndSet(true, false); + ratisServer.handleApplyTransactionFailure(gid, trx.getServerRole()); } else { metrics.incNumBytesWrittenCount( requestProto.getWriteChunk().getChunkData().getLen()); diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/client/rpc/TestContainerStateMachineFailures.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/client/rpc/TestContainerStateMachineFailures.java index 9a0da3ef1fed5..cf8e8a7eb54a8 100644 --- a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/client/rpc/TestContainerStateMachineFailures.java +++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/client/rpc/TestContainerStateMachineFailures.java @@ -114,7 +114,7 @@ public static void init() throws Exception { conf.setTimeDuration(OZONE_SCM_PIPELINE_DESTROY_TIMEOUT, 10, TimeUnit.SECONDS); conf.setQuietMode(false); - conf.setLong(OzoneConfigKeys.DFS_RATIS_SNAPSHOT_THRESHOLD_KEY , 1); + conf.setLong(OzoneConfigKeys.DFS_RATIS_SNAPSHOT_THRESHOLD_KEY, 1); cluster = MiniOzoneCluster.newBuilder(conf).setNumDatanodes(1).setHbInterval(200) .build(); @@ -319,10 +319,10 @@ public void testApplyTransactionFailure() throws Exception { (ContainerStateMachine)ContainerTestHelper.getStateMachine(cluster); SimpleStateMachineStorage storage = (SimpleStateMachineStorage) stateMachine.getStateMachineStorage(); - Path path = storage.findLatestSnapshot().getFile().getPath(); + Path parentPath = storage.findLatestSnapshot().getFile().getPath(); // Since the snapshot threshold is set to 1, since there are // applyTransactions, we should see snapshots - Assert.assertTrue(path.getParent().toFile().listFiles().length > 0); + Assert.assertTrue(parentPath.getParent().toFile().listFiles().length > 0); FileInfo snapshot = storage.findLatestSnapshot().getFile(); Assert.assertNotNull(snapshot); long containerID = omKeyLocationInfo.getContainerID(); @@ -330,7 +330,8 @@ public void testApplyTransactionFailure() throws Exception { FileUtil.fullyDelete(new File(keyValueContainerData.getContainerPath())); Pipeline pipeline = cluster.getStorageContainerLocationClient() .getContainerWithPipeline(containerID).getPipeline(); - XceiverClientSpi xceiverClient = xceiverClientManager.acquireClient(pipeline); + XceiverClientSpi xceiverClient = + xceiverClientManager.acquireClient(pipeline); ContainerProtos.ContainerCommandRequestProto.Builder request = ContainerProtos.ContainerCommandRequestProto.newBuilder(); request.setDatanodeUuid(pipeline.getFirstNode().getUuidString()); diff --git a/hadoop-ozone/tools/src/test/java/org/apache/hadoop/ozone/freon/TestFreonWithDatanodeFastRestart.java b/hadoop-ozone/tools/src/test/java/org/apache/hadoop/ozone/freon/TestFreonWithDatanodeFastRestart.java index 
2b4002e118a60..545f2b36ad8a3 100644 --- a/hadoop-ozone/tools/src/test/java/org/apache/hadoop/ozone/freon/TestFreonWithDatanodeFastRestart.java +++ b/hadoop-ozone/tools/src/test/java/org/apache/hadoop/ozone/freon/TestFreonWithDatanodeFastRestart.java @@ -23,13 +23,6 @@ import org.apache.hadoop.hdds.conf.OzoneConfiguration; import org.apache.hadoop.ozone.MiniOzoneCluster; import org.apache.hadoop.ozone.container.ContainerTestHelper; -import org.apache.hadoop.ozone.container.common.transport - .server.XceiverServerSpi; -import org.apache.hadoop.ozone.container.common.transport.server.ratis - .XceiverServerRatis; -import org.apache.ratis.protocol.RaftGroupId; -import org.apache.ratis.server.impl.RaftServerImpl; -import org.apache.ratis.server.impl.RaftServerProxy; import org.apache.ratis.server.protocol.TermIndex; import org.apache.ratis.statemachine.StateMachine; import org.apache.ratis.statemachine.impl.SimpleStateMachineStorage; From 0d301a01f88b104cc4b6ac2b73bea95c812da07d Mon Sep 17 00:00:00 2001 From: Shashikant Banerjee Date: Mon, 19 Aug 2019 14:40:42 +0530 Subject: [PATCH 4/5] Addressed related test failures --- .../TestContainerStateMachineFailures.java | 73 +++++++------------ 1 file changed, 26 insertions(+), 47 deletions(-) diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/client/rpc/TestContainerStateMachineFailures.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/client/rpc/TestContainerStateMachineFailures.java index cf8e8a7eb54a8..86621d6b16c4d 100644 --- a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/client/rpc/TestContainerStateMachineFailures.java +++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/client/rpc/TestContainerStateMachineFailures.java @@ -22,7 +22,6 @@ import org.apache.hadoop.hdds.client.ReplicationType; import org.apache.hadoop.hdds.conf.OzoneConfiguration; import org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos; -import org.apache.hadoop.hdds.protocol.proto.HddsProtos; import org.apache.hadoop.hdds.scm.XceiverClientManager; import org.apache.hadoop.hdds.scm.XceiverClientSpi; import org.apache.hadoop.hdds.scm.client.HddsClientUtils; @@ -33,7 +32,6 @@ import org.apache.hadoop.ozone.client.ObjectStore; import org.apache.hadoop.ozone.client.OzoneClient; import org.apache.hadoop.ozone.client.OzoneClientFactory; -import org.apache.hadoop.ozone.client.OzoneKeyDetails; import org.apache.hadoop.ozone.client.io.KeyOutputStream; import org.apache.hadoop.ozone.client.io.OzoneOutputStream; import org.apache.hadoop.ozone.container.ContainerTestHelper; @@ -43,7 +41,7 @@ import org.apache.hadoop.ozone.container.common.transport.server.ratis.ContainerStateMachine; import org.apache.hadoop.ozone.container.keyvalue.KeyValueContainerData; import org.apache.hadoop.ozone.container.ozoneimpl.OzoneContainer; -import org.apache.hadoop.ozone.om.helpers.OmKeyArgs; +import org.apache.hadoop.ozone.om.exceptions.OMException; import org.apache.hadoop.ozone.om.helpers.OmKeyLocationInfo; import org.apache.hadoop.test.GenericTestUtils; import org.apache.ratis.protocol.RaftRetryFailureException; @@ -66,7 +64,8 @@ HDDS_COMMAND_STATUS_REPORT_INTERVAL; import static org.apache.hadoop.hdds.HddsConfigKeys. HDDS_CONTAINER_REPORT_INTERVAL; -import static org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos.ContainerDataProto.State.UNHEALTHY; +import static org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos. 
+ ContainerDataProto.State.UNHEALTHY; import static org.apache.hadoop.hdds.scm.ScmConfigKeys. HDDS_SCM_WATCHER_TIMEOUT; import static org.apache.hadoop.hdds.scm.ScmConfigKeys. @@ -113,8 +112,12 @@ public static void init() throws Exception { conf.setTimeDuration(OZONE_SCM_STALENODE_INTERVAL, 3, TimeUnit.SECONDS); conf.setTimeDuration(OZONE_SCM_PIPELINE_DESTROY_TIMEOUT, 10, TimeUnit.SECONDS); - conf.setQuietMode(false); + conf.setInt(OzoneConfigKeys.DFS_RATIS_CLIENT_REQUEST_MAX_RETRIES_KEY, 10); + conf.setTimeDuration( + OzoneConfigKeys.DFS_RATIS_CLIENT_REQUEST_RETRY_INTERVAL_KEY, + 1, TimeUnit.SECONDS); conf.setLong(OzoneConfigKeys.DFS_RATIS_SNAPSHOT_THRESHOLD_KEY, 1); + conf.setQuietMode(false); cluster = MiniOzoneCluster.newBuilder(conf).setNumDatanodes(1).setHbInterval(200) .build(); @@ -146,19 +149,10 @@ public void testContainerStateMachineFailures() throws Exception { .createKey("ratis", 1024, ReplicationType.RATIS, ReplicationFactor.ONE, new HashMap<>()); byte[] testData = "ratis".getBytes(); - long written = 0; // First write and flush creates a container in the datanode key.write(testData); - written += testData.length; key.flush(); key.write(testData); - written += testData.length; - - //get the name of a valid container - OmKeyArgs keyArgs = new OmKeyArgs.Builder().setVolumeName(volumeName). - setBucketName(bucketName).setType(HddsProtos.ReplicationType.RATIS) - .setFactor(HddsProtos.ReplicationFactor.ONE).setKeyName("ratis") - .build(); KeyOutputStream groupOutputStream = (KeyOutputStream) key.getOutputStream(); List locationInfoList = @@ -171,7 +165,14 @@ public void testContainerStateMachineFailures() throws Exception { .getContainer().getContainerSet() .getContainer(omKeyLocationInfo.getContainerID()).getContainerData() .getContainerPath())); - key.close(); + try { + // there is only 1 datanode in the pipeline, the pipeline will be closed + // and allocation to new pipeline will fail as there is no other dn in + // the cluster + key.close(); + } catch(IOException ioe) { + Assert.assertTrue(ioe instanceof OMException); + } long containerID = omKeyLocationInfo.getContainerID(); // Make sure the container is marked unhealthy @@ -193,22 +194,6 @@ public void testContainerStateMachineFailures() throws Exception { .getDatanodeStateMachine().getContainer(); Assert .assertNull(ozoneContainer.getContainerSet().getContainer(containerID)); - - OzoneKeyDetails keyDetails = objectStore.getVolume(volumeName) - .getBucket(bucketName).getKey("ratis"); - - /** - * Ensure length of data stored in key is equal to number of bytes written. - */ - Assert.assertTrue("Number of bytes stored in the key is not equal " + - "to number of bytes written.", keyDetails.getDataSize() == written); - - /** - * Pending data from the second write should get written to a new container - * during key.close() because the first container is UNHEALTHY by that time - */ - Assert.assertTrue("Expect Key to be stored in 2 separate containers", - keyDetails.getOzoneKeyLocations().size() == 2); } @Test @@ -221,12 +206,6 @@ public void testUnhealthyContainer() throws Exception { key.write("ratis".getBytes()); key.flush(); key.write("ratis".getBytes()); - - //get the name of a valid container - OmKeyArgs keyArgs = new OmKeyArgs.Builder().setVolumeName(volumeName). 
- setBucketName(bucketName).setType(HddsProtos.ReplicationType.RATIS) - .setFactor(HddsProtos.ReplicationFactor.ONE).setKeyName("ratis") - .build(); KeyOutputStream groupOutputStream = (KeyOutputStream) key.getOutputStream(); List locationInfoList = groupOutputStream.getLocationInfoList(); @@ -242,8 +221,14 @@ public void testUnhealthyContainer() throws Exception { (KeyValueContainerData) containerData; // delete the container db file FileUtil.fullyDelete(new File(keyValueContainerData.getChunksPath())); - - key.close(); + try { + // there is only 1 datanode in the pipeline, the pipeline will be closed + // and allocation to new pipeline will fail as there is no other dn in + // the cluster + key.close(); + } catch(IOException ioe) { + Assert.assertTrue(ioe instanceof OMException); + } long containerID = omKeyLocationInfo.getContainerID(); @@ -295,12 +280,6 @@ public void testApplyTransactionFailure() throws Exception { key.write("ratis".getBytes()); key.flush(); key.write("ratis".getBytes()); - - //get the name of a valid container - OmKeyArgs keyArgs = new OmKeyArgs.Builder().setVolumeName(volumeName). - setBucketName(bucketName).setType(HddsProtos.ReplicationType.RATIS) - .setFactor(HddsProtos.ReplicationFactor.ONE).setKeyName("ratis") - .build(); KeyOutputStream groupOutputStream = (KeyOutputStream) key.getOutputStream(); List locationInfoList = groupOutputStream.getLocationInfoList(); @@ -316,7 +295,7 @@ public void testApplyTransactionFailure() throws Exception { (KeyValueContainerData) containerData; key.close(); ContainerStateMachine stateMachine = - (ContainerStateMachine)ContainerTestHelper.getStateMachine(cluster); + (ContainerStateMachine) ContainerTestHelper.getStateMachine(cluster); SimpleStateMachineStorage storage = (SimpleStateMachineStorage) stateMachine.getStateMachineStorage(); Path parentPath = storage.findLatestSnapshot().getFile().getPath(); @@ -362,7 +341,7 @@ public void testApplyTransactionFailure() throws Exception { try { // try to take a new snapshot, ideally it should just fail stateMachine.takeSnapshot(); - } catch(IOException ioe) { + } catch (IOException ioe) { Assert.assertTrue(ioe instanceof StateMachineException); } // Make sure the latest snapshot is same as the previous one From d559ba023952b2a99046e10661499d45c55637e3 Mon Sep 17 00:00:00 2001 From: Shashikant Banerjee Date: Mon, 19 Aug 2019 15:04:32 +0530 Subject: [PATCH 5/5] Fix the test failure related to CSMMetrics. --- .../common/transport/server/ratis/ContainerStateMachine.java | 2 -- 1 file changed, 2 deletions(-) diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/ContainerStateMachine.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/ContainerStateMachine.java index e4e704edac7b2..aadec8dcd7ca7 100644 --- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/ContainerStateMachine.java +++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/ContainerStateMachine.java @@ -712,8 +712,6 @@ public CompletableFuture applyTransaction(TransactionContext trx) { isStateMachineHealthy.compareAndSet(true, false); ratisServer.handleApplyTransactionFailure(gid, trx.getServerRole()); } else { - metrics.incNumBytesWrittenCount( - requestProto.getWriteChunk().getChunkData().getLen()); LOG.debug( "gid {} : ApplyTransaction completed. 
cmd {} logIndex {} msg : " + "{} Container Result: {}", gid, r.getCmdType(), index,
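
Illustrative sketch (not part of the patches above). The behavior this series converges on is spread across several commits, so a single standalone summary may help: a failed applyTransaction completes its returned future exceptionally, flips a health flag so the applied index is no longer advanced, and takeSnapshot() refuses to run while unhealthy, so the failed transaction remains in the Raft log and is replayed after restart instead of being hidden behind a newer snapshot. The class and type names below (FailureAwareStateMachineSketch, Result) are hypothetical stand-ins using only JDK types, not the real ContainerStateMachine or Ratis API, and the pipeline-close action triggered through XceiverServerRatis is omitted.

// Simplified sketch of the failure-handling pattern; assumes a command's
// outcome can be reduced to a success flag plus a message.
import java.io.IOException;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.atomic.AtomicBoolean;

public final class FailureAwareStateMachineSketch {

  /** Minimal stand-in for a container command result. */
  static final class Result {
    final boolean success;
    final String message;
    Result(boolean success, String message) {
      this.success = success;
      this.message = message;
    }
  }

  // Once an applyTransaction fails, the state machine is treated as unhealthy:
  // it stops advancing its applied index and rejects further snapshots.
  private final AtomicBoolean isStateMachineHealthy = new AtomicBoolean(true);
  private long lastAppliedIndex = -1;

  /** Failures are surfaced to the caller rather than swallowed. */
  CompletableFuture<Result> applyTransaction(long index, Result commandResult) {
    CompletableFuture<Result> applyFuture = new CompletableFuture<>();
    if (!commandResult.success) {
      // Propagate the failure so the log-applying thread sees it and stops
      // before a later snapshot could mask the failed transaction.
      isStateMachineHealthy.compareAndSet(true, false);
      applyFuture.completeExceptionally(
          new IOException("applyTransaction failed at index " + index
              + ": " + commandResult.message));
    } else {
      // Only a healthy state machine records the transaction as applied.
      if (isStateMachineHealthy.get()) {
        lastAppliedIndex = index;
      }
      applyFuture.complete(commandResult);
    }
    return applyFuture;
  }

  /** Never snapshot past a failed transaction. */
  long takeSnapshot() throws IOException {
    if (!isStateMachineHealthy.get()) {
      throw new IOException("Refusing snapshot: state machine is unhealthy, "
          + "last applied index " + lastAppliedIndex);
    }
    return lastAppliedIndex;
  }

  public static void main(String[] args) {
    FailureAwareStateMachineSketch sm = new FailureAwareStateMachineSketch();
    sm.applyTransaction(1, new Result(true, "ok"));
    sm.applyTransaction(2, new Result(false, "container db missing"))
        .exceptionally(t -> {
          System.out.println("apply failed: " + t.getMessage());
          return null;
        });
    try {
      sm.takeSnapshot();
    } catch (IOException e) {
      System.out.println("snapshot rejected: " + e.getMessage());
    }
  }
}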