Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

HDDS-10231. ContainerStateManager should not finalize the OPEN containers without a Pipeline. #6123

Merged
merged 3 commits into from
Jan 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,10 @@ public enum HealthState {
"OpenUnhealthyContainers"),
QUASI_CLOSED_STUCK(
"Containers QuasiClosed with insufficient datanode origins",
"StuckQuasiClosedContainers");
"StuckQuasiClosedContainers"),
OPEN_WITHOUT_PIPELINE(
"Containers in OPEN state without any healthy Pipeline",
"OpenContainersWithoutPipeline");

private String description;
private String metricName;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,7 @@ void testJsonOutput() throws IOException {
assertEquals(0, stats.get("EMPTY").longValue());
assertEquals(0, stats.get("OPEN_UNHEALTHY").longValue());
assertEquals(0, stats.get("QUASI_CLOSED_STUCK").longValue());
assertEquals(0, stats.get("OPEN_WITHOUT_PIPELINE").longValue());

JsonNode samples = json.get("samples");
assertEquals(ARRAY, samples.get("UNDER_REPLICATED").getNodeType());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -251,16 +251,12 @@ private void initialize() throws IOException {
pipelineManager.addContainerToPipelineSCMStart(
container.getPipelineID(), container.containerID());
} catch (PipelineNotFoundException ex) {
// We are ignoring this here. The container will be moved to
// CLOSING state by ReplicationManager's OpenContainerHandler
// For more info: HDDS-10231
LOG.warn("Found container {} which is in OPEN state with " +
"pipeline {} that does not exist. Marking container for " +
"closing.", container, container.getPipelineID());
try {
updateContainerState(container.containerID().getProtobuf(),
LifeCycleEvent.FINALIZE);
} catch (InvalidStateTransitionException e) {
// This cannot happen.
LOG.warn("Unable to finalize Container {}.", container);
}
}
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@
import org.apache.hadoop.hdds.scm.node.NodeManager;
import org.apache.hadoop.hdds.scm.node.NodeStatus;
import org.apache.hadoop.hdds.scm.node.states.NodeNotFoundException;
import org.apache.hadoop.hdds.scm.pipeline.PipelineNotFoundException;
import org.apache.hadoop.hdds.scm.server.StorageContainerManager;
import org.apache.hadoop.hdds.server.events.EventPublisher;
import org.apache.hadoop.ozone.common.statemachine.InvalidStateTransitionException;
Expand Down Expand Up @@ -1545,5 +1546,14 @@ private int getRemainingMaintenanceRedundancy(boolean isEC) {
private static boolean isEC(ReplicationConfig replicationConfig) {
return replicationConfig.getReplicationType() == EC;
}

public boolean hasHealthyPipeline(ContainerInfo container) {
try {
return scmContext.getScm().getPipelineManager()
.getPipeline(container.getPipelineID()) != null;
} catch (PipelineNotFoundException e) {
return false;
}
}
}

Original file line number Diff line number Diff line change
Expand Up @@ -53,20 +53,26 @@ public boolean handle(ContainerCheckRequest request) {
if (containerInfo.getState() == HddsProtos.LifeCycleState.OPEN) {
LOG.debug("Checking open container {} in OpenContainerHandler",
containerInfo);
if (!isOpenContainerHealthy(
containerInfo, request.getContainerReplicas())) {
// This is an unhealthy open container, so we need to trigger the
// close process on it.
LOG.debug("Container {} is open but unhealthy. Triggering close.",
containerInfo);
request.getReport().incrementAndSample(
ReplicationManagerReport.HealthState.OPEN_UNHEALTHY,
final boolean noPipeline = !replicationManager.hasHealthyPipeline(containerInfo);
// Minor optimization. If noPipeline is true, isOpenContainerHealthy will not
// be called.
final boolean unhealthy = noPipeline || !isOpenContainerHealthy(containerInfo,
request.getContainerReplicas());
if (unhealthy) {
// For an OPEN container, we close the container
// if the container has no Pipeline or if the container is unhealthy.
LOG.info("Container {} is open but {}. Triggering close.",
containerInfo, noPipeline ? "has no Pipeline" : "unhealthy");

request.getReport().incrementAndSample(noPipeline ?
ReplicationManagerReport.HealthState.OPEN_WITHOUT_PIPELINE :
ReplicationManagerReport.HealthState.OPEN_UNHEALTHY,
containerInfo.containerID());

if (!request.isReadOnly()) {
replicationManager
.sendCloseContainerEvent(containerInfo.containerID());
}
return true;
}
// For open containers we do not want to do any further processing in RM
// so return true to stop the command chain.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
import org.apache.hadoop.hdds.protocol.proto.HddsProtos;
import org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos.ContainerReplicaProto;
import org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos.SCMCommandProto;
import org.apache.hadoop.hdds.scm.HddsTestUtils;
import org.apache.hadoop.hdds.scm.PlacementPolicy;
import org.apache.hadoop.hdds.scm.container.ContainerID;
import org.apache.hadoop.hdds.scm.container.ContainerInfo;
Expand All @@ -43,6 +44,9 @@
import org.apache.hadoop.hdds.scm.node.NodeManager;
import org.apache.hadoop.hdds.scm.node.NodeStatus;
import org.apache.hadoop.hdds.scm.node.states.NodeNotFoundException;
import org.apache.hadoop.hdds.scm.pipeline.PipelineManager;
import org.apache.hadoop.hdds.scm.server.StorageContainerManager;
import org.apache.hadoop.hdds.security.token.ContainerTokenGenerator;
import org.apache.hadoop.hdds.server.events.EventPublisher;
import org.apache.hadoop.ozone.protocol.commands.DeleteContainerCommand;
import org.apache.hadoop.ozone.protocol.commands.ReconstructECContainersCommand;
Expand Down Expand Up @@ -174,6 +178,16 @@ public void setup() throws IOException {
// Ensure that RM will run when asked.
when(scmContext.isLeaderReady()).thenReturn(true);
when(scmContext.isInSafeMode()).thenReturn(false);

PipelineManager pipelineManager = mock(PipelineManager.class);
when(pipelineManager.getPipeline(any()))
.thenReturn(HddsTestUtils.getRandomPipeline());

StorageContainerManager scm = mock(StorageContainerManager.class);
when(scm.getPipelineManager()).thenReturn(pipelineManager);
when(scm.getContainerTokenGenerator()).thenReturn(ContainerTokenGenerator.DISABLED);

when(scmContext.getScm()).thenReturn(scm);
}

private ReplicationManager createReplicationManager() throws IOException {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,17 +24,20 @@
import org.apache.hadoop.hdds.scm.container.ContainerInfo;
import org.apache.hadoop.hdds.scm.container.ContainerReplica;
import org.apache.hadoop.hdds.scm.container.ReplicationManagerReport;
import org.apache.hadoop.hdds.scm.container.ReplicationManagerReport.HealthState;
import org.apache.hadoop.hdds.scm.container.replication.ContainerCheckRequest;
import org.apache.hadoop.hdds.scm.container.replication.ReplicationManager;
import org.apache.hadoop.hdds.scm.container.replication.ReplicationTestUtil;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.mockito.Mockito;

import java.util.Collections;
import java.util.Set;

import static org.apache.hadoop.hdds.protocol.proto.HddsProtos.LifeCycleState.CLOSED;
import static org.apache.hadoop.hdds.protocol.proto.HddsProtos.LifeCycleState.OPEN;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.mockito.Mockito.mock;
Expand All @@ -58,6 +61,7 @@ public void setup() {
ratisReplicationConfig = RatisReplicationConfig.getInstance(
HddsProtos.ReplicationFactor.THREE);
replicationManager = mock(ReplicationManager.class);
Mockito.when(replicationManager.hasHealthyPipeline(any())).thenReturn(true);
openContainerHandler = new OpenContainerHandler(replicationManager);
}

Expand Down Expand Up @@ -119,8 +123,36 @@ public void testOpenUnhealthyContainerIsClosed() {
assertTrue(openContainerHandler.handle(readRequest));
verify(replicationManager, times(1))
.sendCloseContainerEvent(containerInfo.containerID());
assertEquals(1, request.getReport().getStat(HealthState.OPEN_UNHEALTHY));
}

@Test
public void testOpenContainerWithoutPipelineIsClosed() {
Mockito.when(replicationManager.hasHealthyPipeline(any())).thenReturn(false);
ContainerInfo containerInfo = ReplicationTestUtil.createContainerInfo(
ecReplicationConfig, 1, OPEN);
Set<ContainerReplica> containerReplicas = ReplicationTestUtil
.createReplicas(containerInfo.containerID(),
ContainerReplicaProto.State.OPEN, 1, 2, 3, 4);
ContainerCheckRequest request = new ContainerCheckRequest.Builder()
.setPendingOps(Collections.emptyList())
.setReport(new ReplicationManagerReport())
.setContainerInfo(containerInfo)
.setContainerReplicas(containerReplicas)
.build();
ContainerCheckRequest readRequest = new ContainerCheckRequest.Builder()
.setPendingOps(Collections.emptyList())
.setReport(new ReplicationManagerReport())
.setContainerInfo(containerInfo)
.setContainerReplicas(containerReplicas)
.setReadOnly(true)
.build();
assertTrue(openContainerHandler.handle(request));
assertTrue(openContainerHandler.handle(readRequest));
verify(replicationManager, times(1))
.sendCloseContainerEvent(containerInfo.containerID());
nandakumar131 marked this conversation as resolved.
Show resolved Hide resolved
assertEquals(1, request.getReport().getStat(HealthState.OPEN_WITHOUT_PIPELINE));
}
@Test
public void testClosedRatisContainerReturnsFalse() {
ContainerInfo containerInfo = ReplicationTestUtil.createContainerInfo(
Expand Down Expand Up @@ -178,5 +210,33 @@ public void testOpenUnhealthyRatisContainerIsClosed() {
assertTrue(openContainerHandler.handle(request));
assertTrue(openContainerHandler.handle(readRequest));
verify(replicationManager, times(1)).sendCloseContainerEvent(any());
assertEquals(1, request.getReport().getStat(HealthState.OPEN_UNHEALTHY));
}

@Test
public void testOpenRatisContainerWithoutPipelineIsClosed() {
Mockito.when(replicationManager.hasHealthyPipeline(any())).thenReturn(false);
ContainerInfo containerInfo = ReplicationTestUtil.createContainerInfo(
ratisReplicationConfig, 1, OPEN);
Set<ContainerReplica> containerReplicas = ReplicationTestUtil
.createReplicas(containerInfo.containerID(),
ContainerReplicaProto.State.OPEN, 0, 0, 0);
ContainerCheckRequest request = new ContainerCheckRequest.Builder()
.setPendingOps(Collections.emptyList())
.setReport(new ReplicationManagerReport())
.setContainerInfo(containerInfo)
.setContainerReplicas(containerReplicas)
.build();
ContainerCheckRequest readRequest = new ContainerCheckRequest.Builder()
.setPendingOps(Collections.emptyList())
.setReport(new ReplicationManagerReport())
.setContainerInfo(containerInfo)
.setContainerReplicas(containerReplicas)
.setReadOnly(true)
.build();
assertTrue(openContainerHandler.handle(request));
assertTrue(openContainerHandler.handle(readRequest));
verify(replicationManager, times(1)).sendCloseContainerEvent(any());
nandakumar131 marked this conversation as resolved.
Show resolved Hide resolved
assertEquals(1, request.getReport().getStat(HealthState.OPEN_WITHOUT_PIPELINE));
}
}