Skip to content

Commit

Permalink
HDDS-11120. Rich rebalancing status info (#6911)
Browse files Browse the repository at this point in the history
  • Loading branch information
juncevich authored Jul 22, 2024
1 parent 86c4339 commit a5e420c
Show file tree
Hide file tree
Showing 22 changed files with 849 additions and 28 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
import org.apache.hadoop.hdds.protocol.proto.HddsProtos.DeletedBlocksTransactionInfo;
import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.DecommissionScmResponseProto;
import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.StartContainerBalancerResponseProto;
import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.ContainerBalancerStatusInfoResponseProto;
import org.apache.hadoop.hdds.scm.DatanodeAdminError;
import org.apache.hadoop.hdds.scm.container.ContainerID;
import org.apache.hadoop.hdds.scm.container.ContainerReplicaInfo;
Expand Down Expand Up @@ -384,6 +385,8 @@ StartContainerBalancerResponseProto startContainerBalancer(
*/
boolean getContainerBalancerStatus() throws IOException;

/**
 * Returns detailed status information about the container balancer.
 *
 * @return the status info response; per the proto definition it can carry
 *         an {@code isRunning} flag and an optional
 *         {@code containerBalancerStatusInfo} payload
 * @throws IOException declared by the signature; the concrete failure
 *         conditions depend on the implementation
 */
ContainerBalancerStatusInfoResponseProto getContainerBalancerStatusInfo() throws IOException;

/**
* Returns the list of Ratis peer roles. Currently only includes the peer address.
*/
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.DecommissionScmResponseProto;
import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.StartContainerBalancerResponseProto;
import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.Type;
import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.ContainerBalancerStatusInfoResponseProto;
import org.apache.hadoop.hdds.scm.DatanodeAdminError;
import org.apache.hadoop.hdds.scm.ScmConfig;
import org.apache.hadoop.hdds.scm.ScmInfo;
Expand Down Expand Up @@ -429,6 +430,8 @@ StartContainerBalancerResponseProto startContainerBalancer(
*/
boolean getContainerBalancerStatus() throws IOException;

/**
 * Returns detailed status information about the container balancer.
 *
 * @return the status info response; per the proto definition it can carry
 *         an {@code isRunning} flag and an optional
 *         {@code containerBalancerStatusInfo} payload
 * @throws IOException declared by the signature; the concrete failure
 *         conditions depend on the implementation
 */
ContainerBalancerStatusInfoResponseProto getContainerBalancerStatusInfo() throws IOException;

/**
* Get Datanode usage information by ip or hostname or uuid.
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,8 @@
import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.StopContainerBalancerRequestProto;
import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.ResetDeletedBlockRetryCountRequestProto;
import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.Type;
import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.ContainerBalancerStatusInfoResponseProto;
import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.ContainerBalancerStatusInfoRequestProto;
import org.apache.hadoop.hdds.scm.DatanodeAdminError;
import org.apache.hadoop.hdds.scm.ScmInfo;
import org.apache.hadoop.hdds.scm.container.ContainerID;
Expand Down Expand Up @@ -1025,6 +1027,19 @@ public boolean getContainerBalancerStatus() throws IOException {

}

/**
 * Submits a {@code GetContainerBalancerStatusInfo} request and returns the
 * status info part of the reply.
 *
 * @return the {@code ContainerBalancerStatusInfoResponseProto} extracted
 *         from the response
 * @throws IOException declared by the signature (presumably surfaced by
 *         {@code submitRequest} - confirm against its definition)
 */
@Override
public ContainerBalancerStatusInfoResponseProto getContainerBalancerStatusInfo() throws IOException {
  // No request field is populated here, so the shared default instance is used.
  final ContainerBalancerStatusInfoRequestProto emptyRequest =
      ContainerBalancerStatusInfoRequestProto.getDefaultInstance();

  return submitRequest(
      Type.GetContainerBalancerStatusInfo,
      requestBuilder -> requestBuilder.setContainerBalancerStatusInfoRequest(emptyRequest))
      .getContainerBalancerStatusInfoResponse();
}

/**
* Builds request for datanode usage information and receives response.
*
Expand Down
35 changes: 35 additions & 0 deletions hadoop-hdds/interface-admin/src/main/proto/ScmAdminProtocol.proto
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,7 @@ message ScmContainerLocationRequest {
optional SingleNodeQueryRequestProto singleNodeQueryRequest = 45;
optional GetContainersOnDecomNodeRequestProto getContainersOnDecomNodeRequest = 46;
optional GetMetricsRequestProto getMetricsRequest = 47;
optional ContainerBalancerStatusInfoRequestProto containerBalancerStatusInfoRequest = 48;
}

message ScmContainerLocationResponse {
Expand Down Expand Up @@ -139,6 +140,7 @@ message ScmContainerLocationResponse {
optional SingleNodeQueryResponseProto singleNodeQueryResponse = 45;
optional GetContainersOnDecomNodeResponseProto getContainersOnDecomNodeResponse = 46;
optional GetMetricsResponseProto getMetricsResponse = 47;
optional ContainerBalancerStatusInfoResponseProto containerBalancerStatusInfoResponse = 48;

enum Status {
OK = 1;
Expand Down Expand Up @@ -193,6 +195,7 @@ enum Type {
SingleNodeQuery = 41;
GetContainersOnDecomNode = 42;
GetMetrics = 43;
GetContainerBalancerStatusInfo = 44;
}

/**
Expand Down Expand Up @@ -607,6 +610,38 @@ message ContainerBalancerStatusResponseProto {
required bool isRunning = 1;
}

// Request for the detailed container balancer status. It has no parameters
// other than an optional trace identifier.
message ContainerBalancerStatusInfoRequestProto {
optional string traceID = 1;
}

// Response carrying the detailed container balancer status.
message ContainerBalancerStatusInfoResponseProto {
// Whether the balancer is running.
optional bool isRunning = 1;
// Detailed status payload. NOTE(review): presumably left unset when the
// balancer is not running - confirm against the server-side handler.
optional ContainerBalancerStatusInfo containerBalancerStatusInfo = 2;
}
// Details of a balancer run: when it started, the configuration in use and
// the status of its iterations.
message ContainerBalancerStatusInfo {
// NOTE(review): the unit/encoding of this timestamp is not defined here
// (presumably epoch-based) - confirm against the code that populates it.
optional uint64 startedAt = 1;
// Balancer configuration associated with this status.
optional ContainerBalancerConfigurationProto configuration = 2;
// Zero or more per-iteration status entries.
repeated ContainerBalancerTaskIterationStatusInfo iterationsStatusInfo = 3;
}

// Status of a single balancer iteration. Fields with the "GB" suffix are
// presumably expressed in gigabytes - confirm against the code that fills
// them in.
message ContainerBalancerTaskIterationStatusInfo {
// Number identifying the iteration.
optional int32 iterationNumber = 1;
// Iteration result as free-form text. NOTE(review): the set of possible
// values is not defined in this message - confirm against the producer.
optional string iterationResult = 2;
// Data size scheduled for move / actually moved.
optional int64 sizeScheduledForMoveGB = 3;
optional int64 dataSizeMovedGB = 4;
// Container move counters: scheduled, completed, failed and timed out.
optional int64 containerMovesScheduled = 5;
optional int64 containerMovesCompleted = 6;
optional int64 containerMovesFailed = 7;
optional int64 containerMovesTimeout = 8;
// Per-node data volume entering / leaving nodes.
repeated NodeTransferInfo sizeEnteringNodesGB = 9;
repeated NodeTransferInfo sizeLeavingNodesGB = 10;
}

// Amount of data associated with a single node, used for the per-node
// entering/leaving lists of an iteration status.
message NodeTransferInfo {
// Node identifier (presumably the datanode UUID - confirm against the producer).
optional string uuid = 1;
// Data volume for this node; the "GB" suffix suggests gigabytes.
optional int64 dataVolumeGB = 2;
}

message DecommissionScmRequestProto {
required string scmId = 1;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -279,4 +279,8 @@ NodeManager getNodeManager() {
return nodeManager;
}

/**
 * Returns the {@code sizeEnteringNode} map, keyed by datanode.
 *
 * @return the map held in the {@code sizeEnteringNode} field
 */
@Override
public Map<DatanodeDetails, Long> getSizeEnteringNodes() {
// NOTE(review): the field itself is returned, not a copy, so callers share
// this object's map instance - confirm that exposing it is intended.
return sizeEnteringNode;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.time.OffsetDateTime;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.locks.ReentrantLock;

Expand All @@ -53,6 +54,7 @@ public class ContainerBalancer extends StatefulService {
private volatile Thread currentBalancingThread;
private volatile ContainerBalancerTask task = null;
private ReentrantLock lock;
private OffsetDateTime startedAt;

/**
* Constructs ContainerBalancer with the specified arguments. Initializes
Expand Down Expand Up @@ -175,6 +177,24 @@ public ContainerBalancerTask.Status getBalancerStatus() {
: ContainerBalancerTask.Status.STOPPED;
}

/**
 * Builds a status snapshot for the balancer: its start time, its
 * configuration and the statistics of the current task's iterations.
 *
 * @return the balancer status info, or {@code null} when the balancer is
 *         not running
 * @throws IOException declared by the signature (presumably raised while
 *         reading the configuration - confirm against
 *         {@code readConfiguration})
 */
public ContainerBalancerStatusInfo getBalancerStatusInfo() throws IOException {
  if (!isBalancerRunning()) {
    return null;
  }

  final ContainerBalancerConfigurationProto storedConfig =
      readConfiguration(ContainerBalancerConfigurationProto.class);
  return new ContainerBalancerStatusInfo(startedAt, storedConfig, task.getCurrentIterationsStatistic());
}
/**
* Checks if ContainerBalancer is in valid state to call stop.
*
Expand Down Expand Up @@ -204,6 +224,7 @@ public String getServiceName() {
@Override
public void start() throws IllegalContainerBalancerStateException,
InvalidContainerBalancerConfigurationException {
startedAt = OffsetDateTime.now();
lock.lock();
try {
// should be leader-ready, out of safe mode, and not running already
Expand Down Expand Up @@ -251,6 +272,7 @@ public void start() throws IllegalContainerBalancerStateException,
public void startBalancer(ContainerBalancerConfiguration configuration)
throws IllegalContainerBalancerStateException,
InvalidContainerBalancerConfigurationException, IOException {
startedAt = OffsetDateTime.now();
lock.lock();
try {
// validates state, config, and then saves config
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -453,7 +453,7 @@ public String toString() {
excludeNodes.equals("") ? "None" : excludeNodes);
}

ContainerBalancerConfigurationProto.Builder toProtobufBuilder() {
public ContainerBalancerConfigurationProto.Builder toProtobufBuilder() {
ContainerBalancerConfigurationProto.Builder builder =
ContainerBalancerConfigurationProto.newBuilder();

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,11 @@ void incrementNumContainerMovesScheduledInLatestIteration(long valueToAdd) {
this.numContainerMovesScheduledInLatestIteration.incr(valueToAdd);
}

/**
 * Resets the count of container moves scheduled in the latest iteration by
 * incrementing the counter with the negation of its current value.
 * NOTE(review): the read and the increment are two separate steps - confirm
 * that no concurrent increments can happen in between.
 */
public void resetNumContainerMovesScheduledInLatestIteration() {
  final long scheduledSoFar = getNumContainerMovesScheduledInLatestIteration();
  numContainerMovesScheduledInLatestIteration.incr(-scheduledSoFar);
}

/**
* Gets the amount of data moved by Container Balancer in the latest
* iteration.
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
* <p>
* http://www.apache.org/licenses/LICENSE-2.0
* <p>
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.hadoop.hdds.scm.container.balancer;

import org.apache.hadoop.hdds.protocol.proto.HddsProtos;

import java.time.OffsetDateTime;
import java.util.List;

/**
 * Holder for container balancer status: the start time, the balancer
 * configuration and the per-iteration status entries.
 *
 * <p>The values are stored exactly as handed to the constructor; no copies
 * are made.
 */
public class ContainerBalancerStatusInfo {
  private final OffsetDateTime startedAt;
  private final HddsProtos.ContainerBalancerConfigurationProto configuration;
  private final List<ContainerBalancerTaskIterationStatusInfo> iterationsStatusInfo;

  /**
   * Creates a status info object.
   *
   * @param startTime when the balancer was started
   * @param config balancer configuration in protobuf form
   * @param iterations status entries of the balancer iterations
   */
  public ContainerBalancerStatusInfo(
      OffsetDateTime startTime,
      HddsProtos.ContainerBalancerConfigurationProto config,
      List<ContainerBalancerTaskIterationStatusInfo> iterations) {
    startedAt = startTime;
    configuration = config;
    iterationsStatusInfo = iterations;
  }

  /**
   * @return the start time given at construction
   */
  public OffsetDateTime getStartedAt() {
    return this.startedAt;
  }

  /**
   * @return the balancer configuration given at construction
   */
  public HddsProtos.ContainerBalancerConfigurationProto getConfiguration() {
    return this.configuration;
  }

  /**
   * @return the iteration status list given at construction (same instance)
   */
  public List<ContainerBalancerTaskIterationStatusInfo> getIterationsStatusInfo() {
    return this.iterationsStatusInfo;
  }
}
Loading

0 comments on commit a5e420c

Please sign in to comment.