Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

HDDS-11120. Rich rebalancing status info #6911

Merged
merged 27 commits into from
Jul 22, 2024
Merged
Show file tree
Hide file tree
Changes from 17 commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
5bf295e
Init commit
juncevich Jul 8, 2024
9dafda8
HDDS-11120. Fix compilation
juncevich Jul 9, 2024
8d78b36
HDDS-11120. Fix checkstyle errors
juncevich Jul 9, 2024
04fb871
HDDS-11120. Fix findbugs errors
juncevich Jul 9, 2024
b25c6c0
HDDS-11120. Fix findbugs errors
juncevich Jul 9, 2024
648dfde
HDDS-11120. Fix rat check
juncevich Jul 9, 2024
7b3eb0c
HDDS-11120. Try to fix robot balancer test
juncevich Jul 9, 2024
a0e92f1
HDDS-11120. Try to fix robot balancer test
juncevich Jul 9, 2024
2f0b05e
HDDS-11120. Try to fix robot balancer test
juncevich Jul 10, 2024
a9b9864
HDDS-11120. Try to fix robot balancer test
juncevich Jul 10, 2024
1dda8b3
HDDS-11120. Try to fix robot balancer test
juncevich Jul 10, 2024
0bbe031
HDDS-11120. Fix checkstyle
juncevich Jul 10, 2024
20cf496
HDDS-11120. Try to fix robot balancer test
juncevich Jul 10, 2024
947ae3b
HDDS-11120. Try to fix robot balancer test
juncevich Jul 10, 2024
e7cd45b
HDDS-11120. TFix test + refactoring + modify balancer robot test
juncevich Jul 10, 2024
7c035be
HDDS-11120. Fix formatting.
juncevich Jul 10, 2024
b6ea9a5
HDDS-11120. Fix balancing test.
juncevich Jul 11, 2024
68b12b1
HDDS-11120. Fix review notices.
juncevich Jul 14, 2024
1538ee9
HDDS-11120. Fix review notices.
juncevich Jul 14, 2024
04bb57e
HDDS-11120. Fix testBalancer.
juncevich Jul 14, 2024
4a39571
HDDS-11120. Change -h command description.
juncevich Jul 15, 2024
2207557
HDDS-11120. Fix review notices.
juncevich Jul 17, 2024
daa7da4
HDDS-11120. Change test name.
juncevich Jul 17, 2024
9b51a85
HDDS-11120. Remove testBalancerStatus test.
juncevich Jul 17, 2024
ffafee0
HDDS-11120. Fix testBalancer test.
juncevich Jul 17, 2024
fe663c1
Empty commit message for restart pr checker
juncevich Jul 18, 2024
b59431b
HDDS-11120. Fix review notices.
juncevich Jul 22, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
import org.apache.hadoop.hdds.protocol.proto.HddsProtos.DeletedBlocksTransactionInfo;
import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.DecommissionScmResponseProto;
import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.StartContainerBalancerResponseProto;
import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.ContainerBalancerStatusInfoResponseProto;
import org.apache.hadoop.hdds.scm.DatanodeAdminError;
import org.apache.hadoop.hdds.scm.container.ContainerID;
import org.apache.hadoop.hdds.scm.container.ContainerReplicaInfo;
Expand Down Expand Up @@ -384,6 +385,8 @@ StartContainerBalancerResponseProto startContainerBalancer(
*/
boolean getContainerBalancerStatus() throws IOException;

/**
 * Get detailed status information about the Container Balancer.
 *
 * @return response proto carrying the {@code isRunning} flag and, when
 * present, a {@code ContainerBalancerStatusInfo} message
 * @throws IOException declared by this operation; the exact failure
 * conditions depend on the implementation
 */
ContainerBalancerStatusInfoResponseProto getContainerBalancerStatusInfo() throws IOException;

/**
* returns the list of ratis peer roles. Currently only include peer address.
*/
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.DecommissionScmResponseProto;
import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.StartContainerBalancerResponseProto;
import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.Type;
import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.ContainerBalancerStatusInfoResponseProto;
import org.apache.hadoop.hdds.scm.DatanodeAdminError;
import org.apache.hadoop.hdds.scm.ScmConfig;
import org.apache.hadoop.hdds.scm.ScmInfo;
Expand Down Expand Up @@ -429,6 +430,8 @@ StartContainerBalancerResponseProto startContainerBalancer(
*/
boolean getContainerBalancerStatus() throws IOException;

/**
 * Get detailed status information about the Container Balancer.
 *
 * @return response proto carrying the {@code isRunning} flag and, when
 * present, a {@code ContainerBalancerStatusInfo} message
 * @throws IOException declared by this operation; the exact failure
 * conditions depend on the implementation
 */
ContainerBalancerStatusInfoResponseProto getContainerBalancerStatusInfo() throws IOException;

/**
* Get Datanode usage information by ip or hostname or uuid.
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,8 @@
import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.StopContainerBalancerRequestProto;
import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.ResetDeletedBlockRetryCountRequestProto;
import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.Type;
import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.ContainerBalancerStatusInfoResponseProto;
import org.apache.hadoop.hdds.protocol.proto.StorageContainerLocationProtocolProtos.ContainerBalancerStatusInfoRequestProto;
import org.apache.hadoop.hdds.scm.DatanodeAdminError;
import org.apache.hadoop.hdds.scm.ScmInfo;
import org.apache.hadoop.hdds.scm.container.ContainerID;
Expand Down Expand Up @@ -1025,6 +1027,19 @@ public boolean getContainerBalancerStatus() throws IOException {

}

@Override
public ContainerBalancerStatusInfoResponseProto getContainerBalancerStatusInfo() throws IOException {
  // No request fields are populated; the default instance is sent as-is.
  final ContainerBalancerStatusInfoRequestProto statusInfoRequest =
      ContainerBalancerStatusInfoRequestProto.getDefaultInstance();

  // Submit the typed request and unwrap the matching response payload.
  return submitRequest(
      Type.GetContainerBalancerStatusInfo,
      requestBuilder -> requestBuilder.setContainerBalancerStatusInfoRequest(statusInfoRequest))
      .getContainerBalancerStatusInfoResponse();
}

/**
* Builds request for datanode usage information and receives response.
*
Expand Down
35 changes: 35 additions & 0 deletions hadoop-hdds/interface-admin/src/main/proto/ScmAdminProtocol.proto
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,7 @@ message ScmContainerLocationRequest {
optional SingleNodeQueryRequestProto singleNodeQueryRequest = 45;
optional GetContainersOnDecomNodeRequestProto getContainersOnDecomNodeRequest = 46;
optional GetMetricsRequestProto getMetricsRequest = 47;
optional ContainerBalancerStatusInfoRequestProto containerBalancerStatusInfoRequest = 48;
}

message ScmContainerLocationResponse {
Expand Down Expand Up @@ -139,6 +140,7 @@ message ScmContainerLocationResponse {
optional SingleNodeQueryResponseProto singleNodeQueryResponse = 45;
optional GetContainersOnDecomNodeResponseProto getContainersOnDecomNodeResponse = 46;
optional GetMetricsResponseProto getMetricsResponse = 47;
optional ContainerBalancerStatusInfoResponseProto containerBalancerStatusInfoResponse = 48;

enum Status {
OK = 1;
Expand Down Expand Up @@ -193,6 +195,7 @@ enum Type {
SingleNodeQuery = 41;
GetContainersOnDecomNode = 42;
GetMetrics = 43;
GetContainerBalancerStatusInfo = 44;
}

/**
Expand Down Expand Up @@ -607,6 +610,38 @@ message ContainerBalancerStatusResponseProto {
required bool isRunning = 1;
}

// Request for detailed container balancer status information.
// Carries only an optional trace identifier.
message ContainerBalancerStatusInfoRequestProto {
optional string traceID = 1;
}

// Response for detailed container balancer status information.
message ContainerBalancerStatusInfoResponseProto {
  // Whether the balancer is running; kept `required` to match
  // ContainerBalancerStatusResponseProto.isRunning.
  required bool isRunning = 1;
  // Detailed status; optional, so it may be absent from a response.
  optional ContainerBalancerStatusInfo containerBalancerStatusInfo = 2;
}
// Detailed status of the container balancer.
// Fields are `optional` rather than `required`: a proto2 `required` field can
// never be relaxed or removed later without breaking older readers.
message ContainerBalancerStatusInfo {
  // Balancer start time. NOTE(review): units/epoch are not defined here —
  // confirm against the code that fills this field.
  optional uint64 startedAt = 1;
  // Configuration the balancer is using.
  optional ContainerBalancerConfigurationProto configuration = 2;
  // Per-iteration status entries.
  repeated ContainerBalancerTaskIterationStatusInfo iterationsStatusInfo = 3;
}

// Status of a single container balancer iteration.
// Fields are `optional` rather than `required`: a proto2 `required` field can
// never be relaxed or removed later without breaking older readers.
message ContainerBalancerTaskIterationStatusInfo {
  optional int32 iterationNumber = 1;
  // Result of the iteration as text; may be absent.
  optional string iterationResult = 2;
  // NOTE(review): the name carries no unit while dataSizeMovedGB does —
  // confirm both fields use the same unit.
  optional int64 sizeScheduledForMove = 3;
  optional int64 dataSizeMovedGB = 4;
  optional int64 containerMovesScheduled = 5;
  optional int64 containerMovesCompleted = 6;
  optional int64 containerMovesFailed = 7;
  optional int64 containerMovesTimeout = 8;
  // Per-node transfer entries for data entering / leaving nodes.
  repeated NodeTransferInfo sizeEnteringNodes = 9;
  repeated NodeTransferInfo sizeLeavingNodes = 10;
}

// Data volume associated with one node, identified by uuid.
// Fields are `optional` rather than `required`: a proto2 `required` field can
// never be relaxed or removed later without breaking older readers.
message NodeTransferInfo {
  optional string uuid = 1;
  // NOTE(review): unit of dataVolume is not defined here — confirm with the producer.
  optional int64 dataVolume = 2;
}

message DecommissionScmRequestProto {
required string scmId = 1;
}
Expand Down
44 changes: 44 additions & 0 deletions hadoop-hdds/interface-admin/src/main/resources/proto.lock
juncevich marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
Expand Up @@ -187,6 +187,10 @@
{
"name": "DecommissionScm",
"integer": 40
},
{
"name": "GetContainerBalancerStatusInfo",
"integer": 44
}
]
},
Expand Down Expand Up @@ -497,6 +501,12 @@
"name": "decommissionScmRequest",
"type": "DecommissionScmRequestProto",
"optional": true
},
{
"id": 48,
"name": "containerBalancerStatusInfoRequest",
"type": "ContainerBalancerStatusInfoRequestProto",
"optional": true
}
]
},
Expand Down Expand Up @@ -772,6 +782,12 @@
"name": "decommissionScmResponse",
"type": "DecommissionScmResponseProto",
"optional": true
},
{
"id": 48,
"name": "containerBalancerStatusInfoResponse",
"type": "ContainerBalancerStatusInfoResponseProto",
"optional": true
}
]
},
Expand Down Expand Up @@ -1915,6 +1931,34 @@
"optional": true
}
]
},
{
"name": "ContainerBalancerStatusInfoRequestProto",
"fields": [
{
"id": 1,
"name": "traceID",
"type": "string",
"optional": true
}
]
},
{
"name": "ContainerBalancerStatusInfoResponseProto",
"fields": [
{
"id": 1,
"name": "isRunning",
"type": "bool",
"required": true
},
{
"id": 2,
"name": "containerBalancerStatusInfo",
"type": "ContainerBalancerStatusInfo",
"optional": true
}
]
}
],
"services": [
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -279,4 +279,8 @@ NodeManager getNodeManager() {
return nodeManager;
}

/**
 * Returns the map of datanode to size entering that node.
 * The backing {@code sizeEnteringNode} map itself is returned, not a copy.
 *
 * @return the {@code sizeEnteringNode} map
 */
@Override
public Map<DatanodeDetails, Long> getSizeEnteringNodes() {
return sizeEnteringNode;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
import org.apache.hadoop.hdds.conf.OzoneConfiguration;
import org.apache.hadoop.hdds.conf.StorageUnit;
import org.apache.hadoop.hdds.fs.DUFactory;
import org.apache.hadoop.hdds.protocol.proto.HddsProtos;
import org.apache.hadoop.hdds.protocol.proto.HddsProtos.ContainerBalancerConfigurationProto;
import org.apache.hadoop.hdds.scm.ScmConfigKeys;
import org.apache.hadoop.hdds.scm.ha.SCMContext;
Expand All @@ -31,6 +32,7 @@
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.time.OffsetDateTime;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.locks.ReentrantLock;

Expand All @@ -53,6 +55,7 @@ public class ContainerBalancer extends StatefulService {
private volatile Thread currentBalancingThread;
private volatile ContainerBalancerTask task = null;
private ReentrantLock lock;
private OffsetDateTime startedAt;

/**
* Constructs ContainerBalancer with the specified arguments. Initializes
Expand Down Expand Up @@ -175,6 +178,26 @@ public ContainerBalancerTask.Status getBalancerStatus() {
: ContainerBalancerTask.Status.STOPPED;
}

/**
* Get balancer status info.
*
* @return balancer status info if balancer started
*/
public ContainerBalancerStatusInfo getBalancerStatusInfo() throws IOException {
boolean isTaskRunning = task != null && task.getBalancerStatus() == ContainerBalancerTask.Status.RUNNING;
ivandika3 marked this conversation as resolved.
Show resolved Hide resolved
if (isTaskRunning) {
HddsProtos.ContainerBalancerConfigurationProto configProto =
readConfiguration(HddsProtos.ContainerBalancerConfigurationProto.class);
ivandika3 marked this conversation as resolved.
Show resolved Hide resolved
return new ContainerBalancerStatusInfo(
this.startedAt,
configProto,
task.getCurrentIterationsStatistic()
);
} else {
return null;
}

}
/**
* Checks if ContainerBalancer is in valid state to call stop.
*
Expand Down Expand Up @@ -204,6 +227,7 @@ public String getServiceName() {
@Override
public void start() throws IllegalContainerBalancerStateException,
InvalidContainerBalancerConfigurationException {
startedAt = OffsetDateTime.now();
lock.lock();
try {
// should be leader-ready, out of safe mode, and not running already
Expand Down Expand Up @@ -251,6 +275,7 @@ public void start() throws IllegalContainerBalancerStateException,
public void startBalancer(ContainerBalancerConfiguration configuration)
throws IllegalContainerBalancerStateException,
InvalidContainerBalancerConfigurationException, IOException {
startedAt = OffsetDateTime.now();
lock.lock();
try {
// validates state, config, and then saves config
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -453,7 +453,7 @@ public String toString() {
excludeNodes.equals("") ? "None" : excludeNodes);
}

ContainerBalancerConfigurationProto.Builder toProtobufBuilder() {
public ContainerBalancerConfigurationProto.Builder toProtobufBuilder() {
ContainerBalancerConfigurationProto.Builder builder =
ContainerBalancerConfigurationProto.newBuilder();

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,11 @@ void incrementNumContainerMovesScheduledInLatestIteration(long valueToAdd) {
this.numContainerMovesScheduledInLatestIteration.incr(valueToAdd);
}

/**
 * Brings the count of container moves scheduled in the latest iteration
 * back to zero by incrementing the counter with the negation of its
 * current value.
 */
public void resetNumContainerMovesScheduledInLatestIteration() {
  final long scheduledSoFar = getNumContainerMovesScheduledInLatestIteration();
  this.numContainerMovesScheduledInLatestIteration.incr(-scheduledSoFar);
}

/**
* Gets the amount of data moved by Container Balancer in the latest
* iteration.
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
* <p>
* http://www.apache.org/licenses/LICENSE-2.0
* <p>
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.hadoop.hdds.scm.container.balancer;

import org.apache.hadoop.hdds.protocol.proto.HddsProtos;

import java.time.OffsetDateTime;
import java.util.List;

/**
 * Immutable holder for container balancer status: the start time, the
 * configuration proto and the list of per-iteration status entries.
 * The supplied list is stored and returned as-is (no copy is made).
 */
public class ContainerBalancerStatusInfo {
  private final OffsetDateTime startedAt;
  private final HddsProtos.ContainerBalancerConfigurationProto configuration;
  private final List<ContainerBalancerTaskIterationStatusInfo> iterationsStatusInfo;

  /**
   * Creates a status holder from the given values.
   *
   * @param startedAt start time supplied by the caller
   * @param configuration balancer configuration proto
   * @param iterationsStatusInfo per-iteration status entries
   */
  public ContainerBalancerStatusInfo(
      final OffsetDateTime startedAt,
      final HddsProtos.ContainerBalancerConfigurationProto configuration,
      final List<ContainerBalancerTaskIterationStatusInfo> iterationsStatusInfo) {
    this.iterationsStatusInfo = iterationsStatusInfo;
    this.configuration = configuration;
    this.startedAt = startedAt;
  }

  /**
   * @return the start time passed at construction
   */
  public OffsetDateTime getStartedAt() {
    return this.startedAt;
  }

  /**
   * @return the configuration proto passed at construction
   */
  public HddsProtos.ContainerBalancerConfigurationProto getConfiguration() {
    return this.configuration;
  }

  /**
   * @return the per-iteration status list passed at construction
   */
  public List<ContainerBalancerTaskIterationStatusInfo> getIterationsStatusInfo() {
    return this.iterationsStatusInfo;
  }
}
Loading