Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

HDDS-11462. Enhancing DataNode I/O Monitoring Capabilities. #7206

Merged
merged 3 commits into from
Nov 8, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -401,4 +401,7 @@ private HddsConfigKeys() {
"hdds.datanode.slow.op.warning.threshold";
public static final String HDDS_DATANODE_SLOW_OP_WARNING_THRESHOLD_DEFAULT =
"500ms";

/**
 * Configuration key holding the interval(s), in seconds, used for the
 * volume IO latency percentile metrics. Note that the key string uses the
 * "ozone.volume.io" prefix although the constant is named DATANODE_IO_METRICS.
 */
public static final String OZONE_DATANODE_IO_METRICS_PERCENTILES_INTERVALS_SECONDS_KEY =
"ozone.volume.io.percentiles.intervals.seconds";
}
12 changes: 12 additions & 0 deletions hadoop-hdds/common/src/main/resources/ozone-default.xml
Original file line number Diff line number Diff line change
Expand Up @@ -4544,4 +4544,16 @@
maximum number of buckets across all volumes.
</description>
</property>

<property>
<name>ozone.volume.io.percentiles.intervals.seconds</name>
<value>60</value>
<tag>OZONE, DATANODE</tag>
<description>
This setting specifies one or more intervals (in seconds; separate multiple values with commas)
over which the percentile metrics for volume read and write latency are computed.
It helps in tracking the read and write performance of DataNodes in near real-time,
allowing for better identification and analysis of performance issues.
</description>
</property>

</configuration>
Original file line number Diff line number Diff line change
Expand Up @@ -26,4 +26,32 @@
*/
@InterfaceAudience.Private
public interface DNMXBean extends ServiceRuntimeInfo {

/**
 * Gets the datanode hostname.
 *
 * @return the datanode hostname for the datanode.
 */
String getHostname();

/**
 * Gets the client rpc port.
 *
 * @return the client rpc port, as a string
 */
String getClientRpcPort();

/**
 * Gets the http port.
 *
 * @return the http port, as a string
 */
String getHttpPort();

/**
 * Gets the https port.
 *
 * @return the https port, as a string
 */
String getHttpsPort();
}
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,53 @@
* This is the JMX management class for DN information.
*/
public class DNMXBeanImpl extends ServiceRuntimeInfoImpl implements DNMXBean {
public DNMXBeanImpl(
VersionInfo versionInfo) {

private String hostName;
private String clientRpcPort;
private String httpPort;
private String httpsPort;

public DNMXBeanImpl(VersionInfo versionInfo) {
super(versionInfo);
}

@Override
public String getHostname() {
return hostName;
}

@Override
public String getClientRpcPort() {
return clientRpcPort;
}

@Override
public String getHttpPort() {
return httpPort;
}

@Override
public String getHttpsPort() {
return httpsPort;
}

public void setHttpPort(String httpPort) {
this.httpPort = httpPort;
}

public void setHostName(String hostName) {
this.hostName = hostName;
}

public void setClientRpcPort(String rpcPort) {
this.clientRpcPort = rpcPort;
}

public String getHostName() {
return hostName;
}

public void setHttpsPort(String httpsPort) {
this.httpsPort = httpsPort;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -228,6 +228,7 @@ public String getNamespace() {
String ip = InetAddress.getByName(hostname).getHostAddress();
datanodeDetails = initializeDatanodeDetails();
datanodeDetails.setHostName(hostname);
serviceRuntimeInfo.setHostName(hostname);
datanodeDetails.setIpAddress(ip);
datanodeDetails.setVersion(
HddsVersionInfo.HDDS_VERSION_INFO.getVersion());
Expand Down Expand Up @@ -300,23 +301,30 @@ public String getNamespace() {
httpServer = new HddsDatanodeHttpServer(conf);
httpServer.start();
HttpConfig.Policy policy = HttpConfig.getHttpPolicy(conf);

if (policy.isHttpEnabled()) {
datanodeDetails.setPort(DatanodeDetails.newPort(HTTP,
httpServer.getHttpAddress().getPort()));
int httpPort = httpServer.getHttpAddress().getPort();
datanodeDetails.setPort(DatanodeDetails.newPort(HTTP, httpPort));
serviceRuntimeInfo.setHttpPort(String.valueOf(httpPort));
}

if (policy.isHttpsEnabled()) {
  // Read the port from the HTTPS address; using getHttpAddress() here would
  // publish the plain-HTTP port under the HTTPS name.
  int httpsPort = httpServer.getHttpsAddress().getPort();
  datanodeDetails.setPort(DatanodeDetails.newPort(HTTPS, httpsPort));
  serviceRuntimeInfo.setHttpsPort(String.valueOf(httpsPort));
}

} catch (Exception ex) {
LOG.error("HttpServer failed to start.", ex);
}


clientProtocolServer = new HddsDatanodeClientProtocolServer(
datanodeDetails, conf, HddsVersionInfo.HDDS_VERSION_INFO,
reconfigurationHandler);

int clientRpcport = clientProtocolServer.getClientRpcAddress().getPort();
serviceRuntimeInfo.setClientRpcPort(String.valueOf(clientRpcport));

// Get admin list
String starterUser =
UserGroupInformation.getCurrentUser().getShortUserName();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -251,6 +251,21 @@ public Iterator<Container<?>> getContainerIterator(HddsVolume volume) {
.iterator();
}

/**
 * Counts the containers in this set whose data is stored on the given volume.
 * A container matches when the storage ID of its volume equals the storage
 * ID of {@code volume}.
 *
 * @param volume hdds volume; neither it nor its storage ID may be null.
 * @return number of containers
 */
public long containerCount(HddsVolume volume) {
  Preconditions.checkNotNull(volume);
  Preconditions.checkNotNull(volume.getStorageID());
  final String targetStorageId = volume.getStorageID();

  long matches = 0;
  for (Container<?> container : containerMap.values()) {
    String storageId =
        container.getContainerData().getVolume().getStorageID();
    if (targetStorageId.equals(storageId)) {
      matches++;
    }
  }
  return matches;
}

/**
* Return a containerMap iterator over {@link ContainerSet#containerMap}.
* @return containerMap Iterator
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,13 +29,15 @@
import org.apache.commons.io.FileUtils;
import org.apache.hadoop.hdds.annotation.InterfaceAudience;
import org.apache.hadoop.hdds.annotation.InterfaceStability;
import org.apache.hadoop.hdds.conf.ConfigurationSource;
import org.apache.hadoop.hdds.upgrade.HDDSLayoutFeature;
import org.apache.hadoop.hdfs.server.datanode.checker.VolumeCheckResult;
import org.apache.hadoop.ozone.container.common.statemachine.DatanodeConfiguration;
import org.apache.hadoop.ozone.container.common.utils.DatanodeStoreCache;
import org.apache.hadoop.ozone.container.common.utils.HddsVolumeUtil;
import org.apache.hadoop.ozone.container.common.utils.RawDB;
import org.apache.hadoop.ozone.container.common.utils.StorageVolumeUtil;
import org.apache.hadoop.ozone.container.ozoneimpl.ContainerController;
import org.apache.hadoop.ozone.container.upgrade.VersionedDatanodeFeatures;
import org.apache.hadoop.ozone.container.upgrade.VersionedDatanodeFeatures.SchemaV3;
import org.apache.hadoop.util.Time;
Expand All @@ -44,6 +46,7 @@

import jakarta.annotation.Nullable;

import static org.apache.hadoop.hdds.HddsConfigKeys.OZONE_DATANODE_IO_METRICS_PERCENTILES_INTERVALS_SECONDS_KEY;
import static org.apache.hadoop.ozone.OzoneConsts.CONTAINER_DB_NAME;
import static org.apache.hadoop.ozone.container.common.utils.HddsVolumeUtil.initPerDiskDBStore;

Expand Down Expand Up @@ -80,6 +83,8 @@ public class HddsVolume extends StorageVolume {
private final VolumeIOStats volumeIOStats;
private final VolumeInfoMetrics volumeInfoMetrics;

private ContainerController controller;

private final AtomicLong committedBytes = new AtomicLong(); // till Open containers become full

// Mentions the type of volume
Expand Down Expand Up @@ -119,8 +124,10 @@ private HddsVolume(Builder b) throws IOException {

if (!b.getFailedVolume() && getVolumeInfo().isPresent()) {
this.setState(VolumeState.NOT_INITIALIZED);
ConfigurationSource conf = getConf();
int[] intervals = conf.getInts(OZONE_DATANODE_IO_METRICS_PERCENTILES_INTERVALS_SECONDS_KEY);
this.volumeIOStats = new VolumeIOStats(b.getVolumeRootStr(),
this.getStorageDir().toString());
this.getStorageDir().toString(), intervals);
this.volumeInfoMetrics =
new VolumeInfoMetrics(b.getVolumeRootStr(), this);

Expand Down Expand Up @@ -382,6 +389,17 @@ public void loadDbStore(boolean readOnly) throws IOException {
getStorageID());
}

/**
 * Sets the container controller that {@link #getContainers()} queries for
 * this volume's container count.
 *
 * @param controller the controller to use; while it is null,
 *                   {@link #getContainers()} returns 0.
 */
public void setController(ContainerController controller) {
this.controller = controller;
}

/**
 * Returns the number of containers on this volume as reported by the
 * container controller.
 *
 * @return the controller's container count for this volume, or 0 when no
 *         controller has been set.
 */
public long getContainers() {
  return controller == null ? 0 : controller.getContainerCount(this);
}

/**
* Pick a DbVolume for HddsVolume and init db instance.
* Use the HddsVolume directly if no DbVolume found.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,20 +21,34 @@
import org.apache.hadoop.metrics2.MetricsSystem;
import org.apache.hadoop.metrics2.annotation.Metric;
import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;
import org.apache.hadoop.metrics2.lib.MetricsRegistry;
import org.apache.hadoop.metrics2.lib.MutableCounterLong;
import org.apache.hadoop.metrics2.lib.MutableQuantiles;
import org.apache.hadoop.metrics2.lib.MutableRate;

/**
* This class is used to track Volume IO stats for each HDDS Volume.
*/
public class VolumeIOStats {
private String metricsSourceName = VolumeIOStats.class.getSimpleName();
private String storageDirectory;
private @Metric MutableCounterLong readBytes;
private @Metric MutableCounterLong readOpCount;
private @Metric MutableCounterLong writeBytes;
private @Metric MutableCounterLong writeOpCount;
private @Metric MutableCounterLong readTime;
private @Metric MutableCounterLong writeTime;
private final MetricsRegistry registry = new MetricsRegistry("VolumeIOStats");
@Metric
private MutableCounterLong readBytes;
@Metric
private MutableCounterLong readOpCount;
@Metric
private MutableCounterLong writeBytes;
@Metric
private MutableCounterLong writeOpCount;
@Metric
private MutableRate readTime;
@Metric
private MutableQuantiles[] readLatencyQuantiles;
@Metric
private MutableRate writeTime;
@Metric
private MutableQuantiles[] writeLatencyQuantiles;

@Deprecated
public VolumeIOStats() {
Expand All @@ -44,9 +58,24 @@ public VolumeIOStats() {
/**
 * Creates the IO statistics for one volume and registers one read and one
 * write latency quantile metric per requested interval.
 *
 * @param identifier Typically, path to volume root. e.g. /data/hdds
 * @param storageDirectory storage directory of the volume these stats
 *                         describe.
 * @param intervals quantile intervals in seconds; may be null or empty, in
 *                  which case no latency quantiles are created.
 */
public VolumeIOStats(String identifier, String storageDirectory, int[] intervals) {
  this.metricsSourceName += '-' + identifier;
  this.storageDirectory = storageDirectory;

  // Always leave both arrays non-null (possibly empty) so that
  // incReadTime/incWriteTime can iterate them without a NullPointerException
  // when no intervals are configured.
  final int length = intervals == null ? 0 : intervals.length;
  readLatencyQuantiles = new MutableQuantiles[length];
  writeLatencyQuantiles = new MutableQuantiles[length];
  for (int i = 0; i < length; i++) {
    readLatencyQuantiles[i] = registry.newQuantiles(
        "readLatency" + intervals[i] + "s",
        "Read Data File Io Latency in ms", "ops", "latency", intervals[i]);
    writeLatencyQuantiles[i] = registry.newQuantiles(
        "writeLatency" + intervals[i] + "s",
        "Write Data File Io Latency in ms", "ops", "latency", intervals[i]);
  }
  init();
}

Expand Down Expand Up @@ -99,15 +128,21 @@ public void incWriteOpCount() {
* @param time
*/
public void incReadTime(long time) {
  readTime.add(time);
  // The quantile array is only populated when percentile intervals were
  // supplied to the constructor, so it can still be null here.
  if (readLatencyQuantiles != null) {
    for (MutableQuantiles q : readLatencyQuantiles) {
      q.add(time);
    }
  }
}

/**
 * Records the time taken by one write operation on the volume, both in the
 * write-time rate and in every configured write latency quantile.
 *
 * @param time time taken by the write operation.
 */
public void incWriteTime(long time) {
  writeTime.add(time);
  // The quantile array is only populated when percentile intervals were
  // supplied to the constructor, so it can still be null here.
  if (writeLatencyQuantiles != null) {
    for (MutableQuantiles q : writeLatencyQuantiles) {
      q.add(time);
    }
  }
}

/**
Expand Down Expand Up @@ -147,15 +182,15 @@ public long getWriteOpCount() {
* @return long
*/
public long getReadTime() {
  // readTime is a MutableRate, so the total comes from its last recorded
  // statistics and is narrowed to long to keep the method's return type.
  // NOTE(review): whether lastStat() is cumulative or covers only the most
  // recent snapshot interval is defined by Hadoop's MutableStat - confirm
  // that callers expect that.
  return (long) readTime.lastStat().total();
}

/**
 * Returns total write operations time on the volume, taken from the last
 * recorded statistics of the write-time rate.
 * NOTE(review): whether lastStat() is cumulative or covers only the most
 * recent snapshot interval is defined by Hadoop's MutableStat - confirm
 * that callers expect that.
 *
 * @return long
 */
public long getWriteTime() {
  return (long) writeTime.lastStat().total();
}

@Metric
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ public class VolumeInfoMetrics {
private final HddsVolume volume;
@Metric("Returns the RocksDB compact times of the Volume")
private MutableRate dbCompactLatency;
private long containers;

/**
* @param identifier Typically, path to volume root. E.g. /data/hdds
Expand Down Expand Up @@ -153,4 +154,11 @@ public void dbCompactTimesNanoSecondsIncr(long time) {
dbCompactLatency.add(time);
}

/**
 * Return the Container Count of the Volume, published as a metric.
 * The value is obtained by delegating to the wrapped volume's
 * {@code getContainers()}.
 *
 * @return the container count reported by the volume.
 */
@Metric("Returns the Container Count of the Volume")
public long getContainers() {
return volume.getContainers();
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ public BackgroundContainerDataScanner(ContainerScannerConfiguration conf,
throttler = new HddsDataTransferThrottler(conf.getBandwidthPerVolume());
canceler = new Canceler();
this.metrics = ContainerDataScannerMetrics.create(volume.toString());
this.metrics.setStorageDirectory(volume.toString());
this.minScanGap = conf.getContainerScanMinGap();
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -235,6 +235,16 @@ public Iterator<Container<?>> getContainers(HddsVolume volume) {
return containerSet.getContainerIterator(volume);
}

/**
 * Get the number of containers based on the given volume.
 * Delegates to the container set's {@code containerCount(volume)}.
 *
 * @param volume hdds volume.
 * @return number of containers.
 */
public long getContainerCount(HddsVolume volume) {
return containerSet.containerCount(volume);
}

void updateDataScanTimestamp(long containerId, Instant timestamp)
throws IOException {
Container container = containerSet.getContainer(containerId);
Expand Down
Loading