Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,36 @@ Use `(NEW_tcp_retrans_segs - OLD_tcp_retrans_segs) / (NEW_tcp_out_segs - OLD_tcp

Usually used to troubleshoot network problems.

### `doris_fe_meminfo{name="memory_total"}`

Value of the `MemTotal` field in `/proc/meminfo`. Represents the total usable memory, i.e. total physical memory minus reserved space and the memory occupied by the kernel.

Usually used to troubleshoot memory problems.

### `doris_fe_meminfo{name="memory_free"}`

Value of the `MemFree` field in `/proc/meminfo`. Represents the amount of memory that is not used by the system.

Usually used to troubleshoot memory problems.

### `doris_fe_meminfo{name="memory_available"}`

Value of the `MemAvailable` field in `/proc/meminfo`. Represents the memory that is actually usable by the system. Some memory is already in use but can be reclaimed, so the usable memory is this reclaimable memory plus `MemFree`.

Usually used to troubleshoot memory problems.

### `doris_fe_meminfo{name="buffers"}`

Value of the `Buffers` field in `/proc/meminfo`. Represents the memory used to cache the block device (metadata, pages of the file system).

Usually used to troubleshoot memory problems.

### `doris_fe_meminfo{name="cached"}`

Value of the `Cached` field in `/proc/meminfo`. Represents the memory allocated to the file cache.

Usually used to troubleshoot memory problems.

### `jvm_thread{type="count"}`

Value of the `count` type in `jvm_thread`. Represents the current number of live threads including both daemon and non-daemon threads.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,36 @@ FE 的监控项可以通过以下方式访问:

通常用于排查网络问题。

### `doris_fe_meminfo{name="memory_total"}`

该监控项为 `/proc/meminfo` 中的 `MemTotal` 字段值。表示所有可用的内存大小,总的物理内存减去预留空间和内核大小。

通常用于排查内存问题。

### `doris_fe_meminfo{name="memory_free"}`

该监控项为 `/proc/meminfo` 中的 `MemFree` 字段值。表示系统尚未使用的内存。

通常用于排查内存问题。

### `doris_fe_meminfo{name="memory_available"}`

该监控项为 `/proc/meminfo` 中的 `MemAvailable` 字段值。表示真正的系统可用内存。系统中有些内存虽然已被使用但是可以回收的,所以这部分可回收的内存加上 `MemFree` 才是系统可用的内存。

通常用于排查内存问题。

### `doris_fe_meminfo{name="buffers"}`

该监控项为 `/proc/meminfo` 中的 `Buffers` 字段值。表示用来给块设备做缓存的内存(文件系统的metadata、pages)。

通常用于排查内存问题。

### `doris_fe_meminfo{name="cached"}`

该监控项为 `/proc/meminfo` 中的 `Cached` 字段值。表示分配给文件缓冲区的内存。

通常用于排查内存问题。

### `jvm_thread{type="count"}`

该监控项表示FE节点当前JVM总的线程数量,包含daemon线程和非daemon线程。
Expand Down
55 changes: 55 additions & 0 deletions fe/fe-core/src/main/java/org/apache/doris/metric/MetricRepo.java
Original file line number Diff line number Diff line change
Expand Up @@ -328,6 +328,61 @@ public Long getValue() {
};
tpcOutSegs.addLabel(new MetricLabel("name", "tcp_out_segs"));
PALO_METRIC_REGISTER.addPaloMetrics(tpcOutSegs);

// Memory Total
GaugeMetric<Long> memTotal = (GaugeMetric<Long>) new GaugeMetric<Long>(
"meminfo", MetricUnit.BYTES, "Total usable memory") {
@Override
public Long getValue() {
return SYSTEM_METRICS.memTotal;
}
};
memTotal.addLabel(new MetricLabel("name", "memory_total"));
PALO_METRIC_REGISTER.addPaloMetrics(memTotal);

// Memory Free
GaugeMetric<Long> memFree = (GaugeMetric<Long>) new GaugeMetric<Long>(
"meminfo", MetricUnit.BYTES, "The amount of physical memory not used by the system") {
@Override
public Long getValue() {
return SYSTEM_METRICS.memFree;
}
};
memFree.addLabel(new MetricLabel("name", "memory_free"));
PALO_METRIC_REGISTER.addPaloMetrics(memFree);

// Memory Available
GaugeMetric<Long> memAvailable = (GaugeMetric<Long>) new GaugeMetric<Long>(
"meminfo", MetricUnit.BYTES, "An estimate of how much memory is available for starting new applications, without swapping") {
@Override
public Long getValue() {
return SYSTEM_METRICS.memAvailable;
}
};
memAvailable.addLabel(new MetricLabel("name", "memory_available"));
PALO_METRIC_REGISTER.addPaloMetrics(memAvailable);

// Buffers
GaugeMetric<Long> buffers = (GaugeMetric<Long>) new GaugeMetric<Long>(
"meminfo", MetricUnit.BYTES, "Memory in buffer cache, so relatively temporary storage for raw disk blocks") {
@Override
public Long getValue() {
return SYSTEM_METRICS.buffers;
}
};
buffers.addLabel(new MetricLabel("name", "buffers"));
PALO_METRIC_REGISTER.addPaloMetrics(buffers);

// Cached
GaugeMetric<Long> cached = (GaugeMetric<Long>) new GaugeMetric<Long>(
"meminfo", MetricUnit.BYTES, "Memory in the pagecache (Diskcache and Shared Memory)") {
@Override
public Long getValue() {
return SYSTEM_METRICS.cached;
}
};
cached.addLabel(new MetricLabel("name", "cached"));
PALO_METRIC_REGISTER.addPaloMetrics(cached);
}

// to generate the metrics related to tablets of each backends
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,9 +45,20 @@ public class SystemMetrics {
protected long tcpInSegs = 0;
// All send TCP packets with RST mark
protected long tcpOutSegs = 0;
// The memory metrics below are filled by updateMemoryMetrics() from /proc/meminfo and are
// stored in bytes (the parsed value multiplied by 1024). Each one is 0 until a successful
// update, and -1 when its field is absent from the file.
// Total usable memory (MemTotal)
protected long memTotal = 0;
// The amount of physical memory not used by the system (MemFree)
protected long memFree = 0;
// An estimate of how much memory is available for starting new applications, without swapping (MemAvailable)
protected long memAvailable = 0;
// Memory in buffer cache, so relatively temporary storage for raw disk blocks (Buffers)
protected long buffers = 0;
// Memory in the pagecache (Diskcache and Shared Memory) (Cached)
protected long cached = 0;

/**
 * Refreshes the metrics held by this object by running the SNMP updater
 * followed by the memory updater. The method is synchronized on this instance.
 */
public synchronized void update() {
updateSnmpMetrics();
updateMemoryMetrics();
}

private void updateSnmpMetrics() {
Expand Down Expand Up @@ -81,7 +92,7 @@ private void updateSnmpMetrics() {
if ((line = br.readLine()) == null) {
throw new Exception("failed to read metrics of TCP");
}

// eg: Tcp: 1 200 120000 -1 38920626 10487279 105581903 300009 305 18079291213 15411998945 11808180 22905 4174570 0
String[] parts = line.split(" ");
if (parts.length != headerMap.size()) {
Expand All @@ -98,4 +109,40 @@ private void updateSnmpMetrics() {
}
}

/**
 * Refreshes the memory gauges (memTotal, memFree, memAvailable, buffers, cached)
 * from /proc/meminfo.
 *
 * A wanted line is expected to have exactly three whitespace-separated parts,
 * e.g. "MemTotal:       16315652 kB". The numeric part is multiplied by 1024 and
 * stored, so the gauges are expressed in bytes. A wanted field that never shows up
 * in the file is reported as -1.
 *
 * If the file cannot be read, or a wanted line is malformed, a warning is logged
 * and every gauge keeps the value it had before this call.
 */
private void updateMemoryMetrics() {
    String procFile = "/proc/meminfo";
    String[] memoryMetrics = {"MemTotal", "MemFree", "MemAvailable", "Buffers", "Cached"};
    Map<String, Long> memInfoMap = Maps.newHashMap();

    try (FileReader fileReader = new FileReader(procFile);
            BufferedReader br = new BufferedReader(fileReader)) {
        String line;
        while ((line = br.readLine()) != null) {
            for (String memoryMetric : memoryMetrics) {
                // Match the complete field name including its trailing ':' so that a
                // longer field sharing the same prefix is never mistaken for this one.
                if (memInfoMap.containsKey(memoryMetric) || !line.startsWith(memoryMetric + ":")) {
                    continue;
                }
                String[] parts = line.split("\\s+");
                if (parts.length != 3) {
                    throw new Exception("invalid memory metrics: " + line);
                }
                // Long.parseLong replaces the deprecated Long(String) constructor and
                // skips the needless intermediate boxing; the value is scaled to bytes.
                memInfoMap.put(memoryMetric, Long.parseLong(parts[1]) * 1024);
                // A line carries a single field, no need to test the remaining names.
                break;
            }
            // Stop reading as soon as every wanted field has been collected.
            if (memInfoMap.size() == memoryMetrics.length) {
                break;
            }
        }
        // if can not get metrics from /proc/meminfo, we will set -1 as default value
        memTotal = memInfoMap.getOrDefault("MemTotal", -1L);
        memFree = memInfoMap.getOrDefault("MemFree", -1L);
        memAvailable = memInfoMap.getOrDefault("MemAvailable", -1L);
        buffers = memInfoMap.getOrDefault("Buffers", -1L);
        cached = memInfoMap.getOrDefault("Cached", -1L);
    } catch (Exception e) {
        // Best effort: the gauges are left untouched when the file is unreadable or malformed.
        LOG.warn("failed to get /proc/meminfo", e);
    }
}

}