diff --git a/docs/en/administrator-guide/operation/monitor-metrics/fe-metrics.md b/docs/en/administrator-guide/operation/monitor-metrics/fe-metrics.md index 4c4e3f7dfd1735..8ad4381c8a394d 100644 --- a/docs/en/administrator-guide/operation/monitor-metrics/fe-metrics.md +++ b/docs/en/administrator-guide/operation/monitor-metrics/fe-metrics.md @@ -76,6 +76,36 @@ Use `(NEW_tcp_retrans_segs - OLD_tcp_retrans_segs) / (NEW_tcp_out_segs - OLD_tcp Usually used to troubleshoot network problems. +### `doris_fe_meminfo{name="memory_total"}` + +Value of the `MemTotal` field in `/proc/meminfo`. Represents the total usable memory, i.e. total physical memory minus reserved space and the memory occupied by the kernel itself. + +Usually used to troubleshoot memory problems. + +### `doris_fe_meminfo{name="memory_free"}` + +Value of the `MemFree` field in `/proc/meminfo`. Represents the size of unused memory in the system. + +Usually used to troubleshoot memory problems. + +### `doris_fe_meminfo{name="memory_available"}` + +Value of the `MemAvailable` field in `/proc/meminfo`. Represents the memory that is really usable by the system. Some memory that is already in use can be reclaimed, so the usable memory is this reclaimable part plus `MemFree`. + +Usually used to troubleshoot memory problems. + +### `doris_fe_meminfo{name="buffers"}` + +Value of the `Buffers` field in `/proc/meminfo`. Represents the memory used to cache block devices (file system metadata and pages). + +Usually used to troubleshoot memory problems. + +### `doris_fe_meminfo{name="cached"}` + +Value of the `Cached` field in `/proc/meminfo`. Represents the memory allocated to the file cache. + +Usually used to troubleshoot memory problems. + ### `jvm_thread{type="count"}` Value of the `count` type in `jvm_thread`. Represents the current number of live threads including both daemon and non-daemon threads. 
diff --git a/docs/zh-CN/administrator-guide/operation/monitor-metrics/fe-metrics.md b/docs/zh-CN/administrator-guide/operation/monitor-metrics/fe-metrics.md index 8b58e9f53e9adb..22772964380636 100644 --- a/docs/zh-CN/administrator-guide/operation/monitor-metrics/fe-metrics.md +++ b/docs/zh-CN/administrator-guide/operation/monitor-metrics/fe-metrics.md @@ -76,6 +76,36 @@ FE 的监控项可以通过以下方式访问: 通常用于排查网络问题。 +### `doris_fe_meminfo{name="memory_total"}` + +该监控项为 `/proc/meminfo` 中的 `MemTotal` 字段值。表示所有可用的内存大小,总的物理内存减去预留空间和内核大小。 + +通常用于排查内存问题。 + +### `doris_fe_meminfo{name="memory_free"}` + +该监控项为 `/proc/meminfo` 中的 `MemFree` 字段值。表示系统尚未使用的内存。 + +通常用于排查内存问题。 + +### `doris_fe_meminfo{name="memory_available"}` + +该监控项为 `/proc/meminfo` 中的 `MemAvailable` 字段值。真正的系统可用内存,系统中有些内存虽然已被使用但是可以回收的,所以这部分可回收的内存加上MemFree才是系统可用的内存。 + +通常用于排查内存问题。 + +### `doris_fe_meminfo{name="buffers"}` + +该监控项为 `/proc/meminfo` 中的 `Buffers` 字段值。表示用来给块设备做缓存的内存(文件系统的metadata、pages)。 + +通常用于排查内存问题。 + +### `doris_fe_meminfo{name="cached"}` + +该监控项为 `/proc/meminfo` 中的 `Cached` 字段值。表示分配给文件缓冲区的内存。 + +通常用于排查内存问题。 + ### `jvm_thread{type="count"}` 该监控项表示FE节点当前JVM总的线程数量,包含daemon线程和非daemon线程。 diff --git a/fe/fe-core/src/main/java/org/apache/doris/metric/MetricRepo.java b/fe/fe-core/src/main/java/org/apache/doris/metric/MetricRepo.java index 574fc891d71e23..0fddec811e75ba 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/metric/MetricRepo.java +++ b/fe/fe-core/src/main/java/org/apache/doris/metric/MetricRepo.java @@ -328,6 +328,61 @@ public Long getValue() { }; tpcOutSegs.addLabel(new MetricLabel("name", "tcp_out_segs")); PALO_METRIC_REGISTER.addPaloMetrics(tpcOutSegs); + + // Memory Total + GaugeMetric memTotal = (GaugeMetric) new GaugeMetric( + "meminfo", MetricUnit.BYTES, "Total usable memory") { + @Override + public Long getValue() { + return SYSTEM_METRICS.memTotal; + } + }; + memTotal.addLabel(new MetricLabel("name", "memory_total")); + PALO_METRIC_REGISTER.addPaloMetrics(memTotal); + + // Memory Free + 
GaugeMetric memFree = (GaugeMetric) new GaugeMetric( + "meminfo", MetricUnit.BYTES, "The amount of physical memory not used by the system") { + @Override + public Long getValue() { + return SYSTEM_METRICS.memFree; + } + }; + memFree.addLabel(new MetricLabel("name", "memory_free")); + PALO_METRIC_REGISTER.addPaloMetrics(memFree); + + // Memory Available + GaugeMetric memAvailable = (GaugeMetric) new GaugeMetric( + "meminfo", MetricUnit.BYTES, "An estimate of how much memory is available for starting new applications, without swapping") { + @Override + public Long getValue() { + return SYSTEM_METRICS.memAvailable; + } + }; + memAvailable.addLabel(new MetricLabel("name", "memory_available")); + PALO_METRIC_REGISTER.addPaloMetrics(memAvailable); + + // Buffers + GaugeMetric buffers = (GaugeMetric) new GaugeMetric( + "meminfo", MetricUnit.BYTES, "Memory in buffer cache, so relatively temporary storage for raw disk blocks") { + @Override + public Long getValue() { + return SYSTEM_METRICS.buffers; + } + }; + buffers.addLabel(new MetricLabel("name", "buffers")); + PALO_METRIC_REGISTER.addPaloMetrics(buffers); + + // Cached + GaugeMetric cached = (GaugeMetric) new GaugeMetric( + "meminfo", MetricUnit.BYTES, "Memory in the pagecache (Diskcache and Shared Memory)") { + @Override + public Long getValue() { + return SYSTEM_METRICS.cached; + } + }; + cached.addLabel(new MetricLabel("name", "cached")); + PALO_METRIC_REGISTER.addPaloMetrics(cached); } // to generate the metrics related to tablets of each backends diff --git a/fe/fe-core/src/main/java/org/apache/doris/metric/SystemMetrics.java b/fe/fe-core/src/main/java/org/apache/doris/metric/SystemMetrics.java index 15b221db2c6294..8b13ba6b9fb604 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/metric/SystemMetrics.java +++ b/fe/fe-core/src/main/java/org/apache/doris/metric/SystemMetrics.java @@ -45,9 +45,20 @@ public class SystemMetrics { protected long tcpInSegs = 0; // All send TCP packets with RST mark protected long 
tcpOutSegs = 0; + // Total usable memory + protected long memTotal = 0; + // The amount of physical memory not used by the system + protected long memFree = 0; + // An estimate of how much memory is available for starting new applications, without swapping + protected long memAvailable = 0; + // Memory in buffer cache, so relatively temporary storage for raw disk blocks + protected long buffers = 0; + // Memory in the pagecache (Diskcache and Shared Memory) + protected long cached = 0; public synchronized void update() { updateSnmpMetrics(); + updateMemoryMetrics(); } private void updateSnmpMetrics() { @@ -81,7 +92,7 @@ private void updateSnmpMetrics() { if ((line = br.readLine()) == null) { throw new Exception("failed to read metrics of TCP"); } - + // eg: Tcp: 1 200 120000 -1 38920626 10487279 105581903 300009 305 18079291213 15411998945 11808180 22905 4174570 0 String[] parts = line.split(" "); if (parts.length != headerMap.size()) { @@ -98,4 +109,44 @@ private void updateSnmpMetrics() { } } 
+ /** + * Refreshes the memory gauges from /proc/meminfo. Values are listed there in kB and are stored in bytes. + * Parsing stops as soon as all wanted fields are found; a field that could not be read is reported as -1. + */ + private void updateMemoryMetrics() { + String procFile = "/proc/meminfo"; + String[] memoryMetrics = {"MemTotal", "MemFree", "MemAvailable", "Buffers", "Cached"}; + Map<String, Long> memInfoMap = Maps.newHashMap(); + + try (FileReader fileReader = new FileReader(procFile); + BufferedReader br = new BufferedReader(fileReader)) { + String[] parts; + String line = null; + while ((line = br.readLine()) != null) { + for (String memoryMetric : memoryMetrics) { + if (!memInfoMap.containsKey(memoryMetric) && line.startsWith(memoryMetric + ":")) { + parts = line.split("\\s+"); + if (parts.length != 3) { + throw new Exception("invalid memory metrics: " + line); + } else { + memInfoMap.put(memoryMetric, Long.parseLong(parts[1]) * 1024); + break; + } + } + } + if (memInfoMap.size() == memoryMetrics.length) { + break; + } + } + } catch (Exception e) { + LOG.warn("failed to get /proc/meminfo", e); + } + // assigned after the try/catch so that a read failure also falls back to -1 instead of keeping stale values + memTotal = memInfoMap.getOrDefault("MemTotal", -1L); + memFree = memInfoMap.getOrDefault("MemFree", -1L); + memAvailable = memInfoMap.getOrDefault("MemAvailable", -1L); + buffers = memInfoMap.getOrDefault("Buffers", -1L); + cached = memInfoMap.getOrDefault("Cached", -1L); + } + }