From cd078eef0dbe65c0c76c086fd3adf2e063c780a8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9D=9C=E5=AE=89=E6=98=8E?= Date: Fri, 25 Dec 2020 10:38:47 +0800 Subject: [PATCH] add system memory metrics for fe --- .../operation/monitor-metrics/fe-metrics.md | 30 ++++++++++ .../operation/monitor-metrics/fe-metrics.md | 30 ++++++++++ .../org/apache/doris/metric/MetricRepo.java | 55 +++++++++++++++++++ .../apache/doris/metric/SystemMetrics.java | 49 ++++++++++++++++- 4 files changed, 163 insertions(+), 1 deletion(-) diff --git a/docs/en/administrator-guide/operation/monitor-metrics/fe-metrics.md b/docs/en/administrator-guide/operation/monitor-metrics/fe-metrics.md index 7a2afae78e4796..cf0e5801b6d8e5 100644 --- a/docs/en/administrator-guide/operation/monitor-metrics/fe-metrics.md +++ b/docs/en/administrator-guide/operation/monitor-metrics/fe-metrics.md @@ -75,3 +75,33 @@ Value of the `Tcp: OutSegs` field in `/proc/net/snmp`. Represents the number of Use `(NEW_tcp_retrans_segs - OLD_tcp_retrans_segs) / (NEW_tcp_out_segs - OLD_tcp_out_segs)` can calculate the retrans rate of TCP packets. Usually used to troubleshoot network problems. + +### `doris_fe_meminfo{name="memory_total"}` + +Value of the `MemTotal` field in `/proc/meminfo`. Represents the total usable memory, i.e. physical memory minus reserved space and the kernel binary code. + +Usually used to troubleshoot memory problems. + +### `doris_fe_meminfo{name="memory_free"}` + +Value of the `MemFree` field in `/proc/meminfo`. Represents the size of unused memory in the system. + +Usually used to troubleshoot memory problems. + +### `doris_fe_meminfo{name="memory_available"}` + +Value of the `MemAvailable` field in `/proc/meminfo`. Represents the memory that is really usable by the system. Some memory that is already in use can be reclaimed, so the usable memory is this reclaimable memory plus MemFree. + +Usually used to troubleshoot memory problems.
+ +### `doris_fe_meminfo{name="buffers"}` + +Value of the `Buffers` field in `/proc/meminfo`. Represents the memory used to cache the block device (metadata, pages of the file system). + +Usually used to troubleshoot memory problems. + +### `doris_fe_meminfo{name="cached"}` + +Value of the `Cached` field in `/proc/meminfo`. Represents the memory allocated to the file cache. + +Usually used to troubleshoot memory problems. diff --git a/docs/zh-CN/administrator-guide/operation/monitor-metrics/fe-metrics.md b/docs/zh-CN/administrator-guide/operation/monitor-metrics/fe-metrics.md index aaa18541047d0b..f245abf5d41597 100644 --- a/docs/zh-CN/administrator-guide/operation/monitor-metrics/fe-metrics.md +++ b/docs/zh-CN/administrator-guide/operation/monitor-metrics/fe-metrics.md @@ -75,3 +75,33 @@ FE 的监控项可以通过以下方式访问: 通过 `(NEW_tcp_tcp_retrans_segs - OLD_tcp_retrans_segs) / (NEW_tcp_out_segs - OLD_tcp_out_segs)` 可以计算 TCP 重传率。 通常用于排查网络问题。 + +### `doris_fe_meminfo{name="memory_total"}` + +该监控项为 `/proc/meminfo` 中的 `MemTotal` 字段值。表示所有可用的内存大小,总的物理内存减去预留空间和内核大小。 + +通常用于排查内存问题。 + +### `doris_fe_meminfo{name="memory_free"}` + +该监控项为 `/proc/meminfo` 中的 `MemFree` 字段值。表示系统尚未使用的内存。 + +通常用于排查内存问题。 + +### `doris_fe_meminfo{name="memory_available"}` + +该监控项为 `/proc/meminfo` 中的 `MemAvailable` 字段值。表示真正的系统可用内存。系统中有些内存虽然已被使用但是可以回收,所以这部分可回收的内存加上 MemFree 才是系统可用的内存。 + +通常用于排查内存问题。 + +### `doris_fe_meminfo{name="buffers"}` + +该监控项为 `/proc/meminfo` 中的 `Buffers` 字段值。表示用来给块设备做缓存的内存(文件系统的metadata、pages)。 + +通常用于排查内存问题。 + +### `doris_fe_meminfo{name="cached"}` + +该监控项为 `/proc/meminfo` 中的 `Cached` 字段值。表示分配给文件缓冲区的内存。 + +通常用于排查内存问题。 diff --git a/fe/fe-core/src/main/java/org/apache/doris/metric/MetricRepo.java b/fe/fe-core/src/main/java/org/apache/doris/metric/MetricRepo.java index 574fc891d71e23..0fddec811e75ba 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/metric/MetricRepo.java +++ b/fe/fe-core/src/main/java/org/apache/doris/metric/MetricRepo.java @@ -328,6 +328,61 @@ public Long getValue() { };
tpcOutSegs.addLabel(new MetricLabel("name", "tcp_out_segs")); PALO_METRIC_REGISTER.addPaloMetrics(tpcOutSegs); + + // Memory Total + GaugeMetric memTotal = (GaugeMetric) new GaugeMetric( + "meminfo", MetricUnit.BYTES, "Total usable memory") { + @Override + public Long getValue() { + return SYSTEM_METRICS.memTotal; + } + }; + memTotal.addLabel(new MetricLabel("name", "memory_total")); + PALO_METRIC_REGISTER.addPaloMetrics(memTotal); + + // Memory Free + GaugeMetric memFree = (GaugeMetric) new GaugeMetric( + "meminfo", MetricUnit.BYTES, "The amount of physical memory not used by the system") { + @Override + public Long getValue() { + return SYSTEM_METRICS.memFree; + } + }; + memFree.addLabel(new MetricLabel("name", "memory_free")); + PALO_METRIC_REGISTER.addPaloMetrics(memFree); + + // Memory Available + GaugeMetric memAvailable = (GaugeMetric) new GaugeMetric( + "meminfo", MetricUnit.BYTES, "An estimate of how much memory is available for starting new applications, without swapping") { + @Override + public Long getValue() { + return SYSTEM_METRICS.memAvailable; + } + }; + memAvailable.addLabel(new MetricLabel("name", "memory_available")); + PALO_METRIC_REGISTER.addPaloMetrics(memAvailable); + + // Buffers + GaugeMetric buffers = (GaugeMetric) new GaugeMetric( + "meminfo", MetricUnit.BYTES, "Memory in buffer cache, so relatively temporary storage for raw disk blocks") { + @Override + public Long getValue() { + return SYSTEM_METRICS.buffers; + } + }; + buffers.addLabel(new MetricLabel("name", "buffers")); + PALO_METRIC_REGISTER.addPaloMetrics(buffers); + + // Cached + GaugeMetric cached = (GaugeMetric) new GaugeMetric( + "meminfo", MetricUnit.BYTES, "Memory in the pagecache (Diskcache and Shared Memory)") { + @Override + public Long getValue() { + return SYSTEM_METRICS.cached; + } + }; + cached.addLabel(new MetricLabel("name", "cached")); + PALO_METRIC_REGISTER.addPaloMetrics(cached); } // to generate the metrics related to tablets of each backends diff --git
a/fe/fe-core/src/main/java/org/apache/doris/metric/SystemMetrics.java b/fe/fe-core/src/main/java/org/apache/doris/metric/SystemMetrics.java index 15b221db2c6294..8b13ba6b9fb604 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/metric/SystemMetrics.java +++ b/fe/fe-core/src/main/java/org/apache/doris/metric/SystemMetrics.java @@ -45,9 +45,20 @@ public class SystemMetrics { protected long tcpInSegs = 0; // All send TCP packets with RST mark protected long tcpOutSegs = 0; + // Total usable memory + protected long memTotal = 0; + // The amount of physical memory not used by the system + protected long memFree = 0; + // An estimate of how much memory is available for starting new applications, without swapping + protected long memAvailable = 0; + // Memory in buffer cache, so relatively temporary storage for raw disk blocks + protected long buffers = 0; + // Memory in the pagecache (Diskcache and Shared Memory) + protected long cached = 0; public synchronized void update() { updateSnmpMetrics(); + updateMemoryMetrics(); } private void updateSnmpMetrics() { @@ -81,7 +92,7 @@ private void updateSnmpMetrics() { if ((line = br.readLine()) == null) { throw new Exception("failed to read metrics of TCP"); } - + // eg: Tcp: 1 200 120000 -1 38920626 10487279 105581903 300009 305 18079291213 15411998945 11808180 22905 4174570 0 String[] parts = line.split(" "); if (parts.length != headerMap.size()) { @@ -98,4 +109,40 @@ private void updateSnmpMetrics() { } } + private void updateMemoryMetrics() { + String procFile = "/proc/meminfo"; + String[] memoryMetrics = {"MemTotal", "MemFree", "MemAvailable", "Buffers", "Cached"}; + Map<String, Long> memInfoMap = Maps.newHashMap(); + + try (FileReader fileReader = new FileReader(procFile); + BufferedReader br = new BufferedReader(fileReader)) { + String[] parts; + String line = null; + while ((line = br.readLine()) != null) { + for (String memoryMetric : memoryMetrics) { + if (!memInfoMap.containsKey(memoryMetric) && line.startsWith(memoryMetric)) {
+ parts = line.split("\\s+"); + if (parts.length != 3) { + throw new Exception("invalid memory metrics: " + line); + } else { + memInfoMap.put(memoryMetric, Long.parseLong(parts[1]) * 1024); + break; + } + } + } + if (memInfoMap.size() == memoryMetrics.length) { + break; + } + } + } catch (Exception e) { + LOG.warn("failed to get /proc/meminfo", e); + } + // assigned after the try/catch so that any metric not read from /proc/meminfo (including when reading fails) is reported as -1 + memTotal = memInfoMap.getOrDefault("MemTotal", -1L); + memFree = memInfoMap.getOrDefault("MemFree", -1L); + memAvailable = memInfoMap.getOrDefault("MemAvailable", -1L); + buffers = memInfoMap.getOrDefault("Buffers", -1L); + cached = memInfoMap.getOrDefault("Cached", -1L); + } + }