Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 29 additions & 8 deletions be/src/util/system_metrics.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@
// under the License.

#include "util/system_metrics.h"
#include "gutil/strings/split.h" // for string split
#include "gutil/strtoint.h" // for atoi64

#include <stdio.h>
#include <gperftools/malloc_extension.h>
Expand Down Expand Up @@ -72,6 +74,10 @@ struct SnmpMetrics {
METRIC_DEFINE_INT_LOCK_COUNTER(tcp_in_errs, MetricUnit::NOUNIT);
// All TCP packets retransmitted
METRIC_DEFINE_INT_LOCK_COUNTER(tcp_retrans_segs, MetricUnit::NOUNIT);
// All received TCP packets
METRIC_DEFINE_INT_LOCK_COUNTER(tcp_in_segs, MetricUnit::NOUNIT);
// All sent TCP packets
METRIC_DEFINE_INT_LOCK_COUNTER(tcp_out_segs, MetricUnit::NOUNIT);
};

struct FileDescriptorMetrics {
Expand Down Expand Up @@ -323,6 +329,8 @@ void SystemMetrics::_install_snmp_metrics(MetricRegistry* registry) {
&_snmp_metrics->name)
REGISTER_SNMP_METRIC(tcp_in_errs);
REGISTER_SNMP_METRIC(tcp_retrans_segs);
REGISTER_SNMP_METRIC(tcp_in_segs);
REGISTER_SNMP_METRIC(tcp_out_segs);
}

void SystemMetrics::_update_net_metrics() {
Expand Down Expand Up @@ -449,8 +457,16 @@ void SystemMetrics::_update_snmp_metrics() {
return;
}

// skip the Tcp header line
// parse the Tcp header
// Tcp: RtoAlgorithm RtoMin RtoMax MaxConn ActiveOpens PassiveOpens AttemptFails EstabResets CurrEstab InSegs OutSegs RetransSegs InErrs OutRsts InCsumErrors
std::vector<std::string> headers = strings::Split(_line_ptr, " ");
std::unordered_map<std::string, int32_t> header_map;
int32_t pos = 0;
for (auto& h : headers) {
header_map.emplace(h, pos++);
}

// read the metrics of TCP
if (getline(&_line_ptr, &_line_buf_size, fp) < 0) {
char buf[64];
LOG(WARNING) << "failed to skip Tcp header line of /proc/net/snmp, errno=" << errno
Expand All @@ -461,15 +477,20 @@ void SystemMetrics::_update_snmp_metrics() {

// metric line looks like:
// Tcp: 1 200 120000 -1 47849374 38601877 3353843 2320314 276 1033354613 1166025166 825439 12694 23238924 0
int64_t retrans_segs = 0;
int64_t in_errs = 0;
sscanf(_line_ptr,
"Tcp: %*d %*d %*d %*d %*d %*d %*d %*d %*d %*d %*d"
" %" PRId64 " %" PRId64 " %*d %*d",
&retrans_segs, &in_errs);

std::vector<std::string> metrics = strings::Split(_line_ptr, " ");
if (metrics.size() != headers.size()) {
LOG(WARNING) << "invalid tcp metrics line: " << _line_ptr;
fclose(fp);
return;
}
int64_t retrans_segs = atoi64(metrics[header_map["RetransSegs"]]);
int64_t in_errs = atoi64(metrics[header_map["InErrs"]]);
int64_t in_segs = atoi64(metrics[header_map["InSegs"]]);
int64_t out_segs = atoi64(metrics[header_map["OutSegs"]]);
_snmp_metrics->tcp_retrans_segs.set_value(retrans_segs);
_snmp_metrics->tcp_in_errs.set_value(in_errs);
_snmp_metrics->tcp_in_segs.set_value(in_segs);
_snmp_metrics->tcp_out_segs.set_value(out_segs);

if (ferror(fp) != 0) {
char buf[64];
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -59,3 +59,19 @@ Value of the `Tcp: RetransSegs` field in `/proc/net/snmp`. Represents the number
The incidence rate can be calculated in combination with the sampling period.

Usually used to troubleshoot network problems.

### `doris_be_snmp{name="tcp_in_segs"}`

Value of the `Tcp: InSegs` field in `/proc/net/snmp`. Represents the number of received TCP packets.

The error rate of received TCP packets can be calculated as `(NEW_tcp_in_errs - OLD_tcp_in_errs) / (NEW_tcp_in_segs - OLD_tcp_in_segs)`.

Usually used to troubleshoot network problems.

### `doris_be_snmp{name="tcp_out_segs"}`

Value of the `Tcp: OutSegs` field in `/proc/net/snmp`. Represents the number of sent TCP packets.

The TCP retransmission rate can be calculated as `(NEW_tcp_retrans_segs - OLD_tcp_retrans_segs) / (NEW_tcp_out_segs - OLD_tcp_out_segs)`.

Usually used to troubleshoot network problems.
Original file line number Diff line number Diff line change
Expand Up @@ -59,3 +59,19 @@ Value of the `Tcp: RetransSegs` field in `/proc/net/snmp`. Represents the number
The incidence rate can be calculated in combination with the sampling period.

Usually used to troubleshoot network problems.

### `doris_fe_snmp{name="tcp_in_segs"}`

Value of the `Tcp: InSegs` field in `/proc/net/snmp`. Represents the number of received TCP packets.

The error rate of received TCP packets can be calculated as `(NEW_tcp_in_errs - OLD_tcp_in_errs) / (NEW_tcp_in_segs - OLD_tcp_in_segs)`.

Usually used to troubleshoot network problems.

### `doris_fe_snmp{name="tcp_out_segs"}`

Value of the `Tcp: OutSegs` field in `/proc/net/snmp`. Represents the number of sent TCP packets.

The TCP retransmission rate can be calculated as `(NEW_tcp_retrans_segs - OLD_tcp_retrans_segs) / (NEW_tcp_out_segs - OLD_tcp_out_segs)`.

Usually used to troubleshoot network problems.
Original file line number Diff line number Diff line change
Expand Up @@ -59,3 +59,19 @@ BE 的监控项可以通过以下方式访问:
结合采样周期可以计算发生率。

通常用于排查网络问题。

### `doris_be_snmp{name="tcp_in_segs"}`

该监控项为 `/proc/net/snmp` 中的 `Tcp: InSegs` 字段值。表示当前接收到的所有 TCP 包的数量。

通过 `(NEW_tcp_in_errs - OLD_tcp_in_errs) / (NEW_tcp_in_segs - OLD_tcp_in_segs)` 可以计算接收到的 TCP 错误包率。

通常用于排查网络问题。

### `doris_be_snmp{name="tcp_out_segs"}`

该监控项为 `/proc/net/snmp` 中的 `Tcp: OutSegs` 字段值。表示当前发送的所有 TCP 包的数量。

通过 `(NEW_tcp_retrans_segs - OLD_tcp_retrans_segs) / (NEW_tcp_out_segs - OLD_tcp_out_segs)` 可以计算 TCP 重传率。

通常用于排查网络问题。
Original file line number Diff line number Diff line change
Expand Up @@ -59,3 +59,19 @@ FE 的监控项可以通过以下方式访问:
结合采样周期可以计算发生率。

通常用于排查网络问题。

### `doris_fe_snmp{name="tcp_in_segs"}`

该监控项为 `/proc/net/snmp` 中的 `Tcp: InSegs` 字段值。表示当前接收到的所有 TCP 包的数量。

通过 `(NEW_tcp_in_errs - OLD_tcp_in_errs) / (NEW_tcp_in_segs - OLD_tcp_in_segs)` 可以计算接收到的 TCP 错误包率。

通常用于排查网络问题。

### `doris_fe_snmp{name="tcp_out_segs"}`

该监控项为 `/proc/net/snmp` 中的 `Tcp: OutSegs` 字段值。表示当前发送的所有 TCP 包的数量。

通过 `(NEW_tcp_retrans_segs - OLD_tcp_retrans_segs) / (NEW_tcp_out_segs - OLD_tcp_out_segs)` 可以计算 TCP 重传率。

通常用于排查网络问题。
22 changes: 22 additions & 0 deletions fe/src/main/java/org/apache/doris/metric/MetricRepo.java
Original file line number Diff line number Diff line change
Expand Up @@ -275,6 +275,28 @@ public Long getValue() {
};
tpcInErrs.addLabel(new MetricLabel("name", "tcp_in_errs"));
PALO_METRIC_REGISTER.addPaloMetrics(tpcInErrs);

// TCP inSegs
GaugeMetric<Long> tpcInSegs = (GaugeMetric<Long>) new GaugeMetric<Long>(
"snmp", MetricUnit.NOUNIT, "The number of all TCP packets received") {
@Override
public Long getValue() {
return SYSTEM_METRICS.tcpInSegs;
}
};
tpcInSegs.addLabel(new MetricLabel("name", "tcp_in_segs"));
PALO_METRIC_REGISTER.addPaloMetrics(tpcInSegs);

// TCP outSegs
GaugeMetric<Long> tpcOutSegs = (GaugeMetric<Long>) new GaugeMetric<Long>(
"snmp", MetricUnit.NOUNIT, "The number of all TCP packets send with RST") {
@Override
public Long getValue() {
return SYSTEM_METRICS.tcpOutSegs;
}
};
tpcOutSegs.addLabel(new MetricLabel("name", "tcp_out_segs"));
PALO_METRIC_REGISTER.addPaloMetrics(tpcOutSegs);
}

// to generate the metrics related to tablets of each backends
Expand Down
30 changes: 24 additions & 6 deletions fe/src/main/java/org/apache/doris/metric/SystemMetrics.java
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,14 @@

import org.apache.doris.common.FeConstants;

import com.google.common.collect.Maps;

import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;

import java.io.BufferedReader;
import java.io.FileReader;
import java.util.Map;

/**
* Save system metrics such as CPU, MEM, IO, Networks.
Expand All @@ -38,6 +41,10 @@ public class SystemMetrics {
protected long tcpRetransSegs = 0;
// The number of all problematic TCP packets received
protected long tcpInErrs = 0;
// All received TCP packets
protected long tcpInSegs = 0;
// All sent TCP packets
protected long tcpOutSegs = 0;

public synchronized void update() {
updateSnmpMetrics();
Expand All @@ -61,19 +68,30 @@ private void updateSnmpMetrics() {
if (!found) {
throw new Exception("can not find tcp metrics");
}
// skip tcp header line

// parse the header of TCP
String[] headers = line.split(" ");
Map<String, Integer> headerMap = Maps.newHashMap();
int pos = 0;
for (int i = 0; i < headers.length; i++) {
headerMap.put(headers[i], pos++);
}

// read the metrics of TCP
if ((line = br.readLine()) == null) {
throw new Exception("failed to skip tcp metrics header");
throw new Exception("failed to read metrics of TCP");
}

// eg: Tcp: 1 200 120000 -1 38920626 10487279 105581903 300009 305 18079291213 15411998945 11808180 22905 4174570 0
String[] parts = line.split(" ");
if (parts.length != 16) {
throw new Exception("invalid tcp metrics: " + line);
if (parts.length != headerMap.size()) {
throw new Exception("invalid tcp metrics: " + line + ". header size: " + headerMap.size());
}

tcpRetransSegs = Long.valueOf(parts[12]);
tcpInErrs = Long.valueOf(parts[13]);
tcpRetransSegs = Long.valueOf(parts[headerMap.get("RetransSegs")]);
tcpInErrs = Long.valueOf(parts[headerMap.get("InErrs")]);
tcpInSegs = Long.valueOf(parts[headerMap.get("InSegs")]);
tcpOutSegs = Long.valueOf(parts[headerMap.get("OutSegs")]);

} catch (Exception e) {
LOG.warn("failed to get /proc/net/snmp", e);
Expand Down