Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
80 changes: 80 additions & 0 deletions be/src/util/system_metrics.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,14 @@ struct NetMetrics {
METRIC_DEFINE_INT_LOCK_COUNTER(send_packets, MetricUnit::NUMBER);
};

// Metrics read from /proc/net/snmp.
// Only the "Tcp:" section of that file is consumed; each member below mirrors
// one column of the Tcp value line and is refreshed by
// SystemMetrics::_update_snmp_metrics().
struct SnmpMetrics {
// Mirrors the "InErrs" column: the number of all problematic TCP packets
// received.
METRIC_DEFINE_INT_LOCK_COUNTER(tcp_in_errs, MetricUnit::NUMBER);
// Mirrors the "RetransSegs" column: all TCP packets retransmitted.
METRIC_DEFINE_INT_LOCK_COUNTER(tcp_retrans_segs, MetricUnit::NUMBER);
};

struct FileDescriptorMetrics {
METRIC_DEFINE_INT_GAUGE(fd_num_limit, MetricUnit::NUMBER);
METRIC_DEFINE_INT_GAUGE(fd_num_used, MetricUnit::NUMBER);
Expand Down Expand Up @@ -103,6 +111,7 @@ void SystemMetrics::install(MetricRegistry* registry,
_install_disk_metrics(registry, disk_devices);
_install_net_metrics(registry, network_interfaces);
_install_fd_metrics(registry);
_install_snmp_metrics(registry);
_registry = registry;
}

Expand All @@ -112,6 +121,7 @@ void SystemMetrics::update() {
_update_disk_metrics();
_update_net_metrics();
_update_fd_metrics();
_update_snmp_metrics();
}

void SystemMetrics::_install_cpu_metrics(MetricRegistry* registry) {
Expand All @@ -129,6 +139,7 @@ const char* k_ut_stat_path;
const char* k_ut_diskstats_path;
const char* k_ut_net_dev_path;
const char* k_ut_fd_path;
const char* k_ut_net_snmp_path;
#endif

void SystemMetrics::_update_cpu_metrics() {
Expand Down Expand Up @@ -304,6 +315,16 @@ void SystemMetrics::_install_net_metrics(MetricRegistry* registry,
}
}

// Allocates the SnmpMetrics holder and registers each of its counters in
// `registry`. Every counter is registered under the metric name "snmp" and is
// distinguished by a "name" label whose value is the counter's field name
// (e.g. snmp{name="tcp_in_errs"}).
void SystemMetrics::_install_snmp_metrics(MetricRegistry* registry) {
    _snmp_metrics.reset(new SnmpMetrics());
#define REGISTER_SNMP_METRIC(name) \
    registry->register_metric("snmp", \
            MetricLabels().add("name", #name), \
            &_snmp_metrics->name)
    REGISTER_SNMP_METRIC(tcp_in_errs);
    REGISTER_SNMP_METRIC(tcp_retrans_segs);
// The helper macro is only meaningful inside this function; undefine it so it
// does not leak into the rest of the translation unit.
#undef REGISTER_SNMP_METRIC
}

void SystemMetrics::_update_net_metrics() {
#ifdef BE_TEST
// to mock proc
Expand Down Expand Up @@ -399,6 +420,65 @@ void SystemMetrics::_update_net_metrics() {
fclose(fp);
}

// Refreshes the TCP counters held in _snmp_metrics from /proc/net/snmp
// (or from k_ut_net_snmp_path when compiled with BE_TEST).
//
// The file is made of two-line groups per protocol: a header line naming the
// columns, followed by a line holding the values, e.g.
//   Tcp: RtoAlgorithm RtoMin RtoMax MaxConn ActiveOpens PassiveOpens AttemptFails EstabResets CurrEstab InSegs OutSegs RetransSegs InErrs OutRsts InCsumErrors
//   Tcp: 1 200 120000 -1 47849374 38601877 3353843 2320314 276 1033354613 1166025166 825439 12694 23238924 0
// Only RetransSegs (12th value) and InErrs (13th value) are exported.
//
// On any failure (file cannot be opened, Tcp section missing, value line
// unreadable or unparsable) a warning is logged and the metrics are left
// untouched.
void SystemMetrics::_update_snmp_metrics() {
#ifdef BE_TEST
    // to mock proc
    FILE* fp = fopen(k_ut_net_snmp_path, "r");
#else
    FILE* fp = fopen("/proc/net/snmp", "r");
#endif
    if (fp == nullptr) {
        char buf[64];
        LOG(WARNING) << "open /proc/net/snmp failed, errno=" << errno
                     << ", message=" << strerror_r(errno, buf, 64);
        return;
    }

    // We only care about the Tcp section, so consume lines until the first one
    // that starts with "Tcp:" -- that is the Tcp header line. Matching the
    // prefix (rather than the substring "Tcp" anywhere in the line) avoids
    // stopping on an unrelated line that merely mentions Tcp.
    // getline() returns ssize_t, so keep the result in that type.
    ssize_t res = 0;
    while ((res = getline(&_line_ptr, &_line_buf_size, fp)) > 0) {
        if (strncmp(_line_ptr, "Tcp:", 4) == 0) {
            break;
        }
    }
    if (res <= 0) {
        char buf[64];
        LOG(WARNING) << "failed to skip lines of /proc/net/snmp, errno=" << errno
                     << ", message=" << strerror_r(errno, buf, 64);
        fclose(fp);
        return;
    }

    // The loop above stopped on the Tcp header line; this getline() moves past
    // it and loads the Tcp value line into _line_ptr.
    if (getline(&_line_ptr, &_line_buf_size, fp) < 0) {
        char buf[64];
        LOG(WARNING) << "failed to skip Tcp header line of /proc/net/snmp, errno=" << errno
                     << ", message=" << strerror_r(errno, buf, 64);
        fclose(fp);
        return;
    }

    // Skip the first 11 values, then read RetransSegs and InErrs. Columns after
    // InErrs are not needed, so they are not part of the format. SCNd64 is the
    // scanf-family conversion macro for int64_t (PRId64 is for printf only).
    int64_t retrans_segs = 0;
    int64_t in_errs = 0;
    const int parsed = sscanf(_line_ptr,
            "Tcp: %*d %*d %*d %*d %*d %*d %*d %*d %*d %*d %*d"
            " %" SCNd64 " %" SCNd64,
            &retrans_segs, &in_errs);

    // sscanf() reports how many values were actually stored; publish the
    // counters only when both were parsed, so that a malformed line does not
    // overwrite them with zeros or partial data.
    if (parsed == 2) {
        _snmp_metrics->tcp_retrans_segs.set_value(retrans_segs);
        _snmp_metrics->tcp_in_errs.set_value(in_errs);
    } else {
        LOG(WARNING) << "failed to parse Tcp metric line of /proc/net/snmp, line=" << _line_ptr;
    }

    if (ferror(fp) != 0) {
        char buf[64];
        LOG(WARNING) << "getline failed, errno=" << errno
                     << ", message=" << strerror_r(errno, buf, 64);
    }
    fclose(fp);
}

void SystemMetrics::_install_fd_metrics(MetricRegistry* registry) {
_fd_metrics.reset(new FileDescriptorMetrics());
registry->register_metric("fd_num_limit", &_fd_metrics->fd_num_limit);
Expand Down
5 changes: 5 additions & 0 deletions be/src/util/system_metrics.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ class MemoryMetrics;
class DiskMetrics;
class NetMetrics;
class FileDescriptorMetrics;
class SnmpMetrics;

class SystemMetrics {
public:
Expand Down Expand Up @@ -76,6 +77,9 @@ class SystemMetrics {

void _update_fd_metrics();

void _install_snmp_metrics(MetricRegistry* registry);
void _update_snmp_metrics();

private:
static const char* _s_hook_name;

Expand All @@ -85,6 +89,7 @@ class SystemMetrics {
std::map<std::string, NetMetrics*> _net_metrics;
std::unique_ptr<FileDescriptorMetrics> _fd_metrics;
int _proc_net_dev_version = 0;
std::unique_ptr<SnmpMetrics> _snmp_metrics;

char* _line_ptr = nullptr;
size_t _line_buf_size = 0;
Expand Down
12 changes: 12 additions & 0 deletions be/test/util/system_metrics_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,7 @@ extern const char* k_ut_stat_path;
extern const char* k_ut_diskstats_path;
extern const char* k_ut_net_dev_path;
extern const char* k_ut_fd_path;
extern const char* k_ut_net_snmp_path;

TEST_F(SystemMetricsTest, normal) {
MetricRegistry registry("test");
Expand All @@ -104,6 +105,9 @@ TEST_F(SystemMetricsTest, normal) {
std::string fd_path(dir_path);
fd_path += "/test_data/fd_file_nr";
k_ut_fd_path = fd_path.c_str();
std::string net_snmp_path(dir_path);
net_snmp_path += "/test_data/net_snmp_normal";
k_ut_net_snmp_path = net_snmp_path.c_str();

std::set<std::string> disk_devices;
disk_devices.emplace("sda");
Expand Down Expand Up @@ -219,6 +223,14 @@ TEST_F(SystemMetricsTest, normal) {
"fd_num_used");
ASSERT_TRUE(fd_metric != nullptr);
ASSERT_STREQ("19520", fd_metric->to_string().c_str());

// net snmp
Metric* tcp_retrans_segs = registry.get_metric("snmp", MetricLabels().add("name", "tcp_retrans_segs"));
ASSERT_TRUE(tcp_retrans_segs != nullptr);
Metric* tcp_in_errs = registry.get_metric("snmp", MetricLabels().add("name","tcp_in_errs"));
ASSERT_TRUE(tcp_in_errs != nullptr);
ASSERT_STREQ("826271", tcp_retrans_segs->to_string().c_str());
ASSERT_STREQ("12712", tcp_in_errs->to_string().c_str());
}
{
TestMetricsVisitor visitor;
Expand Down
12 changes: 12 additions & 0 deletions be/test/util/test_data/net_snmp_normal
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
Ip: Forwarding DefaultTTL InReceives InHdrErrors InAddrErrors ForwDatagrams InUnknownProtos InDiscards InDelivers OutRequests OutDiscards OutNoRoutes ReasmTimeout ReasmReqds ReasmOKs ReasmFails FragOKs FragFails FragCreates
Ip: 1 64 1049877820 0 0 0 0 0 1049877596 1052780427 0 1317 0 0 0 0 0 0 0
Icmp: InMsgs InErrors InCsumErrors InDestUnreachs InTimeExcds InParmProbs InSrcQuenchs InRedirects InEchos InEchoReps InTimestamps InTimestampReps InAddrMasks InAddrMaskReps OutMsgs OutErrors OutDestUnreachs OutTimeExcds OutParmProbs OutSrcQuenchs OutRedirects OutEchos OutEchoReps OutTimestamps OutTimestampReps OutAddrMasks OutAddrMaskReps
Icmp: 1142563 126992 0 198790 26 0 0 0 153700 790046 1 0 0 0 1174563 0 198734 0 0 0 0 822128 153700 0 1 0 0
IcmpMsg: InType0 InType3 InType8 InType11 InType13 OutType0 OutType3 OutType8 OutType14
IcmpMsg: 790046 198790 153700 26 1 153700 198734 822128 1
Tcp: RtoAlgorithm RtoMin RtoMax MaxConn ActiveOpens PassiveOpens AttemptFails EstabResets CurrEstab InSegs OutSegs RetransSegs InErrs OutRsts InCsumErrors
Tcp: 1 200 120000 -1 47884867 38628916 3356043 2323781 278 1034019111 1166716939 826271 12712 23260066 0
Udp: InDatagrams NoPorts InErrors OutDatagrams RcvbufErrors SndbufErrors InCsumErrors
Udp: 14706122 9772 0 14917947 0 0 0
UdpLite: InDatagrams NoPorts InErrors OutDatagrams RcvbufErrors SndbufErrors InCsumErrors
UdpLite: 0 0 0 0 0 0 0
8 changes: 8 additions & 0 deletions docs/.vuepress/sidebar/en.js
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,14 @@ module.exports = [
"multi-tenant",
"tablet-meta-tool",
"tablet-repair-and-balance",
{
title: "Metrics",
directoryPath: "monitor-metrics/",
children: [
"fe-metrics",
"be-metrics",
],
},
],
sidebarDepth: 2,
},
Expand Down
8 changes: 8 additions & 0 deletions docs/.vuepress/sidebar/zh-CN.js
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,14 @@ module.exports = [
"tablet-meta-tool",
"tablet-repair-and-balance",
"tablet-restore-tool",
{
title: "监控项",
directoryPath: "monitor-metrics/",
children: [
"fe-metrics",
"be-metrics",
],
},
],
sidebarDepth: 2,
},
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
---
{
"title": "BE Metrics",
"language": "en"
}
---

<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->

<!-- Please sort the metrics alphabetically -->

# BE Metrics

This document mainly introduces the monitor metrics of BE.

## View Metrics

BE metrics can be viewed by visiting:

`http://be_host:be_webserver_port/metrics`

The default output is in [Prometheus](https://prometheus.io/) format.

You can get the JSON format by visiting:

`http://be_host:be_webserver_port/metrics?type=agent`

## Metrics List

### `doris_be_snmp{name="tcp_in_errs"}`

Value of the `Tcp: InErrs` field in `/proc/net/snmp`. Represents the cumulative number of TCP packets received in error.

The incidence rate can be calculated in combination with the sampling period.

Usually used to troubleshoot network problems.

### `doris_be_snmp{name="tcp_retrans_segs"}`

Value of the `Tcp: RetransSegs` field in `/proc/net/snmp`. Represents the cumulative number of TCP segments retransmitted.

The incidence rate can be calculated in combination with the sampling period.

Usually used to troubleshoot network problems.
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
---
{
"title": "FE Metrics",
"language": "en"
}
---

<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->

<!-- Please sort the metrics alphabetically -->

# FE Metrics

This document mainly introduces the monitor metrics of FE.

## View Metrics

FE metrics can be viewed by visiting:

`http://fe_host:fe_http_port/metrics`

The default output is in [Prometheus](https://prometheus.io/) format.

You can get the JSON format by visiting:

`http://fe_host:fe_http_port/metrics?type=agent`

## Metrics List

### `doris_fe_snmp{name="tcp_in_errs"}`

Value of the `Tcp: InErrs` field in `/proc/net/snmp`. Represents the cumulative number of TCP packets received in error.

The incidence rate can be calculated in combination with the sampling period.

Usually used to troubleshoot network problems.

### `doris_fe_snmp{name="tcp_retrans_segs"}`

Value of the `Tcp: RetransSegs` field in `/proc/net/snmp`. Represents the cumulative number of TCP segments retransmitted.

The incidence rate can be calculated in combination with the sampling period.

Usually used to troubleshoot network problems.
Loading