Skip to content

Commit

Permalink
Add rdma metrics monitoring (#320)
Browse files Browse the repository at this point in the history
Summary:
Pull Request resolved: #320

Add RDMA / InfiniBand metrics. This PR adds the key modules but does not enable them in the main module yet.
* Ethtool counters.
* SysFs based RDMA counters.

Full descriptions of the added metrics will be added in an upcoming PR.

Reviewed By: sanrise

Differential Revision: D64711853

fbshipit-source-id: bd6a04099464b18d07832a8f1d63aad27bc97eba
  • Loading branch information
briancoutinho authored and facebook-github-bot committed Oct 31, 2024
1 parent 503f396 commit c72f39a
Show file tree
Hide file tree
Showing 13 changed files with 717 additions and 1 deletion.
3 changes: 3 additions & 0 deletions dynolog/src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,9 @@ target_link_libraries(dynolog_lib PUBLIC dynolog_ipcmonitor_lib)
add_subdirectory(gpumon)
target_link_libraries(dynolog_lib PUBLIC dynolog_dcgm_lib "-ldl")

add_subdirectory(rdmamon)
target_link_libraries(dynolog_lib PUBLIC dynolog_rdmamon_lib)

add_subdirectory(metric_frame)

add_executable(dynolog Main.cpp)
Expand Down
2 changes: 1 addition & 1 deletion dynolog/src/Main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
// This source code is licensed under the MIT license found in the
// LICENSE file in the root directory of this source tree.

// Dynomin : A portable telemetry monitoring daemon.
// Dynolog : A portable telemetry monitoring daemon.

#include <gflags/gflags.h>
#include <glog/logging.h>
Expand Down
7 changes: 7 additions & 0 deletions dynolog/src/rdmamon/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.

# Collect every header and source file that sits directly in this directory.
file (GLOB dynolog_rdmamon_files "*.h" "*.cpp")
# Build the collected files into the dynolog_rdmamon_lib library target.
add_library(dynolog_rdmamon_lib ${dynolog_rdmamon_files})
# PUBLIC dependencies: they are also propagated to targets that link against
# dynolog_rdmamon_lib.
target_link_libraries(dynolog_rdmamon_lib PUBLIC gflags::gflags)
target_link_libraries(dynolog_rdmamon_lib PUBLIC glog::glog)
target_link_libraries(dynolog_rdmamon_lib PUBLIC fmt::fmt)
66 changes: 66 additions & 0 deletions dynolog/src/rdmamon/EthtoolCounters.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
// Copyright (c) Meta Platforms, Inc. and affiliates.

// This source code is licensed under the MIT license found in the
// LICENSE file in the root directory of this source tree.

#include <fmt/format.h>
#include <gflags/gflags.h>

#include "dynolog/src/rdmamon/EthtoolCounters.h"

#ifdef FBCODE
#include "secure_lib/secure_string.h"
#endif // FBCODE

namespace dynolog {
namespace rdmamon {

// Registers the fixed list of pause-related ethtool statistics this monitor
// tracks and delegates the actual setup to setup_ethtool_counters().
// Returns whatever setup_ethtool_counters() returns.
bool EthtoolCounters::setupEthtoolCounters() {
  const std::vector<std::string> trackedStatNames{
      // Global pause counter.
      "tx_pause_ctrl_phy",
      // Per-priority pause counters, priorities 0 through 7.
      "tx_prio0_pause",
      "tx_prio1_pause",
      "tx_prio2_pause",
      "tx_prio3_pause",
      "tx_prio4_pause",
      "tx_prio5_pause",
      "tx_prio6_pause",
      "tx_prio7_pause",
      // Pause storm events.
      "tx_pause_storm_warning_events",
      "tx_pause_storm_error_events",
  };

  const bool ready = setup_ethtool_counters(trackedStatNames);
  return ready;
}

bool EthtoolCounters::sampleEthtoolCounters(
std::map<std::string, int64_t>& countersMap) {
if (!get_current_ethtool_counters()) {
return false;
}

if (!first_sample_) {
for (auto it = eth_counters_.begin(); it != eth_counters_.end(); it++) {
int64_t diff =
cur_eth_stats_->data[it->second] - prev_eth_stats_->data[it->second];
const auto key = fmt::format("{}.{}", ifname_, it->first);
countersMap[key] = diff;
}
}
first_sample_ = false;
size_t copy_sz =
(gstrings_->len * sizeof(uint64_t)) + sizeof(struct ethtool_stats);
#ifdef FBCODE
if (try_checked_memcpy(prev_eth_stats_, stats_sz_, cur_eth_stats_, copy_sz) !=
0) {
LOG_EVERY_N(WARNING, 100)
<< "Uanble to copy current stats due to insufficient space";
return false;
}
#else
memcpy(prev_eth_stats_, cur_eth_stats_, copy_sz);
#endif // FBCODE
return true;
}

} // namespace rdmamon
} // namespace dynolog
174 changes: 174 additions & 0 deletions dynolog/src/rdmamon/EthtoolCounters.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,174 @@
// Copyright (c) Meta Platforms, Inc. and affiliates.

// This source code is licensed under the MIT license found in the
// LICENSE file in the root directory of this source tree.

#pragma once

#include <glog/logging.h>
#include <linux/ethtool.h>
#include <linux/sockios.h>
#include <net/if.h>
#include <netdb.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <sys/stat.h>
#include <unistd.h>
#include <cerrno>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <map>
#include <string>
#include <vector>

// @lint-ignore-every CLANGTIDY facebook-hte-BadCall-strerror

namespace dynolog {
namespace rdmamon {

class EthtoolCounters {
public:
explicit EthtoolCounters(const std::string& ifname) : ifname_(ifname) {}
virtual ~EthtoolCounters() {
teardown_ethtool_counters();
}

bool setupEthtoolCounters();
bool sampleEthtoolCounters(std::map<std::string, int64_t>& rdmaCounterMap);

private:
std::string ifname_;
int ioctl_sock_fd_;
struct ethtool_gstrings* gstrings_ = nullptr;
struct ethtool_stats *cur_eth_stats_ = nullptr, *prev_eth_stats_ = nullptr;
size_t stats_sz_ = 0;
struct ifreq ifr_;
bool first_sample_ = true;

std::map<std::string, int> eth_counters_;

[[nodiscard]] bool open_ioctl_socket() {
ioctl_sock_fd_ = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP);
if (ioctl_sock_fd_ < 0) {
LOG(ERROR) << "Unable to create socket (" << std::strerror(errno) << ")";
return false;
}
return true;
}

void close_ioctl_socket() {
if (ioctl_sock_fd_ >= 0) {
close(ioctl_sock_fd_);
ioctl_sock_fd_ = -1;
}
}

[[nodiscard]] bool setup_ethtool_counters(
const std::vector<std::string>& eth_counter_names_) {
struct {
struct ethtool_sset_info hdr;
uint32_t buf[1];
} ss_stats;
uint32_t ss_stats_len;

if (!open_ioctl_socket()) {
return false;
}

// Fetch how many stats will be returned
ss_stats.hdr.cmd = ETHTOOL_GSSET_INFO;
ss_stats.hdr.reserved = 0;
ss_stats.hdr.sset_mask = 1ULL << ETH_SS_STATS;
memset(&ifr_, 0, sizeof(ifr_));
strncpy(ifr_.ifr_name, ifname_.c_str(), sizeof(ifr_.ifr_name));
ifr_.ifr_data = (char*)&ss_stats;
if (ioctl(ioctl_sock_fd_, SIOCETHTOOL, &ifr_)) {
LOG(ERROR) << "IOCTL error for ETHTOOL_GSSET_INFO ("
<< std::strerror(errno) << ")";
return false;
}
ss_stats_len = ss_stats.hdr.sset_mask ? ss_stats.hdr.data[0] : 0;
if (ss_stats_len < 1) {
LOG(INFO) << "Cannot retrieve the stats information";
return false;
}
size_t gstrings_size =
sizeof(*gstrings_) + (ss_stats_len * ETH_GSTRING_LEN);
gstrings_ = (struct ethtool_gstrings*)calloc(1, gstrings_size);
if (!gstrings_) {
LOG(ERROR) << "Unable to allocate " << gstrings_size
<< " bytes for gstrings";
return false;
}

// Fetch the strings for each stats
gstrings_->cmd = ETHTOOL_GSTRINGS;
gstrings_->string_set = ETH_SS_STATS;
gstrings_->len = ss_stats_len;
memset(&ifr_.ifr_data, 0, sizeof(ifr_.ifr_data));
ifr_.ifr_data = (char*)gstrings_;
if (ioctl(ioctl_sock_fd_, SIOCETHTOOL, &ifr_)) {
LOG(ERROR) << "IOCTL error for ETHTOOL_GSTRINGS (" << std::strerror(errno)
<< ")";
return false;
}
memset(&ifr_.ifr_data, 0, sizeof(ifr_.ifr_data));

/* Allocate the memory for stats */
stats_sz_ =
(gstrings_->len * sizeof(uint64_t)) + sizeof(struct ethtool_stats);
cur_eth_stats_ = (struct ethtool_stats*)calloc(1, stats_sz_);
prev_eth_stats_ = (struct ethtool_stats*)calloc(1, stats_sz_);
if (!cur_eth_stats_ || !prev_eth_stats_) {
LOG(ERROR) << "Unable to allocate " << stats_sz_
<< " bytes of memory for eth_stats";
return false;
}
cur_eth_stats_->cmd = ETHTOOL_GSTATS;
cur_eth_stats_->n_stats = gstrings_->len;
ifr_.ifr_data = (char*)cur_eth_stats_;

for (auto eth_counter_name : eth_counter_names_) {
for (int j = 0; j < gstrings_->len; j++) {
if (0 ==
memcmp(
(void*)eth_counter_name.c_str(),
(void*)(&gstrings_->data[j * ETH_GSTRING_LEN]),
strlen(eth_counter_name.c_str()))) {
eth_counters_[eth_counter_name] = j;
}
}
}

return true;
}

void teardown_ethtool_counters() {
eth_counters_.clear();

if (gstrings_) {
free(gstrings_);
gstrings_ = nullptr;
}

if (cur_eth_stats_) {
free(cur_eth_stats_);
cur_eth_stats_ = nullptr;
}

if (prev_eth_stats_) {
free(prev_eth_stats_);
prev_eth_stats_ = nullptr;
}

close_ioctl_socket();
}

bool get_current_ethtool_counters() {
if (ioctl(ioctl_sock_fd_, SIOCETHTOOL, &ifr_)) {
LOG_EVERY_N(ERROR, 10) << "IOCTL error while getting ethtool counters ("
<< std::strerror(errno) << ")";
return false;
}
return true;
}
};

} // namespace rdmamon
} // namespace dynolog
74 changes: 74 additions & 0 deletions dynolog/src/rdmamon/RdmaCounters.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
// Copyright (c) Meta Platforms, Inc. and affiliates.

// This source code is licensed under the MIT license found in the
// LICENSE file in the root directory of this source tree.

#include "dynolog/src/rdmamon/RdmaCounters.h"
#include <fmt/format.h>
#include <cstdint>
#include <map>

namespace dynolog {
namespace rdmamon {

bool RdmaCounters::setupRdmaCounters() {
const std::vector<std::string> rdma_port_counters_ = {
"port_xmit_data",
"port_xmit_packets",
"port_xmit_discards",
"port_rcv_data",
"port_rcv_packets",
"port_rcv_errors",
};

const std::vector<std::string> rdma_hw_counters_ = {
"np_cnp_sent",
"rp_cnp_handled",
"np_ecn_marked_roce_packets",
"rx_atomic_requests",
"rx_read_requests",
"rx_write_requests",
};

return (
init_rdma_counters_(rdma_port_counter_path_, rdma_port_counters_) &&
init_rdma_counters_(rdma_hw_counter_path_, rdma_hw_counters_));
}

// Reads every registered sysfs counter and writes the change since the
// previous sample into `rdmaCountersMap` under "<ifname>.<counter name>".
//
// - The first call only records baselines; nothing is added to the map.
// - When a counter cannot be read (getSysfsCounter() yields no value) the
//   previous value is reused, so the reported delta is 0.
// - When a counter went backwards, an error is logged and no entry is
//   written for it; the new value becomes the baseline.
//
// Always returns true.
bool RdmaCounters::sampleRdmaCounters(
    std::map<std::string, int64_t>& rdmaCountersMap) {
  for (auto& rdma_counter : rdma_counters_) {
    auto& sysfs_counter = rdma_counter->sysfs_counter;
    const uint64_t prev_val = rdma_counter->prev;
    const auto val = sysfs_counter->getSysfsCounter();
    const uint64_t cur_val = val ? *val : prev_val;

    if (!first_sample_) {
      // The values are unsigned, so "negative delta" has to be detected by
      // comparing them; an unsigned difference can never be < 0.
      if (cur_val < prev_val) {
        LOG(ERROR) << sysfs_counter->getSysfsCounterName()
                   << ": current counter value " << cur_val
                   << " is lower than previous counter value " << prev_val
                   << " thus giving negative delta -" << (prev_val - cur_val);
      } else {
        const uint64_t diff = cur_val - prev_val;
        DLOG(INFO) << sysfs_counter->getSysfsCounterName()
                   << ": will return report value " << diff;
        const auto key = fmt::format(
            "{}.{}",
            std::string(ifname_),
            sysfs_counter->getSysfsCounterName());
        rdmaCountersMap[key] = static_cast<int64_t>(diff);
        DLOG(INFO) << "Value stored in map: " << rdmaCountersMap[key];
      }
    }

    rdma_counter->prev = cur_val;
    rdma_counter->cur = cur_val;
  }
  first_sample_ = false;
  return true;
}

} // namespace rdmamon
} // namespace dynolog
Loading

0 comments on commit c72f39a

Please sign in to comment.