-
Notifications
You must be signed in to change notification settings - Fork 42
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Summary: Pull Request resolved: #320. Add RDMA / InfiniBand metrics. This PR exports the key modules but does not enable them in the main module yet. * Ethtool counters. * Sysfs-based RDMA counters. Full descriptions of the added metrics will follow in an upcoming PR. Reviewed By: sanrise Differential Revision: D64711853 fbshipit-source-id: bd6a04099464b18d07832a8f1d63aad27bc97eba
- Loading branch information
1 parent
503f396
commit c72f39a
Showing
13 changed files
with
717 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
# Copyright (c) Meta Platforms, Inc. and affiliates. | ||
|
||
# Build every header and source file found in this directory into the
# rdmamon library and expose its dependencies to consumers (PUBLIC linkage).
file(GLOB dynolog_rdmamon_files "*.h" "*.cpp")
add_library(dynolog_rdmamon_lib ${dynolog_rdmamon_files})
target_link_libraries(
  dynolog_rdmamon_lib
  PUBLIC gflags::gflags glog::glog fmt::fmt)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,66 @@ | ||
// Copyright (c) Meta Platforms, Inc. and affiliates. | ||
|
||
// This source code is licensed under the MIT license found in the | ||
// LICENSE file in the root directory of this source tree. | ||
|
||
#include <fmt/format.h> | ||
#include <gflags/gflags.h> | ||
|
||
#include "dynolog/src/rdmamon/EthtoolCounters.h" | ||
|
||
#ifdef FBCODE | ||
#include "secure_lib/secure_string.h" | ||
#endif // FBCODE | ||
|
||
namespace dynolog { | ||
namespace rdmamon { | ||
|
||
bool EthtoolCounters::setupEthtoolCounters() { | ||
const std::vector<std::string> eth_counter_names_ = { | ||
"tx_pause_ctrl_phy", | ||
"tx_prio0_pause", | ||
"tx_prio1_pause", | ||
"tx_prio2_pause", | ||
"tx_prio3_pause", | ||
"tx_prio4_pause", | ||
"tx_prio5_pause", | ||
"tx_prio6_pause", | ||
"tx_prio7_pause", | ||
"tx_pause_storm_warning_events", | ||
"tx_pause_storm_error_events", | ||
}; | ||
return setup_ethtool_counters(eth_counter_names_); | ||
} | ||
|
||
bool EthtoolCounters::sampleEthtoolCounters( | ||
std::map<std::string, int64_t>& countersMap) { | ||
if (!get_current_ethtool_counters()) { | ||
return false; | ||
} | ||
|
||
if (!first_sample_) { | ||
for (auto it = eth_counters_.begin(); it != eth_counters_.end(); it++) { | ||
int64_t diff = | ||
cur_eth_stats_->data[it->second] - prev_eth_stats_->data[it->second]; | ||
const auto key = fmt::format("{}.{}", ifname_, it->first); | ||
countersMap[key] = diff; | ||
} | ||
} | ||
first_sample_ = false; | ||
size_t copy_sz = | ||
(gstrings_->len * sizeof(uint64_t)) + sizeof(struct ethtool_stats); | ||
#ifdef FBCODE | ||
if (try_checked_memcpy(prev_eth_stats_, stats_sz_, cur_eth_stats_, copy_sz) != | ||
0) { | ||
LOG_EVERY_N(WARNING, 100) | ||
<< "Uanble to copy current stats due to insufficient space"; | ||
return false; | ||
} | ||
#else | ||
memcpy(prev_eth_stats_, cur_eth_stats_, copy_sz); | ||
#endif // FBCODE | ||
return true; | ||
} | ||
|
||
} // namespace rdmamon | ||
} // namespace dynolog |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,174 @@ | ||
// Copyright (c) Meta Platforms, Inc. and affiliates. | ||
|
||
// This source code is licensed under the MIT license found in the | ||
// LICENSE file in the root directory of this source tree. | ||
|
||
#pragma once | ||
|
||
#include <glog/logging.h> | ||
#include <linux/ethtool.h> | ||
#include <linux/sockios.h> | ||
#include <net/if.h> | ||
#include <netdb.h> | ||
#include <sys/ioctl.h> | ||
#include <sys/stat.h> | ||
#include <unistd.h> | ||
#include <cstring> | ||
#include <map> | ||
#include <string> | ||
|
||
// @lint-ignore-every CLANGTIDY facebook-hte-BadCall-strerror | ||
|
||
namespace dynolog { | ||
namespace rdmamon { | ||
|
||
class EthtoolCounters { | ||
public: | ||
explicit EthtoolCounters(const std::string& ifname) : ifname_(ifname) {} | ||
virtual ~EthtoolCounters() { | ||
teardown_ethtool_counters(); | ||
} | ||
|
||
bool setupEthtoolCounters(); | ||
bool sampleEthtoolCounters(std::map<std::string, int64_t>& rdmaCounterMap); | ||
|
||
private: | ||
std::string ifname_; | ||
int ioctl_sock_fd_; | ||
struct ethtool_gstrings* gstrings_ = nullptr; | ||
struct ethtool_stats *cur_eth_stats_ = nullptr, *prev_eth_stats_ = nullptr; | ||
size_t stats_sz_ = 0; | ||
struct ifreq ifr_; | ||
bool first_sample_ = true; | ||
|
||
std::map<std::string, int> eth_counters_; | ||
|
||
[[nodiscard]] bool open_ioctl_socket() { | ||
ioctl_sock_fd_ = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP); | ||
if (ioctl_sock_fd_ < 0) { | ||
LOG(ERROR) << "Unable to create socket (" << std::strerror(errno) << ")"; | ||
return false; | ||
} | ||
return true; | ||
} | ||
|
||
void close_ioctl_socket() { | ||
if (ioctl_sock_fd_ >= 0) { | ||
close(ioctl_sock_fd_); | ||
ioctl_sock_fd_ = -1; | ||
} | ||
} | ||
|
||
[[nodiscard]] bool setup_ethtool_counters( | ||
const std::vector<std::string>& eth_counter_names_) { | ||
struct { | ||
struct ethtool_sset_info hdr; | ||
uint32_t buf[1]; | ||
} ss_stats; | ||
uint32_t ss_stats_len; | ||
|
||
if (!open_ioctl_socket()) { | ||
return false; | ||
} | ||
|
||
// Fetch how many stats will be returned | ||
ss_stats.hdr.cmd = ETHTOOL_GSSET_INFO; | ||
ss_stats.hdr.reserved = 0; | ||
ss_stats.hdr.sset_mask = 1ULL << ETH_SS_STATS; | ||
memset(&ifr_, 0, sizeof(ifr_)); | ||
strncpy(ifr_.ifr_name, ifname_.c_str(), sizeof(ifr_.ifr_name)); | ||
ifr_.ifr_data = (char*)&ss_stats; | ||
if (ioctl(ioctl_sock_fd_, SIOCETHTOOL, &ifr_)) { | ||
LOG(ERROR) << "IOCTL error for ETHTOOL_GSSET_INFO (" | ||
<< std::strerror(errno) << ")"; | ||
return false; | ||
} | ||
ss_stats_len = ss_stats.hdr.sset_mask ? ss_stats.hdr.data[0] : 0; | ||
if (ss_stats_len < 1) { | ||
LOG(INFO) << "Cannot retrieve the stats information"; | ||
return false; | ||
} | ||
size_t gstrings_size = | ||
sizeof(*gstrings_) + (ss_stats_len * ETH_GSTRING_LEN); | ||
gstrings_ = (struct ethtool_gstrings*)calloc(1, gstrings_size); | ||
if (!gstrings_) { | ||
LOG(ERROR) << "Unable to allocate " << gstrings_size | ||
<< " bytes for gstrings"; | ||
return false; | ||
} | ||
|
||
// Fetch the strings for each stats | ||
gstrings_->cmd = ETHTOOL_GSTRINGS; | ||
gstrings_->string_set = ETH_SS_STATS; | ||
gstrings_->len = ss_stats_len; | ||
memset(&ifr_.ifr_data, 0, sizeof(ifr_.ifr_data)); | ||
ifr_.ifr_data = (char*)gstrings_; | ||
if (ioctl(ioctl_sock_fd_, SIOCETHTOOL, &ifr_)) { | ||
LOG(ERROR) << "IOCTL error for ETHTOOL_GSTRINGS (" << std::strerror(errno) | ||
<< ")"; | ||
return false; | ||
} | ||
memset(&ifr_.ifr_data, 0, sizeof(ifr_.ifr_data)); | ||
|
||
/* Allocate the memory for stats */ | ||
stats_sz_ = | ||
(gstrings_->len * sizeof(uint64_t)) + sizeof(struct ethtool_stats); | ||
cur_eth_stats_ = (struct ethtool_stats*)calloc(1, stats_sz_); | ||
prev_eth_stats_ = (struct ethtool_stats*)calloc(1, stats_sz_); | ||
if (!cur_eth_stats_ || !prev_eth_stats_) { | ||
LOG(ERROR) << "Unable to allocate " << stats_sz_ | ||
<< " bytes of memory for eth_stats"; | ||
return false; | ||
} | ||
cur_eth_stats_->cmd = ETHTOOL_GSTATS; | ||
cur_eth_stats_->n_stats = gstrings_->len; | ||
ifr_.ifr_data = (char*)cur_eth_stats_; | ||
|
||
for (auto eth_counter_name : eth_counter_names_) { | ||
for (int j = 0; j < gstrings_->len; j++) { | ||
if (0 == | ||
memcmp( | ||
(void*)eth_counter_name.c_str(), | ||
(void*)(&gstrings_->data[j * ETH_GSTRING_LEN]), | ||
strlen(eth_counter_name.c_str()))) { | ||
eth_counters_[eth_counter_name] = j; | ||
} | ||
} | ||
} | ||
|
||
return true; | ||
} | ||
|
||
void teardown_ethtool_counters() { | ||
eth_counters_.clear(); | ||
|
||
if (gstrings_) { | ||
free(gstrings_); | ||
gstrings_ = nullptr; | ||
} | ||
|
||
if (cur_eth_stats_) { | ||
free(cur_eth_stats_); | ||
cur_eth_stats_ = nullptr; | ||
} | ||
|
||
if (prev_eth_stats_) { | ||
free(prev_eth_stats_); | ||
prev_eth_stats_ = nullptr; | ||
} | ||
|
||
close_ioctl_socket(); | ||
} | ||
|
||
bool get_current_ethtool_counters() { | ||
if (ioctl(ioctl_sock_fd_, SIOCETHTOOL, &ifr_)) { | ||
LOG_EVERY_N(ERROR, 10) << "IOCTL error while getting ethtool counters (" | ||
<< std::strerror(errno) << ")"; | ||
return false; | ||
} | ||
return true; | ||
} | ||
}; | ||
|
||
} // namespace rdmamon | ||
} // namespace dynolog |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,74 @@ | ||
// Copyright (c) Meta Platforms, Inc. and affiliates. | ||
|
||
// This source code is licensed under the MIT license found in the | ||
// LICENSE file in the root directory of this source tree. | ||
|
||
#include "dynolog/src/rdmamon/RdmaCounters.h" | ||
#include <fmt/format.h> | ||
#include <cstdint> | ||
#include <map> | ||
|
||
namespace dynolog { | ||
namespace rdmamon { | ||
|
||
bool RdmaCounters::setupRdmaCounters() { | ||
const std::vector<std::string> rdma_port_counters_ = { | ||
"port_xmit_data", | ||
"port_xmit_packets", | ||
"port_xmit_discards", | ||
"port_rcv_data", | ||
"port_rcv_packets", | ||
"port_rcv_errors", | ||
}; | ||
|
||
const std::vector<std::string> rdma_hw_counters_ = { | ||
"np_cnp_sent", | ||
"rp_cnp_handled", | ||
"np_ecn_marked_roce_packets", | ||
"rx_atomic_requests", | ||
"rx_read_requests", | ||
"rx_write_requests", | ||
}; | ||
|
||
return ( | ||
init_rdma_counters_(rdma_port_counter_path_, rdma_port_counters_) && | ||
init_rdma_counters_(rdma_hw_counter_path_, rdma_hw_counters_)); | ||
} | ||
|
||
bool RdmaCounters::sampleRdmaCounters( | ||
std::map<std::string, int64_t>& rdmaCountersMap) { | ||
for (auto& rdma_counter : rdma_counters_) { | ||
auto sysfs_counter = std::move(rdma_counter->sysfs_counter); | ||
uint64_t prev_val = rdma_counter->prev; | ||
auto val = sysfs_counter->getSysfsCounter(); | ||
uint64_t cur_val = (val) ? *val : prev_val; | ||
if (!first_sample_) { | ||
uint64_t diff = cur_val - prev_val; | ||
DLOG(INFO) << sysfs_counter->getSysfsCounterName() | ||
<< ": will return report value " << diff; | ||
if (diff < 0) { | ||
LOG(ERROR) << sysfs_counter->getSysfsCounterName() | ||
<< ": current counter value " << cur_val | ||
<< " is lower than previous counter value " << prev_val | ||
<< " thus giving negative delta " << diff; | ||
diff = 0; | ||
} else { | ||
const auto key = fmt::format( | ||
"{}.{}", | ||
std::string(ifname_), | ||
sysfs_counter->getSysfsCounterName()); | ||
rdmaCountersMap[key] = diff; | ||
DLOG(INFO) << "Value stored in map: " << rdmaCountersMap[key]; | ||
} | ||
} | ||
prev_val = cur_val; | ||
rdma_counter->sysfs_counter = std::move(sysfs_counter); | ||
rdma_counter->prev = prev_val; | ||
rdma_counter->cur = cur_val; | ||
} | ||
first_sample_ = false; | ||
return true; | ||
} | ||
|
||
} // namespace rdmamon | ||
} // namespace dynolog |
Oops, something went wrong.