Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Raise loop and chunk limit for newer kernels #1795

Merged
merged 4 commits into from
Jan 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@
#include "src/stirling/upid/upid.h"

// This keeps instruction count below BPF's limit of 4096 per probe.
#define LOOP_LIMIT 42
#define LOOP_LIMIT BPF_LOOP_LIMIT
#define PROTOCOL_VEC_LIMIT 3

const int32_t kInvalidFD = -1;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@ struct close_event_t {
// This defines how many chunks a perf_submit can support.
// This applies to messages that are over MAX_MSG_SIZE,
// and effectively makes the maximum message size to be CHUNK_LIMIT*MAX_MSG_SIZE.
#define CHUNK_LIMIT 4
#define CHUNK_LIMIT BPF_CHUNK_LIMIT

// Unique ID to all syscalls and a few other notable functions.
// This applies to events sent to user-space.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@

#include "src/common/fs/temp_file.h"
#include "src/common/system/clock.h"
#include "src/common/system/kernel_version.h"
#include "src/common/system/tcp_socket.h"
#include "src/common/system/udp_socket.h"
#include "src/common/system/unix_socket.h"
Expand Down Expand Up @@ -490,9 +491,9 @@ TEST_F(SocketTraceBPFTest, LargeMessages) {
std::string large_response =
"HTTP/1.1 200 OK\r\n"
"Content-Type: application/json; msg2\r\n"
"Content-Length: 131072\r\n"
"Content-Length: 3512768\r\n"
"\r\n";
large_response += std::string(131072, '+');
large_response += std::string(3512768, '+');

testing::SendRecvScript script({
{{kHTTPReqMsg1}, {large_response}},
Expand All @@ -507,19 +508,29 @@ TEST_F(SocketTraceBPFTest, LargeMessages) {
GetMutableConnTracker(system.ClientPID(), system.ClientFD()));
EXPECT_EQ(client_tracker->send_data().data_buffer().Head(), kHTTPReqMsg1);
std::string client_recv_data(client_tracker->recv_data().data_buffer().Head());
EXPECT_THAT(client_recv_data.size(), 131153);
EXPECT_THAT(client_recv_data.size(), 3512850);
EXPECT_THAT(client_recv_data, HasSubstr("+++++"));
EXPECT_EQ(client_recv_data.substr(client_recv_data.size() - 5, 5), "+++++");

// The server's send syscall transmits all 131153 bytes in one shot.
// The server's send syscall transmits all 3512850 bytes in one shot.
// This is over the limit that we can transmit through BPF, and so we expect
// filler bytes on this side of the connection. Note that the client doesn't have the
// filler bytes on this side of the connection (up to 1MB). Note that the client doesn't have the
// same behavior, because the recv syscall provides the data in chunks.
ASSERT_OK_AND_ASSIGN(auto* server_tracker,
GetMutableConnTracker(system.ServerPID(), system.ServerFD()));
EXPECT_EQ(server_tracker->recv_data().data_buffer().Head(), kHTTPReqMsg1);
std::string server_send_data(server_tracker->send_data().data_buffer().Head());
EXPECT_THAT(server_send_data.size(), 131153);
auto kernel = system::GetCachedKernelVersion();
bool kernelGreaterThan5_1 = kernel.version >= 5 || (kernel.version == 5 && kernel.major_rev >= 1);
if (kernelGreaterThan5_1) {
// CHUNK_LIMIT * MAX_MSG_SIZE bytes + up to 1MB filler.
// Currently 84*30720 + 932,370 filler bytes = 3,512,850
EXPECT_THAT(server_send_data.size(), 3512850);
} else {
// CHUNK_LIMIT * MAX_MSG_SIZE bytes + up to 1MB filler.
// Currently 4*30720 + 1024*1024 = 1,171,456, leaving gap of 2,341,394 bytes
EXPECT_THAT(server_send_data.size(), 1171456);
}
EXPECT_THAT(server_send_data, HasSubstr("+++++"));
// We expect filling with \0 bytes.
EXPECT_EQ(server_send_data.substr(server_send_data.size() - 5, 5), ConstStringView("\0\0\0\0\0"));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
#include <google/protobuf/text_format.h>
#include <google/protobuf/util/delimited_message_util.h>
#include <magic_enum.hpp>
#include "src/common/system/kernel_version.h"

#include "src/common/base/base.h"
#include "src/common/base/utils.h"
Expand Down Expand Up @@ -175,6 +176,15 @@ DEFINE_bool(
stirling_debug_tls_sources, gflags::BoolFromEnv("PX_DEBUG_TLS_SOURCES", false),
"If true, stirling will add additional prometheus metrics regarding the traced tls sources");

DEFINE_uint32(stirling_bpf_loop_limit, 42,
"The maximum number of iovecs to capture for syscalls. "
"Set conservatively for older kernels by default to keep the instruction count below "
"BPF's limit for version 4 kernels (4096 per probe).");

DEFINE_uint32(stirling_bpf_chunk_limit, 4,
"The maximum number of chunks a perf_submit can support. "
"This applies to messages that are over MAX_MSG_SIZE.");

OBJ_STRVIEW(socket_trace_bcc_script, socket_trace);

namespace px {
Expand Down Expand Up @@ -431,6 +441,18 @@ auto SocketTraceConnector::InitPerfBufferSpecs() {
}

Status SocketTraceConnector::InitBPF() {
// set BPF loop limit and chunk limit based on kernel version
auto kernel = system::GetCachedKernelVersion();
if (kernel.version >= 5 || (kernel.version == 5 && kernel.major_rev >= 1)) {
// Kernels >= 5.1 have higher BPF instruction limits (1 million for verifier).
// This enables a 21x increase to our loop and chunk limits
FLAGS_stirling_bpf_loop_limit = 882;
FLAGS_stirling_bpf_chunk_limit = 84;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How did you pick these numbers? How close are we to the instruction limit with these numbers?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I manually raised the loop/chunk limits to see when the verifier would throw an error and chose values just below that upper bound, which is around 22x our previous limits.

bpf: Argument list too long. Program too large (... insns), at most 4096 insns

Confusingly, the bpf(2) syscall returns the same error whether the program size or its complexity exceeds the limits. See stackoverflow

LOG(INFO) << absl::Substitute(
"Kernel version greater than V5.1 detected ($0), raised loop limit to $1 and chunk limit "
"to $2",
kernel.ToString(), FLAGS_stirling_bpf_loop_limit, FLAGS_stirling_bpf_chunk_limit);
}
// PROTOCOL_LIST: Requires update on new protocols.
std::vector<std::string> defines = {
absl::StrCat("-DENABLE_TLS_DEBUG_SOURCES=", FLAGS_stirling_debug_tls_sources),
Expand All @@ -445,6 +467,8 @@ Status SocketTraceConnector::InitBPF() {
absl::StrCat("-DENABLE_NATS_TRACING=", protocol_transfer_specs_[kProtocolNATS].enabled),
absl::StrCat("-DENABLE_AMQP_TRACING=", protocol_transfer_specs_[kProtocolAMQP].enabled),
absl::StrCat("-DENABLE_MONGO_TRACING=", protocol_transfer_specs_[kProtocolMongo].enabled),
absl::StrCat("-DBPF_LOOP_LIMIT=", FLAGS_stirling_bpf_loop_limit),
absl::StrCat("-DBPF_CHUNK_LIMIT=", FLAGS_stirling_bpf_chunk_limit),
};
PX_RETURN_IF_ERROR(bcc_->InitBPFProgram(socket_trace_bcc_script, defines));

Expand Down
Loading