pixie-io · JamesMBartlett · Jan 23, 2024 · Nov 30, 2023 · Dec 7, 2023 · Dec 7, 2023
@@ -35,7 +35,7 @@
 #include "src/stirling/upid/upid.h"
 
 // This keeps instruction count below BPF's limit of 4096 per probe.
-#define LOOP_LIMIT 42
+#define LOOP_LIMIT BPF_LOOP_LIMIT
 #define PROTOCOL_VEC_LIMIT 3
 
 const int32_t kInvalidFD = -1;

@@ -128,7 +128,7 @@ struct close_event_t {
 // This defines how many chunks a perf_submit can support.
 // This applies to messages that are over MAX_MSG_SIZE,
 // and effectively makes the maximum message size to be CHUNK_LIMIT*MAX_MSG_SIZE.
-#define CHUNK_LIMIT 4
+#define CHUNK_LIMIT BPF_CHUNK_LIMIT
 
 // Unique ID to all syscalls and a few other notable functions.
 // This applies to events sent to user-space.

@@ -28,6 +28,7 @@
 
 #include "src/common/fs/temp_file.h"
 #include "src/common/system/clock.h"
+#include "src/common/system/kernel_version.h"
 #include "src/common/system/tcp_socket.h"
 #include "src/common/system/udp_socket.h"
 #include "src/common/system/unix_socket.h"
@@ -490,9 +491,9 @@ TEST_F(SocketTraceBPFTest, LargeMessages) {
   std::string large_response =
       "HTTP/1.1 200 OK\r\n"
       "Content-Type: application/json; msg2\r\n"
-      "Content-Length: 131072\r\n"
+      "Content-Length: 3512768\r\n"
       "\r\n";
-  large_response += std::string(131072, '+');
+  large_response += std::string(3512768, '+');
 
   testing::SendRecvScript script({
       {{kHTTPReqMsg1}, {large_response}},
@@ -507,19 +508,29 @@ TEST_F(SocketTraceBPFTest, LargeMessages) {
                        GetMutableConnTracker(system.ClientPID(), system.ClientFD()));
   EXPECT_EQ(client_tracker->send_data().data_buffer().Head(), kHTTPReqMsg1);
   std::string client_recv_data(client_tracker->recv_data().data_buffer().Head());
-  EXPECT_THAT(client_recv_data.size(), 131153);
+  EXPECT_THAT(client_recv_data.size(), 3512850);
   EXPECT_THAT(client_recv_data, HasSubstr("+++++"));
   EXPECT_EQ(client_recv_data.substr(client_recv_data.size() - 5, 5), "+++++");
 
-  // The server's send syscall transmits all 131153 bytes in one shot.
+  // The server's send syscall transmits all 3512850 bytes in one shot.
   // This is over the limit that we can transmit through BPF, and so we expect
-  // filler bytes on this side of the connection. Note that the client doesn't have the
+  // filler bytes on this side of the connection (up to 1MB). Note that the client doesn't have the
   // same behavior, because the recv syscall provides the data in chunks.
   ASSERT_OK_AND_ASSIGN(auto* server_tracker,
                        GetMutableConnTracker(system.ServerPID(), system.ServerFD()));
   EXPECT_EQ(server_tracker->recv_data().data_buffer().Head(), kHTTPReqMsg1);
   std::string server_send_data(server_tracker->send_data().data_buffer().Head());
-  EXPECT_THAT(server_send_data.size(), 131153);
+  auto kernel = system::GetCachedKernelVersion();
+  bool kernelGreaterThan5_1 = kernel.version >= 5 || (kernel.version == 5 && kernel.major_rev >= 1);
+  if (kernelGreaterThan5_1) {
+    // CHUNK_LIMIT * MAX_MSG_SIZE bytes + up to 1MB filler.
+    // Currently 84*30720 + 932,370 filler bytes = 3,512,850
+    EXPECT_THAT(server_send_data.size(), 3512850);
+  } else {
+    // CHUNK_LIMIT * MAX_MSG_SIZE bytes + up to 1MB filler.
+    // Currently 4*30720 + 1024*1024 = 1,171,456, leaving gap of 2,341,394 bytes
+    EXPECT_THAT(server_send_data.size(), 1171456);
+  }
   EXPECT_THAT(server_send_data, HasSubstr("+++++"));
   // We expect filling with \0 bytes.
   EXPECT_EQ(server_send_data.substr(server_send_data.size() - 5, 5), ConstStringView("\0\0\0\0\0"));

@@ -31,6 +31,7 @@
 #include <google/protobuf/text_format.h>
 #include <google/protobuf/util/delimited_message_util.h>
 #include <magic_enum.hpp>
+#include "src/common/system/kernel_version.h"
 
 #include "src/common/base/base.h"
 #include "src/common/base/utils.h"
@@ -175,6 +176,15 @@ DEFINE_bool(
     stirling_debug_tls_sources, gflags::BoolFromEnv("PX_DEBUG_TLS_SOURCES", false),
     "If true, stirling will add additional prometheus metrics regarding the traced tls sources");
 
+DEFINE_uint32(stirling_bpf_loop_limit, 42,
+              "The maximum number of iovecs to capture for syscalls. "
+              "Set conservatively for older kernels by default to keep the instruction count below "
+              "BPF's limit for version 4 kernels (4096 per probe).");
+
+DEFINE_uint32(stirling_bpf_chunk_limit, 4,
+              "The maximum number of chunks a perf_submit can support. "
+              "This applies to messages that are over MAX_MSG_SIZE.");
+
 OBJ_STRVIEW(socket_trace_bcc_script, socket_trace);
 
 namespace px {
@@ -431,6 +441,18 @@ auto SocketTraceConnector::InitPerfBufferSpecs() {
 }
 
 Status SocketTraceConnector::InitBPF() {
+  // set BPF loop limit and chunk limit based on kernel version
+  auto kernel = system::GetCachedKernelVersion();
+  if (kernel.version >= 5 || (kernel.version == 5 && kernel.major_rev >= 1)) {
+    // Kernels >= 5.1 have higher BPF instruction limits (1 million for verifier).
+    // This enables a 21x increase to our loop and chunk limits
+    FLAGS_stirling_bpf_loop_limit = 882;
+    FLAGS_stirling_bpf_chunk_limit = 84;
+    LOG(INFO) << absl::Substitute(
+        "Kernel version greater than V5.1 detected ($0), raised loop limit to $1 and chunk limit "
+        "to $2",
+        kernel.ToString(), FLAGS_stirling_bpf_loop_limit, FLAGS_stirling_bpf_chunk_limit);
+  }
   // PROTOCOL_LIST: Requires update on new protocols.
   std::vector<std::string> defines = {
       absl::StrCat("-DENABLE_TLS_DEBUG_SOURCES=", FLAGS_stirling_debug_tls_sources),
@@ -445,6 +467,8 @@ Status SocketTraceConnector::InitBPF() {
       absl::StrCat("-DENABLE_NATS_TRACING=", protocol_transfer_specs_[kProtocolNATS].enabled),
       absl::StrCat("-DENABLE_AMQP_TRACING=", protocol_transfer_specs_[kProtocolAMQP].enabled),
       absl::StrCat("-DENABLE_MONGO_TRACING=", protocol_transfer_specs_[kProtocolMongo].enabled),
+      absl::StrCat("-DBPF_LOOP_LIMIT=", FLAGS_stirling_bpf_loop_limit),
+      absl::StrCat("-DBPF_CHUNK_LIMIT=", FLAGS_stirling_bpf_chunk_limit),
   };
   PX_RETURN_IF_ERROR(bcc_->InitBPFProgram(socket_trace_bcc_script, defines));