Added dlsm::Thread::getaffinity and use MAdviseAllocator.

pkarneliuk · Mar 13, 2024 · 94412f8 · 94412f8
1 parent 3914e99
commit 94412f8
Show file tree

Hide file tree

Showing 5 changed files with 71 additions and 20 deletions.
diff --git a/include/impl/Thread.hpp b/include/impl/Thread.hpp
@@ -16,11 +16,12 @@ namespace dlsm::Thread {
 
 namespace dlsm::Thread {
 
-void name(const std::string& name, std::size_t native_handle = 0);
-std::string name(std::size_t native_handle = 0);
+void name(const std::string& name, std::size_t handle = 0);
+std::string name(std::size_t handle = 0);
 
-static constexpr std::size_t AllCPU = 0xFFFF;
-void affinity(std::size_t cpuid = AllCPU, std::size_t native_handle = 0);
+static constexpr std::size_t AllCPU = 0;
+void affinity(std::size_t cpuid = AllCPU, std::size_t handle = 0);
+std::size_t getaffinity(std::size_t handle = 0);
 
 template <typename T>
 concept Yield = requires(T c) { c.pause(); };

diff --git a/src/Thread.cpp b/src/Thread.cpp
@@ -6,42 +6,54 @@
 #include <ctime>
 #include <system_error>
 
+namespace {
+pthread_t tid(std::size_t handle = 0) { return (handle == 0) ? pthread_self() : static_cast<pthread_t>(handle); }
+}  // namespace
+
 namespace dlsm::Thread {
 
-void name(const std::string& name, std::size_t native_handle) {
-    auto thread = (native_handle == 0) ? pthread_self() : static_cast<pthread_t>(native_handle);
-    if (const int err = pthread_setname_np(thread, name.c_str())) {
+void name(const std::string& name, std::size_t handle) {
+    if (const int err = pthread_setname_np(tid(handle), name.c_str())) {
         throw std::system_error(err, std::generic_category(), "pthread_setname_np()");
     }
 }
 
-std::string name(std::size_t native_handle) {
-    auto thread = (native_handle == 0) ? pthread_self() : static_cast<pthread_t>(native_handle);
+std::string name(std::size_t handle) {
     std::string name(16, '\0');
-    if (const int err = pthread_getname_np(thread, std::data(name), std::size(name))) {
+    if (const int err = pthread_getname_np(tid(handle), std::data(name), std::size(name))) {
         throw std::system_error(err, std::generic_category(), "pthread_getname_np()");
     }
     name.resize(std::strlen(name.c_str()));
     return name;
 }
 
-void affinity(std::size_t cpuid, std::size_t native_handle) {
-    auto thread = (native_handle == 0) ? pthread_self() : static_cast<pthread_t>(native_handle);
+void affinity(std::size_t cpuid, std::size_t handle) {
     cpu_set_t cpuset;
     CPU_ZERO(&cpuset);
     if (cpuid == AllCPU) {
-        for (std::size_t i = 0; i < sizeof(cpu_set_t) * 8; ++i) {
+        for (std::size_t i = 0; i < CPU_SETSIZE; ++i) {
             CPU_SET(i, &cpuset);
         }
     } else {
         CPU_SET(cpuid, &cpuset);
     }
 
-    if (const int err = ::pthread_setaffinity_np(thread, sizeof(cpuset), &cpuset)) {
+    if (const int err = ::pthread_setaffinity_np(tid(handle), sizeof(cpuset), &cpuset)) {
         throw std::system_error(err, std::system_category(), "pthread_setaffinity_np()");
     }
 }
 
+std::size_t getaffinity(std::size_t handle) {
+    cpu_set_t cpuset;
+    if (const int err = ::pthread_getaffinity_np(tid(handle), sizeof(cpuset), &cpuset)) {
+        throw std::system_error(err, std::system_category(), "pthread_getaffinity_np()");
+    }
+    for (std::size_t i = 0; i < CPU_SETSIZE; ++i) {
+        if (CPU_ISSET(i, &cpuset)) return i;
+    }
+    return 0;
+}
+
 void NanoSleep::pause() noexcept {
     const timespec timeout = {0, 1};
     ::nanosleep(&timeout, nullptr);

diff --git a/tests/perf/PerfTransport.cpp b/tests/perf/PerfTransport.cpp
@@ -8,6 +8,7 @@
 #include <utility>
 #include <vector>
 
+#include "impl/Allocator.hpp"
 #include "impl/Clock.hpp"
 #include "impl/Thread.hpp"
 #include "impl/Transport.hpp"
@@ -36,6 +37,7 @@ void TransportPubSub(benchmark::State& state, Args&&... args) {
     // clang-format on
 
     auto runtime = dlsm::Transport<std::remove_pointer_t<decltype(type)>>(ropts);
+    const auto affinity = dlsm::Thread::getaffinity();
     std::mutex state_mutex;
     const auto synchronized = [&](auto action) {
         std::lock_guard<std::mutex> guard(state_mutex);
@@ -49,14 +51,18 @@ void TransportPubSub(benchmark::State& state, Args&&... args) {
         std::vector<std::jthread> threads{1 + subscribers};
         std::barrier sync(std::ssize(threads));
 
-        using NsList = std::vector<std::chrono::nanoseconds>;
+        // using NsList = std::vector<std::chrono::nanoseconds>;
+        using NsList = std::vector<std::chrono::nanoseconds, dlsm::MAdviseAllocator<std::chrono::nanoseconds>>;
+        // using NsList = std::vector<std::chrono::nanoseconds, dlsm::MmapAllocator<std::chrono::nanoseconds>>;
         std::vector<NsList> timestamps{std::size(threads), NsList(tosend, 0ns)};
 
         for (std::size_t i = 0; auto& t : threads) {
             t = std::jthread([&, i]() {
+                if (affinity) dlsm::Thread::affinity(affinity + i);
                 auto& ts = timestamps[i];
                 std::uint64_t count = 0;
                 if (i == 0) {
+                    dlsm::Thread::name("Pub");
                     auto pub = runtime.pub(popts);
 
                     sync.arrive_and_wait();
@@ -89,9 +95,10 @@ void TransportPubSub(benchmark::State& state, Args&&... args) {
                     last_sent = count;
                     sync.arrive_and_wait();
                 } else {
-                    std::size_t timeouts = 0;
-
+                    const auto name = "Sub" + std::to_string(i);
+                    dlsm::Thread::name(name);
                     auto sub = runtime.sub(sopts);
+                    std::size_t timeouts = 0;
 
                     sync.arrive_and_wait();
                     Event e{};
@@ -120,9 +127,8 @@ void TransportPubSub(benchmark::State& state, Args&&... args) {
                         }
                     }
                     synchronized([&] {
-                        if (const auto lost = tosend - count)
-                            state.counters["Sub" + std::to_string(i) + "Lost"] = static_cast<double>(lost);
-                        if (timeouts) state.counters["Sub" + std::to_string(i) + "TO"] = static_cast<double>(timeouts);
+                        if (const auto lost = tosend - count) state.counters[name + "Lost"] = static_cast<double>(lost);
+                        if (timeouts) state.counters[name + "TO"] = static_cast<double>(timeouts);
                     });
                     sync.arrive_and_wait();
                 }

diff --git a/tests/perf/README.md b/tests/perf/README.md
@@ -6,6 +6,10 @@ Performance tests focussed on microbenchmarking using [google/benchmark](https:/
 # Build in dir './build' perf and run Transport
 make -C ./build perf && ./build/tests/perf/perf --benchmark_filter=Transport --benchmark_counters_tabular=true
 
+# Run Transport under perf stat
+sudo perf stat                 ./build/tests/perf/perf --benchmark_filter=Transport --benchmark_counters_tabular=true
+sudo perf stat taskset -c 6-11 ./build/tests/perf/perf --benchmark_filter=Transport --benchmark_counters_tabular=true
+
 # Run 5 repetitions with 1 iteration benchmarking
 ./build/tests/perf/perf --benchmark_filter=*    \
     --benchmark_repetitions=5                   \
@@ -21,3 +25,29 @@ This script reads binary files with `int64` samples(nanoseconds timestamps), and
 # Display Pub1.ns as master and delays of Sub1/Sub2/Sub3/Sub4.ns signals relative to Pub1
 ./tests/perf/delays.py ./build/tests/perf/TransportPubSub-mem-*
 ```
+
+## Threads Affinity and CPU Core Isolation
+```sh
+lstopo-no-graphics --no-io --no-legend --of txt # Display layout of available CPUs in physical packages
+numactl --hardware # Display NUMA nodes
+sudo grubby --update-kernel=ALL --args="isolcpus=6-11" # Isolate CPUs #6-#11 from the OS scheduler (takes effect after reboot)
+sudo grubby --update-kernel=ALL --remove-args="isolcpus=6-11" # Undo the isolation (takes effect after reboot)
+cat /proc/cmdline       # Display kernel startup parameters
+taskset -cp 0,4,6-8 pid # Set CPU affinity of the running process with the given pid
+```
+
+## HugePages Support
+```sh
+# Set the number of persistent Huge Pages in the kernel pool (requires root)
+echo 1000 > /proc/sys/vm/nr_hugepages
+# Watch statistics of available Huge Pages
+watch -n 1 grep Huge /proc/meminfo
+watch -n 1 numastat -cm
+cat /sys/kernel/mm/transparent_hugepage/enabled
+# Example output of the command above (the bracketed value is the active mode): always [madvise] never
+```
+
+
+## Links and References
+ - [Isolating CPUs using tuned-profiles-real-time](https://access.redhat.com/documentation/en-us/red_hat_enterprise_linux_for_real_time/8/html/optimizing_rhel_8_for_real_time_for_low_latency_operation/assembly_isolating-cpus-using-tuned-profiles-realtime_optimizing-rhel8-for-real-time-for-low-latency-operation)
+ - [Configuring HugeTLB Huge Pages](https://access.redhat.com/documentation/en-us/red_hat_enterprise_linux/7/html/performance_tuning_guide/sect-red_hat_enterprise_linux-performance_tuning_guide-memory-configuring-huge-pages)
diff --git a/tests/unit/TestThread.cpp b/tests/unit/TestThread.cpp
@@ -46,7 +46,9 @@ TEST(Thread, Affinity) {
         });
 
         enter.wait();
+        EXPECT_EQ(dlsm::Thread::getaffinity(t.native_handle()), 0);
         dlsm::Thread::affinity(1ULL, t.native_handle());
+        EXPECT_EQ(dlsm::Thread::getaffinity(t.native_handle()), 1ULL);
         dlsm::Thread::affinity(dlsm::Thread::AllCPU, t.native_handle());
         exit.count_down();
     });