added CachelineAlignment to the queue traits

The purpose of the cacheline-sized padding between class members is to avoid false sharing when threads modify the atomic members; the atomic members themselves do not have to be aligned exactly to the cacheline size. They are very small (just 4 or 8 bytes) compared to a cacheline, so, as far as I can tell, their explicit alignment does not matter: they can be placed anywhere within a cacheline, and we only need to ensure that they are mapped to different cachelines. To avoid problems with misaligned dynamic allocation, the requested alignment must not be stricter than alignof(std::max_align_t), which is 16 bytes on x86_64 Linux. This value was added to the traits structs as CachelineAlignment. Fixes d36u9#1
lahwaacz · Dec 31, 2020 · 8378bbe · 8378bbe
1 parent c76224b
commit 8378bbe
Show file tree

Hide file tree

Showing 2 changed files with 22 additions and 18 deletions.
diff --git a/async/bounded_queue.h b/async/bounded_queue.h
@@ -17,6 +17,7 @@ namespace async {
 struct bounded_traits {
   static constexpr bool NOEXCEPT_CHECK = false; // exception handling flag
   static constexpr size_t CachelineSize = 64;
+  static constexpr size_t CachelineAlignment = 16; // must not be larger than alignof(std::max_align_t), see issue #1
   using sequence_type = uint64_t;
 };
 
@@ -27,6 +28,7 @@ template <typename T, typename TRAITS = bounded_traits> class bounded_queue {
 
 public:
   static constexpr size_t cacheline_size = TRAITS::CachelineSize;
+  static constexpr size_t cacheline_alignment = TRAITS::CachelineAlignment;
   using seq_t = typename TRAITS::sequence_type;
   explicit bounded_queue(size_t size)
       : fastmodulo((size > 0 && ((size & (size - 1)) == 0))),
@@ -331,10 +333,10 @@ template <typename T, typename TRAITS = bounded_traits> class bounded_queue {
   element *const elements; // pointer to buffer
   size_t const mask;       // used if fastmodulo is true
   size_t const qsize;      // queue size
-  alignas(cacheline_size) char cacheline_padding1[cacheline_size];
-  alignas(cacheline_size) std::atomic<seq_t> enqueueIx;
-  alignas(cacheline_size) char cacheline_padding2[cacheline_size];
-  alignas(cacheline_size) std::atomic<seq_t> dequeueIx;
-  alignas(cacheline_size) char cacheline_padding3[cacheline_size];
+  alignas(cacheline_alignment) char cacheline_padding1[cacheline_size];
+  alignas(cacheline_alignment) std::atomic<seq_t> enqueueIx;
+  alignas(cacheline_alignment) char cacheline_padding2[cacheline_size];
+  alignas(cacheline_alignment) std::atomic<seq_t> dequeueIx;
+  alignas(cacheline_alignment) char cacheline_padding3[cacheline_size];
 };
 } // namespace async
diff --git a/async/queue.h b/async/queue.h
@@ -21,6 +21,7 @@ struct traits // 3-level (L3, L2, L1) depth of nested group design, total
   static constexpr uint64_t Basebits = 8;
   static constexpr bool NOEXCEPT_CHECK = false; // exception handling flag
   static constexpr size_t CachelineSize = 64;
+  static constexpr size_t CachelineAlignment = 16; // must not be larger than alignof(std::max_align_t), see issue #1
 };
 
 template <typename T, typename TRAITS = traits> class queue final {
@@ -29,6 +30,7 @@ template <typename T, typename TRAITS = traits> class queue final {
     return std::atomic<uint64_t>{}.is_lock_free();
   }
   static constexpr size_t cacheline_size = TRAITS::CachelineSize;
+  static constexpr size_t cacheline_alignment = TRAITS::CachelineAlignment;
   static constexpr uint64_t BaseMask = getBitmask<uint64_t>(TRAITS::Basebits);
   static constexpr uint64_t L1Mask = getBitmask<uint64_t>(TRAITS::L1bits)
                                      << TRAITS::Basebits;
@@ -411,17 +413,17 @@ template <typename T, typename TRAITS = traits> class queue final {
   using L1container = nestedcontainer<basecontainer, L1Mask>;
   using L2container = nestedcontainer<L1container, L2Mask>;
   nestedcontainer<L2container, L3Mask> container;
-  alignas(cacheline_size) char cacheline_padding1[cacheline_size];
-  alignas(cacheline_size) std::atomic<uint64_t> nodeCount; // # of allocated nodes, not the #
-                                                           // of elements stored in the queue
-  alignas(cacheline_size) char cacheline_padding2[cacheline_size];
-  alignas(cacheline_size) std::atomic<index> dequeueIx;    // dequeue pointer
-  alignas(cacheline_size) char cacheline_padding3[cacheline_size];
-  alignas(cacheline_size) std::atomic<index> enqueueIx;    // enqueue pointer
-  alignas(cacheline_size) char cacheline_padding4[cacheline_size];
-  alignas(cacheline_size) std::atomic<index> spawnIx;      // spawn pointer
-  alignas(cacheline_size) char cacheline_padding5[cacheline_size];
-  alignas(cacheline_size) std::atomic<index> recycleIx;    // recycle pointer
-  alignas(cacheline_size) char cacheline_padding6[cacheline_size];
+  alignas(cacheline_alignment) char cacheline_padding1[cacheline_size];
+  alignas(cacheline_alignment) std::atomic<uint64_t> nodeCount; // # of allocated nodes, not the #
+                                                                // of elements stored in the queue
+  alignas(cacheline_alignment) char cacheline_padding2[cacheline_size];
+  alignas(cacheline_alignment) std::atomic<index> dequeueIx;    // dequeue pointer
+  alignas(cacheline_alignment) char cacheline_padding3[cacheline_size];
+  alignas(cacheline_alignment) std::atomic<index> enqueueIx;    // enqueue pointer
+  alignas(cacheline_alignment) char cacheline_padding4[cacheline_size];
+  alignas(cacheline_alignment) std::atomic<index> spawnIx;      // spawn pointer
+  alignas(cacheline_alignment) char cacheline_padding5[cacheline_size];
+  alignas(cacheline_alignment) std::atomic<index> recycleIx;    // recycle pointer
+  alignas(cacheline_alignment) char cacheline_padding6[cacheline_size];
 };
-} // namespace async
+} // namespace async