From d75d8aee4d547a49e44b932397f0db4b8cef7a07 Mon Sep 17 00:00:00 2001 From: cabboose Date: Fri, 6 Sep 2024 12:30:05 +0800 Subject: [PATCH 1/7] Implement rotating idx flag --- loony/node.nim | 15 +++++++++++++-- loony/spec.nim | 19 +++++++++++++++++-- 2 files changed, 30 insertions(+), 4 deletions(-) diff --git a/loony/node.nim b/loony/node.nim index 5c138a0..cd1fae3 100644 --- a/loony/node.nim +++ b/loony/node.nim @@ -69,6 +69,17 @@ else: template incEnqPathCounter*(): untyped = discard template incDeqPathCounter*(): untyped = discard +template prn*(idx: uint16): uint16 = + ## prn = 'Pro re nata' - when required + ## Provides the actual index depending on + ## if we are rotating the index or not. + when loonyRotate: + # multiply by cacheLineSize, mod by loonySlotCount + # then add idx*cacheLineSize/loonySlotCount + (idx shl lShiftBits) and (loonySlotCount - 1) or (idx shr rShiftBits) + else: + idx + template toNodePtr*(pt: uint | ptr Node): NodePtr = # Convert ptr Node into NodePtr uint cast[NodePtr](pt) @@ -105,7 +116,7 @@ proc fetchAddSlot*(t: var Node, idx: uint16, w: uint, moorder: MemoryOrder): uin ## Remembering that the pointer has 3 tail bits clear; these are ## reserved and increased atomically to indicate RESUME, READER, WRITER ## statuship. - t.slots[idx].fetchAdd(w, order = moorder) + t.slots[prn idx].fetchAdd(w, order = moorder) proc compareAndSwapNext*(t: var Node, expect: var uint, swap: uint): bool = t.next.compareExchange(expect, swap, moRelease, moRelaxed) @@ -131,7 +142,7 @@ proc allocNode*[T](pel: T): ptr Node = proc tryReclaim*(node: var Node; start: uint16) = block done: for i in start.. loonySlotCount, "Your LoonySlot count exceeds your alignment!" + when loonyRotate: + doAssert (loonySlotCount and (loonySlotCount - 1)) == 0, + "LoonySlot count must be a power of 2!" 
const ## Slot flag constants From 2015934630303579f0434930244cec81adfa8df6 Mon Sep 17 00:00:00 2001 From: cabboose Date: Fri, 6 Sep 2024 21:06:36 +0800 Subject: [PATCH 2/7] padd looqueue --- loony.nim | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/loony.nim b/loony.nim index ea4e1f9..e637335 100644 --- a/loony.nim +++ b/loony.nim @@ -25,9 +25,9 @@ type LoonyQueue*[T] = ref LoonyQueueImpl[T] LoonyQueueImpl*[T] = object - head : Atomic[TagPtr] ## Whereby node contains the slots and idx - tail : Atomic[TagPtr] ## is the uint16 index of the slot array - currTail : Atomic[NodePtr] ## 8 bytes Current NodePtr + head {.align: 128.}: Atomic[TagPtr] ## Whereby node contains the slots and idx + tail {.align: 128.}: Atomic[TagPtr] ## is the uint16 index of the slot array + currTail {.align: 128.}: Atomic[NodePtr] ## 8 bytes Current NodePtr ## Result types for the private ## advHead and advTail functions From f0d7e51478b733530b1f5b4050addd209d2bcb84 Mon Sep 17 00:00:00 2001 From: cabboose Date: Fri, 6 Sep 2024 22:07:33 +0800 Subject: [PATCH 3/7] default rotating index --- loony/spec.nim | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/loony/spec.nim b/loony/spec.nim index 5993e59..865a41c 100644 --- a/loony/spec.nim +++ b/loony/spec.nim @@ -9,7 +9,7 @@ const ## owner. Note that in particular, child Continuations have cycles, ## which will trigger a failure of this assertion. - loonyRotate* {.booldefine.} = false ## Indicate that loony should rotate + loonyRotate* {.booldefine.} = true ## Indicate that loony should rotate ## the slots in the queue to avoid contention on the same cache line. ## This is useful when the queue is shared between multiple threads. ## Note that this will only work if the number of slots is a power of 2. 
From 0ff593bef041715311e8c2fdbd83fd4a6d757703 Mon Sep 17 00:00:00 2001 From: cabboose Date: Sat, 7 Sep 2024 10:10:25 +0800 Subject: [PATCH 4/7] export nodealignment, better compilation error messaging when loonyRotate flag is on (default) but loonySlotCount is not a power of 2 --- loony/spec.nim | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/loony/spec.nim b/loony/spec.nim index 865a41c..5251a2a 100644 --- a/loony/spec.nim +++ b/loony/spec.nim @@ -1,7 +1,7 @@ -import std/[atomics, math] +import std/[atomics, math, strformat] const - loonyNodeAlignment {.intdefine.} = 11 + loonyNodeAlignment* {.intdefine.} = 11 loonySlotCount* {.intdefine.} = 1024 loonyIsolated* {.booldefine.} = false ## Indicate that loony should @@ -15,7 +15,7 @@ const ## Note that this will only work if the number of slots is a power of 2. when loonyRotate: - # Impl dynamic cache line size detection + # TODO Impl dynamic cache line size detection const cacheLineSize = 64 lShiftBits* = int log2(float cacheLineSize) @@ -26,7 +26,9 @@ static: "Your LoonySlot count exceeds your alignment!" when loonyRotate: doAssert (loonySlotCount and (loonySlotCount - 1)) == 0, - "LoonySlot count must be a power of 2!" + fmt"Your LoonySlot count of {loonySlotCount} is not a power of 2!" & + " Either disable loonyRotate (-d:loonyRotate=false) or" & + " change the slot count." 
const ## Slot flag constants From 41efb690fdd552e636dfb523c6e13bdc7be7799e Mon Sep 17 00:00:00 2001 From: cabboose Date: Sat, 7 Sep 2024 10:14:55 +0800 Subject: [PATCH 5/7] added reasoning in comments for 128 byte alignment of loony queue as opposed to 64 bytes --- loony.nim | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/loony.nim b/loony.nim index e637335..0f3da71 100644 --- a/loony.nim +++ b/loony.nim @@ -28,6 +28,13 @@ type head {.align: 128.}: Atomic[TagPtr] ## Whereby node contains the slots and idx tail {.align: 128.}: Atomic[TagPtr] ## is the uint16 index of the slot array currTail {.align: 128.}: Atomic[NodePtr] ## 8 bytes Current NodePtr + # Align to 128 bytes to avoid false sharing, see: + # https://stackoverflow.com/questions/72126606/should-the-cache-padding-size-of-x86-64-be-128-bytes + # Plenty of architectural differences can impact whether + # or not 128 bytes is a better alignment than 64 bytes, but + # considering the small cost that this change adds to the + # memory consumption of the loony queue object, it is + # recommended. ## Result types for the private ## advHead and advTail functions From bcdbee0ad2364f3b010d725f06ddb2fd9303bae5 Mon Sep 17 00:00:00 2001 From: cabboose Date: Sat, 7 Sep 2024 10:34:33 +0800 Subject: [PATCH 6/7] upd vers and readme. Add compilation flags to readme. 0.3.1 --- README.md | 27 ++++++++++++++++++++++++++- loony.nimble | 2 +- 2 files changed, 27 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 1e30a37..11840e4 100644 --- a/README.md +++ b/README.md @@ -104,7 +104,7 @@ tests and further documentation are to follow when time allows. 
[The full API documentation is kept up-to-date on GitHub.](https://nim-works.github.io/loony/loony.html) -[The API documentation for the Ward submodule is found here.](https://nim-works.github.io/loony/loony/ward.html) +[~~The API documentation for the Ward submodule is found here.~~](https://nim-works.github.io/loony/loony/ward.html) ~~*Wards are untested and are unlikely to remain in the library*~~ #### Memory Safety & Cache Coherence @@ -114,6 +114,19 @@ committed on the push operation and read on the pop operation; this is a higher-cost primitive. You can use `unsafePush` and `unsafePop` to manipulate a `LoonyQueue` without regard to cache coherency for ultimate performance. +The LoonyQueue itself is padded across cachelines, and by default, the slots +are read and written to in a cyclic fashion over cachelines to reduce false +sharing. + +``` +Visual representation of rotating index + +| 64 bytes | 64 bytes | 64 bytes |... +| 0------- | 1------- | 2------- |... +| -63------| -64------| -65------|... +|--127-----|--128-----|--129-----|... +``` + ### Debugging Pass `--d:loonyDebug` in compilation or with a config nimscript to use debug @@ -140,8 +153,20 @@ debugNodeCounter: We recommend against changing these values unless you know what you are doing. The suggested max alignment is 16 to achieve drastically higher contention capacities. Compilation will fail if your alignment does not fit the slot count index. `-d:loonyNodeAlignment=11` - Adjust node alignment to increase/decrease contention capacity + `-d:loonySlotCount=1024` - Adjust the number of slots in each node +`-d:loonyDebug=false` - Toggle debug counters and templates, see +[debugging](#debugging). False by default. + +`-d:loonyRotate=true` - Toggle the index for the slots of +loony queue to be read over cacheline bounds in a cyclic +manner. True by default. + +> While loonyRotate is enabled, the slot count must be a +> power of 2. 
Error messages will indicate whether this +> is a cause of compilation failure. + ## What are Continuations? If you've somehow missed the next big thing for nim; see [CPS](https://github.com/nim-works/cps) diff --git a/loony.nimble b/loony.nimble index 12b0429..f95723f 100644 --- a/loony.nimble +++ b/loony.nimble @@ -1,4 +1,4 @@ -version = "0.3.0" +version = "0.3.1" author = "cabboose" description = "Fast mpmc queue with sympathetic memory behavior" license = "MIT" From b9ca6539de2c1aa645ae09cf14320ed24245218d Mon Sep 17 00:00:00 2001 From: cabboose Date: Sat, 7 Sep 2024 10:36:50 +0800 Subject: [PATCH 7/7] loonyslotcount > 1 assertion --- loony/spec.nim | 2 ++ 1 file changed, 2 insertions(+) diff --git a/loony/spec.nim b/loony/spec.nim index 5251a2a..46ecbce 100644 --- a/loony/spec.nim +++ b/loony/spec.nim @@ -24,6 +24,8 @@ when loonyRotate: static: doAssert (1 shl loonyNodeAlignment) > loonySlotCount, "Your LoonySlot count exceeds your alignment!" + doAssert loonySlotCount > 1, + "Your LoonySlot count must be greater than 1!" when loonyRotate: doAssert (loonySlotCount and (loonySlotCount - 1)) == 0, fmt"Your LoonySlot count of {loonySlotCount} is not a power of 2!" &