From 306b32689635562885a3fd526d879b8825dd7b58 Mon Sep 17 00:00:00 2001 From: Timothee Cour Date: Fri, 14 Feb 2020 23:29:17 -0800 Subject: [PATCH 01/19] _ --- lib/pure/collections/hashcommon.nim | 14 +++++++++++--- lib/pure/collections/tables.nim | 3 ++- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/lib/pure/collections/hashcommon.nim b/lib/pure/collections/hashcommon.nim index d9a914afc868c..010b860abbade 100644 --- a/lib/pure/collections/hashcommon.nim +++ b/lib/pure/collections/hashcommon.nim @@ -26,8 +26,14 @@ proc isEmpty(hcode: Hash): bool {.inline.} = proc isFilled(hcode: Hash): bool {.inline.} = result = hcode != 0 -proc nextTry(h, maxHash: Hash): Hash {.inline.} = - result = (h + 1) and maxHash + +proc nextTry(h, maxHash: Hash, perturb: var Hash): Hash {.inline.} = + # result = ((5*h) + 1 + perturb and maxHash) + const PERTURB_SHIFT = 5 + perturb = perturb shr PERTURB_SHIFT + result = ((5*h) + 1 + perturb) and maxHash + # echo (h, result) + # result = (h + 1) and maxHash proc mustRehash(length, counter: int): bool {.inline.} = assert(length > counter) @@ -37,14 +43,16 @@ template rawGetKnownHCImpl() {.dirty.} = if t.dataLen == 0: return -1 var h: Hash = hc and maxHash(t) # start with real hash value + var perturb = hc while isFilled(t.data[h].hcode): # Compare hc THEN key with boolean short circuit. This makes the common case # zero ==key's for missing (e.g.inserts) and exactly one ==key for present. # It does slow down succeeding lookups by one extra Hash cmp&and..usually # just a few clock cycles, generally worth it for any non-integer-like A. + # TODO: optimize this: depending on type(key), skip hc comparison if t.data[h].hcode == hc and t.data[h].key == key: return h - h = nextTry(h, maxHash(t)) + h = nextTry(h, maxHash(t), perturb) result = -1 - h # < 0 => MISSING; insert idx = -1 - result proc rawGetKnownHC[X, A](t: X, key: A, hc: Hash): int {.inline.} = diff --git a/lib/pure/collections/tables.nim b/lib/pure/collections/tables.nim index a0cc73ad62920..d885c120b1918 100644 --- a/lib/pure/collections/tables.nim +++ b/lib/pure/collections/tables.nim @@ -273,8 +273,9 @@ proc enlarge[A, B](t: var Table[A, B]) = let eh = n[i].hcode if isFilled(eh): var j: Hash = eh and maxHash(t) + var perturb = eh while isFilled(t.data[j].hcode): - j = nextTry(j, maxHash(t)) + j = nextTry(j, maxHash(t), perturb) when defined(js): rawInsert(t, t.data, n[i].key, n[i].val, eh, j) else: From d6a63dc65a96d1e1c99275f6f7357e6c75f56623 Mon Sep 17 00:00:00 2001 From: Timothee Cour Date: Sat, 15 Feb 2020 20:59:32 -0800 Subject: [PATCH 02/19] _ --- lib/pure/collections/hashcommon.nim | 5 ++- lib/pure/collections/intsets.nim | 22 ++++++++----- lib/pure/collections/tableimpl.nim | 13 ++++---- lib/pure/collections/tables.nim | 49 ++++++++++++++++++++++++++--- 4 files changed, 69 insertions(+), 20 deletions(-) diff --git a/lib/pure/collections/hashcommon.nim b/lib/pure/collections/hashcommon.nim index 010b860abbade..4f253be38548b 100644 --- a/lib/pure/collections/hashcommon.nim +++ b/lib/pure/collections/hashcommon.nim @@ -28,12 +28,11 @@ proc isFilled(hcode: Hash): bool {.inline.} = proc nextTry(h, maxHash: Hash, perturb: var Hash): Hash {.inline.} = - # result = ((5*h) + 1 + perturb and maxHash) + # TODO: shouldn't perturb be unsigned? 
const PERTURB_SHIFT = 5 perturb = perturb shr PERTURB_SHIFT + # perturb = cast[uint](perturb) shr PERTURB_SHIFT # TODO result = ((5*h) + 1 + perturb) and maxHash - # echo (h, result) - # result = (h + 1) and maxHash proc mustRehash(length, counter: int): bool {.inline.} = assert(length > counter) diff --git a/lib/pure/collections/intsets.nim b/lib/pure/collections/intsets.nim index 8338ceaca9978..02ee10607c3a1 100644 --- a/lib/pure/collections/intsets.nim +++ b/lib/pure/collections/intsets.nim @@ -54,22 +54,27 @@ proc mustRehash(length, counter: int): bool {.inline.} = assert(length > counter) result = (length * 2 < counter * 3) or (length - counter < 4) -proc nextTry(h, maxHash: Hash): Hash {.inline.} = - result = ((5 * h) + 1) and maxHash +# FACTOR +proc nextTry(h, maxHash: Hash, perturb: var Hash): Hash {.inline.} = + const PERTURB_SHIFT = 5 + perturb = perturb shr PERTURB_SHIFT + result = ((5*h) + 1 + perturb) and maxHash proc intSetGet(t: IntSet, key: int): PTrunk = var h = key and t.max + var perturb = key while t.data[h] != nil: if t.data[h].key == key: return t.data[h] - h = nextTry(h, t.max) + h = nextTry(h, t.max, perturb) result = nil proc intSetRawInsert(t: IntSet, data: var TrunkSeq, desc: PTrunk) = var h = desc.key and t.max + var perturb = desc.key while data[h] != nil: assert(data[h] != desc) - h = nextTry(h, t.max) + h = nextTry(h, t.max, perturb) assert(data[h] == nil) data[h] = desc @@ -84,14 +89,16 @@ proc intSetEnlarge(t: var IntSet) = proc intSetPut(t: var IntSet, key: int): PTrunk = var h = key and t.max + var perturb = key while t.data[h] != nil: if t.data[h].key == key: return t.data[h] - h = nextTry(h, t.max) + h = nextTry(h, t.max, perturb) if mustRehash(t.max + 1, t.counter): intSetEnlarge(t) inc(t.counter) h = key and t.max - while t.data[h] != nil: h = nextTry(h, t.max) + perturb = key + while t.data[h] != nil: h = nextTry(h, t.max, perturb) assert(t.data[h] == nil) new(result) result.next = t.head @@ -393,7 +400,8 @@ proc assign*(dest: var IntSet, src: IntSet) = var it = src.head while it != nil: var h = it.key and dest.max - while dest.data[h] != nil: h = nextTry(h, dest.max) + var perturb = it.key + while dest.data[h] != nil: h = nextTry(h, dest.max, perturb) assert(dest.data[h] == nil) var n: PTrunk new(n) diff --git a/lib/pure/collections/tableimpl.nim b/lib/pure/collections/tableimpl.nim index 85bffd60f8def..6fb3cdc57802d 100644 --- a/lib/pure/collections/tableimpl.nim +++ b/lib/pure/collections/tableimpl.nim @@ -14,8 +14,9 @@ include hashcommon template rawGetDeepImpl() {.dirty.} = # Search algo for unconditional add genHashImpl(key, hc) var h: Hash = hc and maxHash(t) + var perturb = hc while isFilled(t.data[h].hcode): - h = nextTry(h, maxHash(t)) + h = nextTry(h, maxHash(t), perturb) result = h template rawInsertImpl() {.dirty.} = @@ -44,7 +45,6 @@ template addImpl(enlarge) {.dirty.} = inc(t.counter) template maybeRehashPutImpl(enlarge) {.dirty.} = - checkIfInitialized() if mustRehash(t.dataLen, t.counter): enlarge(t) index = rawGetKnownHC(t, key, hc) @@ -85,21 +85,22 @@ template delImplIdx(t, i) = block outer: while true: # KnuthV3 Algo6.4R adapted for i=i+1 instead of i=i-1 var j = i # The correctness of this depends on (h+1) in nextTry, - var r = j # though may be adaptable to other simple sequences. + # though may be adaptable to other simple sequences. 
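
The probing change these first patches introduce replaces the old linear step `(h + 1) and maxHash` with a CPython-dict-style recurrence: the next slot is `(5*h + 1 + perturb) and maxHash`, where `perturb` starts as the full hash code and loses 5 bits per step, so the bits discarded by the slot mask eventually break up collision clusters. Below is a minimal standalone sketch of that sequence, assuming a power-of-two table; `probeSeq` and the sample values are illustrative only, not part of the patch.

# Standalone model of the perturbed probe sequence used by `nextTry` above.
proc probeSeq(hc, maxHash, steps: int): seq[int] =
  const PERTURB_SHIFT = 5
  var perturb = hc                        # starts as the full hash code
  var h = hc and maxHash                  # first slot comes from masking the hash
  for _ in 0 ..< steps:
    result.add h
    perturb = perturb shr PERTURB_SHIFT   # decay: higher hash bits drip in, then vanish
    h = (5*h + 1 + perturb) and maxHash   # CPython-style recurrence

when isMainModule:
  # 40 and 72 both land in slot 0 of an 8-slot table (same low 3 bits), but the
  # bits above the mask feed in through `perturb`, so the second probe already
  # differs; with the old `(h + 1) and maxHash` step both would probe 0, 1, 2, ...
  doAssert probeSeq(40, 7, 2) == @[0, 2]
  doAssert probeSeq(72, 7, 2) == @[0, 3]
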
t.data[i].hcode = 0 # mark current EMPTY t.data[i].key = default(type(t.data[i].key)) t.data[i].val = default(type(t.data[i].val)) while true: - i = (i + 1) and msk # increment mod table size + i = (i + 1) and msk # increment mod table size; PRTEMP if isEmpty(t.data[i].hcode): # end of collision cluster; So all done break outer - r = t.data[i].hcode and msk # "home" location of key@i + var r = t.data[i].hcode and msk # "home" location of key@i if not ((i >= r and r > j) or (r > j and j > i) or (j > i and i >= r)): break + # TODO: can we instead just move the last in the chain? when defined(js): t.data[j] = t.data[i] else: - t.data[j] = move(t.data[i]) # data[j] will be marked EMPTY next loop + t.data[j] = move(t.data[i]) # data[i] will be marked EMPTY next loop template delImpl() {.dirty.} = var hc: Hash diff --git a/lib/pure/collections/tables.nim b/lib/pure/collections/tables.nim index d885c120b1918..4b0980f172fa6 100644 --- a/lib/pure/collections/tables.nim +++ b/lib/pure/collections/tables.nim @@ -761,6 +761,38 @@ iterator mvalues*[A, B](t: var Table[A, B]): var B = yield t.data[h].val assert(len(t) == L, "the length of the table changed while iterating over it") +import intsets + +template inCache(): bool = + # TODO: instead modify how add works for table w multiple identical keys, eg usign instead hash((key, keyCount)) ? + if num <= N: + # short cache optimization: linear search, to avoid overhead of IntSet for common case + var found = false + for i in 0.. Date: Sun, 16 Feb 2020 00:28:46 -0800 Subject: [PATCH 03/19] _ --- lib/pure/collections/hashcommon.nim | 59 ++++++++++++++++------ lib/pure/collections/sharedtables.nim | 2 +- lib/pure/collections/tableimpl.nim | 70 ++++++++++++++++----------- lib/pure/collections/tables.nim | 20 ++++---- 4 files changed, 99 insertions(+), 52 deletions(-) diff --git a/lib/pure/collections/hashcommon.nim b/lib/pure/collections/hashcommon.nim index 4f253be38548b..f95eff3802175 100644 --- a/lib/pure/collections/hashcommon.nim +++ b/lib/pure/collections/hashcommon.nim @@ -20,8 +20,15 @@ when not defined(nimHasDefault): # hcode for real keys cannot be zero. hcode==0 signifies an empty slot. These # two procs retain clarity of that encoding without the space cost of an enum. -proc isEmpty(hcode: Hash): bool {.inline.} = - result = hcode == 0 +when false: + proc isEmpty(hcode: Hash): bool {.inline.} = + result = hcode == 0 + +const freeMarker = 0 +const deletedMarker = -1 + +proc isFilledValid(hcode: Hash): bool {.inline.} = + result = hcode != 0 and hcode != deletedMarker # SPEED: could improve w bit magic proc isFilled(hcode: Hash): bool {.inline.} = result = hcode != 0 @@ -30,29 +37,49 @@ proc isFilled(hcode: Hash): bool {.inline.} = proc nextTry(h, maxHash: Hash, perturb: var Hash): Hash {.inline.} = # TODO: shouldn't perturb be unsigned? 
const PERTURB_SHIFT = 5 - perturb = perturb shr PERTURB_SHIFT - # perturb = cast[uint](perturb) shr PERTURB_SHIFT # TODO + # SPEED IMPROVE + var perturb2 = cast[uint](perturb) shr PERTURB_SHIFT + perturb = cast[Hash](perturb2) result = ((5*h) + 1 + perturb) and maxHash proc mustRehash(length, counter: int): bool {.inline.} = assert(length > counter) - result = (length * 2 < counter * 3) or (length - counter < 4) + result = (length * 2 < counter * 3) or (length - counter < 4) # synchronize with `rightSize` + +proc mustRehash2[T](t: T): bool {.inline.} = + let counter2 = t.counter + t.countDeleted + # echo (t.dataLen, t.counter, t.countDeleted, counter2) + result = mustRehash(t.dataLen, counter2) + # echo ("mustRehash2", t.dataLen, t.counter, t.countDeleted, counter2, result) template rawGetKnownHCImpl() {.dirty.} = if t.dataLen == 0: return -1 var h: Hash = hc and maxHash(t) # start with real hash value var perturb = hc - while isFilled(t.data[h].hcode): - # Compare hc THEN key with boolean short circuit. This makes the common case - # zero ==key's for missing (e.g.inserts) and exactly one ==key for present. - # It does slow down succeeding lookups by one extra Hash cmp&and..usually - # just a few clock cycles, generally worth it for any non-integer-like A. - # TODO: optimize this: depending on type(key), skip hc comparison - if t.data[h].hcode == hc and t.data[h].key == key: - return h - h = nextTry(h, maxHash(t), perturb) - result = -1 - h # < 0 => MISSING; insert idx = -1 - result + var deletedIndex = -1 + while true: + if isFilledValid(t.data[h].hcode): + # Compare hc THEN key with boolean short circuit. This makes the common case + # zero ==key's for missing (e.g.inserts) and exactly one ==key for present. + # It does slow down succeeding lookups by one extra Hash cmp&and..usually + # just a few clock cycles, generally worth it for any non-integer-like A. + # TODO: optimize this: depending on type(key), skip hc comparison + if t.data[h].hcode == hc and t.data[h].key == key: + return h + # echo (h, perturb, maxHash(t)) + h = nextTry(h, maxHash(t), perturb) + elif t.data[h].hcode == deletedMarker: + if deletedIndex == -1: + deletedIndex = h + h = nextTry(h, maxHash(t), perturb) + else: + break + if deletedIndex == -1: + result = -1 - h # < 0 => MISSING; insert idx = -1 - result # TODO + else: + # we prefer returning a (in fact the 1st found) deleted index + result = -1 - deletedIndex proc rawGetKnownHC[X, A](t: X, key: A, hc: Hash): int {.inline.} = rawGetKnownHCImpl() @@ -61,6 +88,8 @@ template genHashImpl(key, hc: typed) = hc = hash(key) if hc == 0: # This almost never taken branch should be very predictable. hc = 314159265 # Value doesn't matter; Any non-zero favorite is fine. 
+ elif hc == deletedMarker: + hc = 214159261 template genHash(key: typed): Hash = var res: Hash diff --git a/lib/pure/collections/sharedtables.nim b/lib/pure/collections/sharedtables.nim index 1a614875284d8..ae623790bfa0a 100644 --- a/lib/pure/collections/sharedtables.nim +++ b/lib/pure/collections/sharedtables.nim @@ -32,7 +32,7 @@ template maxHash(t): untyped = t.dataLen-1 include tableimpl template st_maybeRehashPutImpl(enlarge) {.dirty.} = - if mustRehash(t.dataLen, t.counter): + if mustRehash2(t): enlarge(t) index = rawGetKnownHC(t, key, hc) index = -1 - index # important to transform for mgetOrPutImpl diff --git a/lib/pure/collections/tableimpl.nim b/lib/pure/collections/tableimpl.nim index 6fb3cdc57802d..558999720343a 100644 --- a/lib/pure/collections/tableimpl.nim +++ b/lib/pure/collections/tableimpl.nim @@ -15,13 +15,19 @@ template rawGetDeepImpl() {.dirty.} = # Search algo for unconditional add genHashImpl(key, hc) var h: Hash = hc and maxHash(t) var perturb = hc - while isFilled(t.data[h].hcode): - h = nextTry(h, maxHash(t), perturb) + while true: + let hcode = t.data[h].hcode + if hcode == deletedMarker or hcode == freeMarker: + break + else: + h = nextTry(h, maxHash(t), perturb) result = h -template rawInsertImpl() {.dirty.} = +template rawInsertImpl(t) {.dirty.} = data[h].key = key data[h].val = val + if data[h].hcode == deletedMarker: + t.countDeleted.dec data[h].hcode = hc proc rawGetDeep[X, A](t: X, key: A, hc: var Hash): int {.inline.} = @@ -29,7 +35,7 @@ proc rawGetDeep[X, A](t: X, key: A, hc: var Hash): int {.inline.} = proc rawInsert[X, A, B](t: var X, data: var KeyValuePairSeq[A, B], key: A, val: B, hc: Hash, h: Hash) = - rawInsertImpl() + rawInsertImpl(t) template checkIfInitialized() = when compiles(defaultInitialSize): @@ -38,14 +44,14 @@ template checkIfInitialized() = template addImpl(enlarge) {.dirty.} = checkIfInitialized() - if mustRehash(t.dataLen, t.counter): enlarge(t) + if mustRehash2(t): enlarge(t) var hc: Hash var j = rawGetDeep(t, key, hc) rawInsert(t, t.data, key, val, hc, j) inc(t.counter) template maybeRehashPutImpl(enlarge) {.dirty.} = - if mustRehash(t.dataLen, t.counter): + if mustRehash2(t): enlarge(t) index = rawGetKnownHC(t, key, hc) index = -1 - index # important to transform for mgetOrPutImpl @@ -82,25 +88,35 @@ template delImplIdx(t, i) = let msk = maxHash(t) if i >= 0: dec(t.counter) - block outer: - while true: # KnuthV3 Algo6.4R adapted for i=i+1 instead of i=i-1 - var j = i # The correctness of this depends on (h+1) in nextTry, - # though may be adaptable to other simple sequences. - t.data[i].hcode = 0 # mark current EMPTY - t.data[i].key = default(type(t.data[i].key)) - t.data[i].val = default(type(t.data[i].val)) - while true: - i = (i + 1) and msk # increment mod table size; PRTEMP - if isEmpty(t.data[i].hcode): # end of collision cluster; So all done - break outer - var r = t.data[i].hcode and msk # "home" location of key@i - if not ((i >= r and r > j) or (r > j and j > i) or (j > i and i >= r)): - break - # TODO: can we instead just move the last in the chain? - when defined(js): - t.data[j] = t.data[i] - else: - t.data[j] = move(t.data[i]) # data[i] will be marked EMPTY next loop + inc(t.countDeleted) + when true: + t.data[i].hcode = deletedMarker + t.data[i].key = default(type(t.data[i].key)) + t.data[i].val = default(type(t.data[i].val)) + # maybe not needed because counter+countDeleted doesn't change, unless formula is not a simple + + # if mustRehash2(t): + # enlarge(t) # RENAME enlarge... 
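
The deletion scheme this patch switches to is tombstoning: a removed slot keeps a `deletedMarker` hcode instead of being backward-shifted out by the previous Knuth 6.4R loop, lookups probe straight through tombstones, and inserts may recycle the first tombstone they met. A toy model of the idea follows, using an explicit state enum instead of the hcode sentinels the real code uses; every name in it is illustrative.

# Toy open-addressing table illustrating tombstone deletion (illustrative only).
type
  SlotState = enum slotFree, slotUsed, slotDeleted
  Slot = object
    state: SlotState
    key, val: int
  ToyTable = object
    data: seq[Slot]               # length is a power of two
    counter, countDeleted: int

proc initToyTable(size = 8): ToyTable = ToyTable(data: newSeq[Slot](size))

proc findSlot(t: ToyTable, key: int): int =
  ## Returns the slot of `key`, or -1 - insertionSlot when missing.
  var h = key and (t.data.len - 1)
  var firstDeleted = -1
  while true:                     # a real table would resize before it can fill up
    case t.data[h].state
    of slotUsed:
      if t.data[h].key == key: return h
    of slotDeleted:               # tombstone: remember it, keep probing
      if firstDeleted < 0: firstDeleted = h
    of slotFree:                  # end of the probe chain
      return -1 - (if firstDeleted >= 0: firstDeleted else: h)
    h = (h + 1) and (t.data.len - 1)   # linear probe keeps the sketch short

proc put(t: var ToyTable, key, val: int) =
  var i = t.findSlot(key)
  if i < 0:
    i = -1 - i
    if t.data[i].state == slotDeleted: dec t.countDeleted
    inc t.counter
  t.data[i] = Slot(state: slotUsed, key: key, val: val)

proc del(t: var ToyTable, key: int) =
  let i = t.findSlot(key)
  if i >= 0:
    t.data[i] = Slot(state: slotDeleted)   # leave a tombstone; no back-shifting
    dec t.counter
    inc t.countDeleted

when isMainModule:
  var t = initToyTable()
  t.put(1, 10); t.put(9, 90)      # 1 and 9 collide in an 8-slot table
  t.del(1)
  doAssert t.findSlot(9) >= 0     # still reachable: the tombstone keeps the chain intact

The stdlib patch encodes the same three states directly in `hcode` (0 free, -1 deleted), which is why `mustRehash` now has to look at `counter + countDeleted`: tombstones keep occupying probe-chain slots until the next enlarge.
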
+ # i = rawGetKnownHC(t, key, hc) # TODO + else: + block outer: + while true: # KnuthV3 Algo6.4R adapted for i=i+1 instead of i=i-1 + var j = i # The correctness of this depends on (h+1) in nextTry, + # though may be adaptable to other simple sequences. + t.data[i].hcode = 0 # mark current EMPTY + t.data[i].key = default(type(t.data[i].key)) + t.data[i].val = default(type(t.data[i].val)) + while true: + i = (i + 1) and msk # increment mod table size + if isEmpty(t.data[i].hcode): # end of collision cluster; So all done + break outer + var r = t.data[i].hcode and msk # "home" location of key@i + if not ((i >= r and r > j) or (r > j and j > i) or (j > i and i >= r)): + break + # TODO: can we instead just move the last in the chain? + when defined(js): + t.data[j] = t.data[i] + else: + t.data[j] = move(t.data[i]) # data[i] will be marked EMPTY next loop template delImpl() {.dirty.} = var hc: Hash @@ -124,8 +140,8 @@ template initImpl(result: typed, size: int) = result.last = -1 template insertImpl() = # for CountTable - if t.dataLen == 0: initImpl(t, defaultInitialSize) - if mustRehash(len(t.data), t.counter): enlarge(t) + checkIfInitialized() + if mustRehash2(t): enlarge(t) ctRawInsert(t, t.data, key, val) inc(t.counter) diff --git a/lib/pure/collections/tables.nim b/lib/pure/collections/tables.nim index 4b0980f172fa6..00696ebda54a0 100644 --- a/lib/pure/collections/tables.nim +++ b/lib/pure/collections/tables.nim @@ -233,6 +233,7 @@ type ## For creating an empty Table, use `initTable proc<#initTable,int>`_. data: KeyValuePairSeq[A, B] counter: int + countDeleted: int TableRef*[A, B] = ref Table[A, B] ## Ref version of `Table<#Table>`_. ## ## For creating a new empty TableRef, use `newTable proc @@ -267,11 +268,12 @@ template get(t, key): untyped = proc enlarge[A, B](t: var Table[A, B]) = var n: KeyValuePairSeq[A, B] - newSeq(n, len(t.data) * growthFactor) + newSeq(n, t.counter.rightSize) swap(t.data, n) + t.countDeleted = 0 for i in countup(0, high(n)): let eh = n[i].hcode - if isFilled(eh): + if isFilledValid(eh): var j: Hash = eh and maxHash(t) var perturb = eh while isFilled(t.data[j].hcode): @@ -671,7 +673,7 @@ iterator pairs*[A, B](t: Table[A, B]): (A, B) = ## # value: [1, 5, 7, 9] let L = len(t) for h in 0 .. high(t.data): - if isFilled(t.data[h].hcode): + if isFilledValid(t.data[h].hcode): yield (t.data[h].key, t.data[h].val) assert(len(t) == L, "the length of the table changed while iterating over it") @@ -693,7 +695,7 @@ iterator mpairs*[A, B](t: var Table[A, B]): (A, var B) = let L = len(t) for h in 0 .. high(t.data): - if isFilled(t.data[h].hcode): + if isFilledValid(t.data[h].hcode): yield (t.data[h].key, t.data[h].val) assert(len(t) == L, "the length of the table changed while iterating over it") @@ -714,7 +716,7 @@ iterator keys*[A, B](t: Table[A, B]): A = let L = len(t) for h in 0 .. high(t.data): - if isFilled(t.data[h].hcode): + if isFilledValid(t.data[h].hcode): yield t.data[h].key assert(len(t) == L, "the length of the table changed while iterating over it") @@ -735,7 +737,7 @@ iterator values*[A, B](t: Table[A, B]): B = let L = len(t) for h in 0 .. high(t.data): - if isFilled(t.data[h].hcode): + if isFilledValid(t.data[h].hcode): yield t.data[h].val assert(len(t) == L, "the length of the table changed while iterating over it") @@ -757,7 +759,7 @@ iterator mvalues*[A, B](t: var Table[A, B]): var B = let L = len(t) for h in 0 .. 
high(t.data): - if isFilled(t.data[h].hcode): + if isFilledValid(t.data[h].hcode): yield t.data[h].val assert(len(t) == L, "the length of the table changed while iterating over it") @@ -822,7 +824,7 @@ iterator allValues*[A, B](t: Table[A, B]; key: A): B = # const N = 20000 # can optimize this var cache1 {.noinit.}: array[N, Hash] var cache2: IntSet - while isFilled(t.data[h].hcode): + while isFilled(t.data[h].hcode): # CHECKME isFilledValid ?? if t.data[h].key == key: if not inCache(): yield t.data[h].val @@ -1282,7 +1284,7 @@ proc rawGet[A, B](t: OrderedTable[A, B], key: A, hc: var Hash): int = proc rawInsert[A, B](t: var OrderedTable[A, B], data: var OrderedKeyValuePairSeq[A, B], key: A, val: B, hc: Hash, h: Hash) = - rawInsertImpl() + rawInsertImpl(t) data[h].next = -1 if t.first < 0: t.first = h if t.last >= 0: data[t.last].next = h From 119076ac38850c7513c90ec58d65008555e1c123 Mon Sep 17 00:00:00 2001 From: Timothee Cour Date: Sun, 16 Feb 2020 01:25:03 -0800 Subject: [PATCH 04/19] _ --- lib/pure/hashes.nim | 47 +++++++++++++++++++-------------------------- 1 file changed, 20 insertions(+), 27 deletions(-) diff --git a/lib/pure/hashes.nim b/lib/pure/hashes.nim index ac8498517bc65..3f805c937048a 100644 --- a/lib/pure/hashes.nim +++ b/lib/pure/hashes.nim @@ -112,38 +112,31 @@ proc hash*[T: proc](x: T): Hash {.inline.} = else: result = hash(pointer(x)) -const - prime = uint(11) +proc hashUInt32*(x: uint32): Hash {.inline.} = + ## for internal use; user code should prefer `hash` overloads + # calling `hashUInt64(x)` would perform 1.736 worse, see thashes_perf toInt32 + when nimvm: # in vmops + doAssert false + else: + # inspired from https://gist.github.com/badboy/6267743 + var x = x xor ((x shr 20) xor (x shr 12)) + result = cast[Hash](x xor (x shr 7) xor (x shr 4)) -proc hash*(x: int): Hash {.inline.} = +const prime = uint(11) +proc hash*(x: int|int64|uint|uint64|char|Ordinal): Hash {.inline.} = ## Efficient hashing of integers. - result = cast[Hash](cast[uint](x) * prime) - -proc hash*(x: int64): Hash {.inline.} = - ## Efficient hashing of `int64` integers. - result = cast[Hash](cast[uint](x) * prime) - -proc hash*(x: uint): Hash {.inline.} = - ## Efficient hashing of unsigned integers. - result = cast[Hash](x * prime) - -proc hash*(x: uint64): Hash {.inline.} = - ## Efficient hashing of `uint64` integers. - result = cast[Hash](cast[uint](x) * prime) - -proc hash*(x: char): Hash {.inline.} = - ## Efficient hashing of characters. - result = cast[Hash](cast[uint](ord(x)) * prime) - -proc hash*[T: Ordinal](x: T): Hash {.inline.} = - ## Efficient hashing of other ordinal types (e.g. enums). - result = cast[Hash](cast[uint](ord(x)) * prime) + when sizeof(x) < 4: + cast[Hash](x) + elif sizeof(x) == 4: + cast[Hash](cast[uint](x) * prime) + # hashUInt32(x) + else: + cast[Hash](cast[uint](x) * prime) proc hash*(x: float): Hash {.inline.} = ## Efficient hashing of floats. - var y = x + 1.0 - result = cast[ptr Hash](addr(y))[] - + var y = x + 0.0 + result = hash(cast[ptr Hash](addr(y))[]) # Forward declarations before methods that hash containers. 
This allows # containers to contain other containers From 5458f10a903df7b34f6bd7685852dd1658c92426 Mon Sep 17 00:00:00 2001 From: Timothee Cour Date: Sun, 16 Feb 2020 02:18:05 -0800 Subject: [PATCH 05/19] fixup --- lib/pure/collections/hashcommon.nim | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/lib/pure/collections/hashcommon.nim b/lib/pure/collections/hashcommon.nim index f95eff3802175..ca19fcc9cb3b6 100644 --- a/lib/pure/collections/hashcommon.nim +++ b/lib/pure/collections/hashcommon.nim @@ -35,9 +35,8 @@ proc isFilled(hcode: Hash): bool {.inline.} = proc nextTry(h, maxHash: Hash, perturb: var Hash): Hash {.inline.} = - # TODO: shouldn't perturb be unsigned? const PERTURB_SHIFT = 5 - # SPEED IMPROVE + # TODO: make perturb (maybe even Hash) unsigned everywhere to avoid back and forth conversions var perturb2 = cast[uint](perturb) shr PERTURB_SHIFT perturb = cast[Hash](perturb2) result = ((5*h) + 1 + perturb) and maxHash @@ -48,9 +47,7 @@ proc mustRehash(length, counter: int): bool {.inline.} = proc mustRehash2[T](t: T): bool {.inline.} = let counter2 = t.counter + t.countDeleted - # echo (t.dataLen, t.counter, t.countDeleted, counter2) result = mustRehash(t.dataLen, counter2) - # echo ("mustRehash2", t.dataLen, t.counter, t.countDeleted, counter2, result) template rawGetKnownHCImpl() {.dirty.} = if t.dataLen == 0: @@ -76,7 +73,7 @@ template rawGetKnownHCImpl() {.dirty.} = else: break if deletedIndex == -1: - result = -1 - h # < 0 => MISSING; insert idx = -1 - result # TODO + result = -1 - h # < 0 => MISSING; insert idx = -1 - result else: # we prefer returning a (in fact the 1st found) deleted index result = -1 - deletedIndex From 36abf0e6ef1962dc85da2a38749da6551dffcf8b Mon Sep 17 00:00:00 2001 From: Timothee Cour Date: Sun, 16 Feb 2020 02:35:55 -0800 Subject: [PATCH 06/19] nim compiles --- lib/pure/collections/hashcommon.nim | 2 +- lib/pure/collections/setimpl.nim | 19 +++------------- lib/pure/collections/sets.nim | 2 ++ lib/pure/collections/tableimpl.nim | 35 ++++++----------------------- lib/pure/collections/tables.nim | 23 ++++++++++++------- 5 files changed, 28 insertions(+), 53 deletions(-) diff --git a/lib/pure/collections/hashcommon.nim b/lib/pure/collections/hashcommon.nim index ca19fcc9cb3b6..20e92df556a96 100644 --- a/lib/pure/collections/hashcommon.nim +++ b/lib/pure/collections/hashcommon.nim @@ -46,6 +46,7 @@ proc mustRehash(length, counter: int): bool {.inline.} = result = (length * 2 < counter * 3) or (length - counter < 4) # synchronize with `rightSize` proc mustRehash2[T](t: T): bool {.inline.} = + # static: echo $T let counter2 = t.counter + t.countDeleted result = mustRehash(t.dataLen, counter2) @@ -64,7 +65,6 @@ template rawGetKnownHCImpl() {.dirty.} = # TODO: optimize this: depending on type(key), skip hc comparison if t.data[h].hcode == hc and t.data[h].key == key: return h - # echo (h, perturb, maxHash(t)) h = nextTry(h, maxHash(t), perturb) elif t.data[h].hcode == deletedMarker: if deletedIndex == -1: diff --git a/lib/pure/collections/setimpl.nim b/lib/pure/collections/setimpl.nim index d2d1490ff6fb4..e372f3d4c1e6d 100644 --- a/lib/pure/collections/setimpl.nim +++ b/lib/pure/collections/setimpl.nim @@ -68,11 +68,6 @@ template containsOrInclImpl() {.dirty.} = rawInsert(s, s.data, key, hc, -1 - index) inc(s.counter) -template doWhile(a, b) = - while true: - b - if not a: break - proc exclImpl[A](s: var HashSet[A], key: A): bool {.inline.} = var hc: Hash var i = rawGet(s, key, hc) @@ -82,17 +77,9 @@ proc exclImpl[A](s: var HashSet[A], 
key: A): bool {.inline.} = if i >= 0: result = false dec(s.counter) - while true: # KnuthV3 Algo6.4R adapted for i=i+1 instead of i=i-1 - var j = i # The correctness of this depends on (h+1) in nextTry, - var r = j # though may be adaptable to other simple sequences. - s.data[i].hcode = 0 # mark current EMPTY - s.data[i].key = default(type(s.data[i].key)) - doWhile((i >= r and r > j) or (r > j and j > i) or (j > i and i >= r)): - i = (i + 1) and msk # increment mod table size - if isEmpty(s.data[i].hcode): # end of collision cluster; So all done - return - r = s.data[i].hcode and msk # "home" location of key@i - s.data[j] = move(s.data[i]) # data[i] will be marked EMPTY next loop + inc(s.countDeleted) + s.data[i].hcode = deletedMarker + s.data[i].key = default(type(s.data[i].key)) template dollarImpl() {.dirty.} = result = "{" diff --git a/lib/pure/collections/sets.nim b/lib/pure/collections/sets.nim index 9dd72cb565971..f27c955ee15df 100644 --- a/lib/pure/collections/sets.nim +++ b/lib/pure/collections/sets.nim @@ -68,6 +68,7 @@ type ## before calling other procs on it. data: KeyValuePairSeq[A] counter: int + countDeleted: int type OrderedKeyValuePair[A] = tuple[ @@ -80,6 +81,7 @@ type ## <#initOrderedSet,int>`_ before calling other procs on it. data: OrderedKeyValuePairSeq[A] counter, first, last: int + countDeleted: int const defaultInitialSize* = 64 diff --git a/lib/pure/collections/tableimpl.nim b/lib/pure/collections/tableimpl.nim index 558999720343a..d048a2f20a099 100644 --- a/lib/pure/collections/tableimpl.nim +++ b/lib/pure/collections/tableimpl.nim @@ -89,34 +89,13 @@ template delImplIdx(t, i) = if i >= 0: dec(t.counter) inc(t.countDeleted) - when true: - t.data[i].hcode = deletedMarker - t.data[i].key = default(type(t.data[i].key)) - t.data[i].val = default(type(t.data[i].val)) - # maybe not needed because counter+countDeleted doesn't change, unless formula is not a simple + - # if mustRehash2(t): - # enlarge(t) # RENAME enlarge... - # i = rawGetKnownHC(t, key, hc) # TODO - else: - block outer: - while true: # KnuthV3 Algo6.4R adapted for i=i+1 instead of i=i-1 - var j = i # The correctness of this depends on (h+1) in nextTry, - # though may be adaptable to other simple sequences. - t.data[i].hcode = 0 # mark current EMPTY - t.data[i].key = default(type(t.data[i].key)) - t.data[i].val = default(type(t.data[i].val)) - while true: - i = (i + 1) and msk # increment mod table size - if isEmpty(t.data[i].hcode): # end of collision cluster; So all done - break outer - var r = t.data[i].hcode and msk # "home" location of key@i - if not ((i >= r and r > j) or (r > j and j > i) or (j > i and i >= r)): - break - # TODO: can we instead just move the last in the chain? - when defined(js): - t.data[j] = t.data[i] - else: - t.data[j] = move(t.data[i]) # data[i] will be marked EMPTY next loop + t.data[i].hcode = deletedMarker + t.data[i].key = default(type(t.data[i].key)) + t.data[i].val = default(type(t.data[i].val)) + # maybe not needed because counter+countDeleted doesn't change, unless formula is not a simple + + # if mustRehash2(t): + # enlarge(t) # RENAME enlarge... + # i = rawGetKnownHC(t, key, hc) # TODO template delImpl() {.dirty.} = var hc: Hash diff --git a/lib/pure/collections/tables.nim b/lib/pure/collections/tables.nim index 00696ebda54a0..07274722c5659 100644 --- a/lib/pure/collections/tables.nim +++ b/lib/pure/collections/tables.nim @@ -1263,6 +1263,8 @@ type ## <#initOrderedTable,int>`_. 
data: OrderedKeyValuePairSeq[A, B] counter, first, last: int + countDeleted: int + OrderedTableRef*[A, B] = ref OrderedTable[A, B] ## Ref version of ## `OrderedTable<#OrderedTable>`_. ## @@ -1292,18 +1294,20 @@ proc rawInsert[A, B](t: var OrderedTable[A, B], proc enlarge[A, B](t: var OrderedTable[A, B]) = var n: OrderedKeyValuePairSeq[A, B] - newSeq(n, len(t.data) * growthFactor) + newSeq(n, t.counter.rightSize) var h = t.first t.first = -1 t.last = -1 swap(t.data, n) + t.countDeleted = 0 while h >= 0: var nxt = n[h].next let eh = n[h].hcode - if isFilled(eh): + if isFilledValid(eh): var j: Hash = eh and maxHash(t) + var perturb = eh while isFilled(t.data[j].hcode): - j = nextTry(j, maxHash(t)) + j = nextTry(j, maxHash(t), perturb) rawInsert(t, t.data, n[h].key, n[h].val, n[h].hcode, j) h = nxt @@ -2255,6 +2259,7 @@ type ## <#initCountTable,int>`_. data: seq[tuple[key: A, val: int]] counter: int + countDeleted: int isSorted: bool CountTableRef*[A] = ref CountTable[A] ## Ref version of ## `CountTable<#CountTable>`_. @@ -2267,8 +2272,9 @@ type proc ctRawInsert[A](t: CountTable[A], data: var seq[tuple[key: A, val: int]], key: A, val: int) = - var h: Hash = hash(key) and high(data) - while data[h].val != 0: h = nextTry(h, high(data)) + var perturb = hash(key) + var h: Hash = perturb and high(data) + while data[h].val != 0: h = nextTry(h, high(data), perturb) # TODO: hadnle deletedMarker data[h].key = key data[h].val = val @@ -2295,10 +2301,11 @@ proc remove[A](t: var CountTable[A], key: A) = proc rawGet[A](t: CountTable[A], key: A): int = if t.data.len == 0: return -1 - var h: Hash = hash(key) and high(t.data) # start with real hash value - while t.data[h].val != 0: + var perturb = hash(key) + var h: Hash = perturb and high(t.data) # start with real hash value + while t.data[h].val != 0: # TODO: may need to handle t.data[h].hcode == deletedMarker? if t.data[h].key == key: return h - h = nextTry(h, high(t.data)) + h = nextTry(h, high(t.data), perturb) result = -1 - h # < 0 => MISSING; insert idx = -1 - result template ctget(t, key, default: untyped): untyped = From 2e7b98de06287ab4d0492689ffdcb833dc778981 Mon Sep 17 00:00:00 2001 From: Timothee Cour Date: Tue, 18 Feb 2020 11:35:52 -0800 Subject: [PATCH 07/19] address comments --- lib/pure/collections/hashcommon.nim | 4 ++-- lib/pure/collections/tables.nim | 16 ++++++++-------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/lib/pure/collections/hashcommon.nim b/lib/pure/collections/hashcommon.nim index 20e92df556a96..8eac7c81c138f 100644 --- a/lib/pure/collections/hashcommon.nim +++ b/lib/pure/collections/hashcommon.nim @@ -27,7 +27,7 @@ when false: const freeMarker = 0 const deletedMarker = -1 -proc isFilledValid(hcode: Hash): bool {.inline.} = +proc isFilledAndValid(hcode: Hash): bool {.inline.} = result = hcode != 0 and hcode != deletedMarker # SPEED: could improve w bit magic proc isFilled(hcode: Hash): bool {.inline.} = @@ -57,7 +57,7 @@ template rawGetKnownHCImpl() {.dirty.} = var perturb = hc var deletedIndex = -1 while true: - if isFilledValid(t.data[h].hcode): + if isFilledAndValid(t.data[h].hcode): # Compare hc THEN key with boolean short circuit. This makes the common case # zero ==key's for missing (e.g.inserts) and exactly one ==key for present. 
# It does slow down succeeding lookups by one extra Hash cmp&and..usually diff --git a/lib/pure/collections/tables.nim b/lib/pure/collections/tables.nim index 07274722c5659..031e5c3efbae9 100644 --- a/lib/pure/collections/tables.nim +++ b/lib/pure/collections/tables.nim @@ -273,7 +273,7 @@ proc enlarge[A, B](t: var Table[A, B]) = t.countDeleted = 0 for i in countup(0, high(n)): let eh = n[i].hcode - if isFilledValid(eh): + if isFilledAndValid(eh): var j: Hash = eh and maxHash(t) var perturb = eh while isFilled(t.data[j].hcode): @@ -673,7 +673,7 @@ iterator pairs*[A, B](t: Table[A, B]): (A, B) = ## # value: [1, 5, 7, 9] let L = len(t) for h in 0 .. high(t.data): - if isFilledValid(t.data[h].hcode): + if isFilledAndValid(t.data[h].hcode): yield (t.data[h].key, t.data[h].val) assert(len(t) == L, "the length of the table changed while iterating over it") @@ -695,7 +695,7 @@ iterator mpairs*[A, B](t: var Table[A, B]): (A, var B) = let L = len(t) for h in 0 .. high(t.data): - if isFilledValid(t.data[h].hcode): + if isFilledAndValid(t.data[h].hcode): yield (t.data[h].key, t.data[h].val) assert(len(t) == L, "the length of the table changed while iterating over it") @@ -716,7 +716,7 @@ iterator keys*[A, B](t: Table[A, B]): A = let L = len(t) for h in 0 .. high(t.data): - if isFilledValid(t.data[h].hcode): + if isFilledAndValid(t.data[h].hcode): yield t.data[h].key assert(len(t) == L, "the length of the table changed while iterating over it") @@ -737,7 +737,7 @@ iterator values*[A, B](t: Table[A, B]): B = let L = len(t) for h in 0 .. high(t.data): - if isFilledValid(t.data[h].hcode): + if isFilledAndValid(t.data[h].hcode): yield t.data[h].val assert(len(t) == L, "the length of the table changed while iterating over it") @@ -759,7 +759,7 @@ iterator mvalues*[A, B](t: var Table[A, B]): var B = let L = len(t) for h in 0 .. high(t.data): - if isFilledValid(t.data[h].hcode): + if isFilledAndValid(t.data[h].hcode): yield t.data[h].val assert(len(t) == L, "the length of the table changed while iterating over it") @@ -824,7 +824,7 @@ iterator allValues*[A, B](t: Table[A, B]; key: A): B = # const N = 20000 # can optimize this var cache1 {.noinit.}: array[N, Hash] var cache2: IntSet - while isFilled(t.data[h].hcode): # CHECKME isFilledValid ?? + while isFilled(t.data[h].hcode): # CHECKME isFilledAndValid ?? 
if t.data[h].key == key: if not inCache(): yield t.data[h].val @@ -1303,7 +1303,7 @@ proc enlarge[A, B](t: var OrderedTable[A, B]) = while h >= 0: var nxt = n[h].next let eh = n[h].hcode - if isFilledValid(eh): + if isFilledAndValid(eh): var j: Hash = eh and maxHash(t) var perturb = eh while isFilled(t.data[j].hcode): From bc86a8024e5f0b1bbe480c2f20fc4f3b67e2a0a9 Mon Sep 17 00:00:00 2001 From: Timothee Cour Date: Tue, 18 Feb 2020 11:47:31 -0800 Subject: [PATCH 08/19] _ --- lib/pure/collections/intsets.nim | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/lib/pure/collections/intsets.nim b/lib/pure/collections/intsets.nim index 02ee10607c3a1..6b36553a967b2 100644 --- a/lib/pure/collections/intsets.nim +++ b/lib/pure/collections/intsets.nim @@ -51,13 +51,15 @@ type a: array[0..33, int] # profiling shows that 34 elements are enough proc mustRehash(length, counter: int): bool {.inline.} = + # FACTOR with hashcommon.mustRehash assert(length > counter) result = (length * 2 < counter * 3) or (length - counter < 4) -# FACTOR proc nextTry(h, maxHash: Hash, perturb: var Hash): Hash {.inline.} = + # FACTOR with hashcommon.nextTry const PERTURB_SHIFT = 5 - perturb = perturb shr PERTURB_SHIFT + var perturb2 = cast[uint](perturb) shr PERTURB_SHIFT + perturb = cast[Hash](perturb2) result = ((5*h) + 1 + perturb) and maxHash proc intSetGet(t: IntSet, key: int): PTrunk = From 86108c091782e63d6bbee43d1b327789070f2f57 Mon Sep 17 00:00:00 2001 From: Timothee Cour Date: Tue, 18 Feb 2020 11:49:57 -0800 Subject: [PATCH 09/19] fixup --- lib/pure/collections/hashcommon.nim | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/lib/pure/collections/hashcommon.nim b/lib/pure/collections/hashcommon.nim index 8eac7c81c138f..905488af92138 100644 --- a/lib/pure/collections/hashcommon.nim +++ b/lib/pure/collections/hashcommon.nim @@ -18,15 +18,11 @@ when not defined(nimHasDefault): var v: T v -# hcode for real keys cannot be zero. hcode==0 signifies an empty slot. These -# two procs retain clarity of that encoding without the space cost of an enum. -when false: - proc isEmpty(hcode: Hash): bool {.inline.} = - result = hcode == 0 - const freeMarker = 0 const deletedMarker = -1 +# hcode for real keys cannot be zero. hcode==0 signifies an empty slot. These +# two procs retain clarity of that encoding without the space cost of an enum. 
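
Because 0 and -1 are reserved as the free/deleted markers, a key whose real hash happens to be one of those two values has to be remapped before it is stored, which is what the `genHashImpl` branch earlier in the series does. A minimal restatement of that remapping for illustration; the replacement constants are the ones the patch picks, but any non-sentinel values would do.

proc sanitizeHash(hc: int): int =
  ## Keep real hash codes out of the two reserved slot markers.
  result = hc
  if result == 0: result = 314159265       # 0 would read as "free slot"
  elif result == -1: result = 214159261    # -1 would read as "deleted slot" (tombstone)

doAssert sanitizeHash(0) == 314159265
doAssert sanitizeHash(-1) == 214159261
doAssert sanitizeHash(12345) == 12345
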
proc isFilledAndValid(hcode: Hash): bool {.inline.} = result = hcode != 0 and hcode != deletedMarker # SPEED: could improve w bit magic From e08bd320e34ef5f22c597d1fd4b57faf7a3b89b7 Mon Sep 17 00:00:00 2001 From: Timothee Cour Date: Tue, 18 Feb 2020 14:24:36 -0800 Subject: [PATCH 10/19] fix tests --- lib/pure/collections/hashcommon.nim | 1 - lib/pure/collections/intsets.nim | 9 ++++- lib/pure/collections/sets.nim | 8 ++-- lib/pure/collections/sharedtables.nim | 4 +- lib/pure/collections/tables.nim | 54 +++++++++------------------ lib/pure/testutils.nim | 8 ++++ tests/arc/trepr.nim | 2 +- tests/collections/ttables.nim | 13 +++++-- tests/collections/ttablesthreads.nim | 5 +-- 9 files changed, 53 insertions(+), 51 deletions(-) create mode 100644 lib/pure/testutils.nim diff --git a/lib/pure/collections/hashcommon.nim b/lib/pure/collections/hashcommon.nim index 905488af92138..d7b0e73e397de 100644 --- a/lib/pure/collections/hashcommon.nim +++ b/lib/pure/collections/hashcommon.nim @@ -42,7 +42,6 @@ proc mustRehash(length, counter: int): bool {.inline.} = result = (length * 2 < counter * 3) or (length - counter < 4) # synchronize with `rightSize` proc mustRehash2[T](t: T): bool {.inline.} = - # static: echo $T let counter2 = t.counter + t.countDeleted result = mustRehash(t.dataLen, counter2) diff --git a/lib/pure/collections/intsets.nim b/lib/pure/collections/intsets.nim index 6b36553a967b2..f6aa2b0c27a65 100644 --- a/lib/pure/collections/intsets.nim +++ b/lib/pure/collections/intsets.nim @@ -46,6 +46,7 @@ type IntSet* = object ## An efficient set of `int` implemented as a sparse bit set. elems: int # only valid for small numbers counter, max: int + countDeleted: int head: PTrunk data: TrunkSeq a: array[0..33, int] # profiling shows that 34 elements are enough @@ -55,6 +56,11 @@ proc mustRehash(length, counter: int): bool {.inline.} = assert(length > counter) result = (length * 2 < counter * 3) or (length - counter < 4) +proc mustRehash2[T](t: T): bool {.inline.} = + # FACTOR with hashcommon.mustRehash2 + let counter2 = t.counter + t.countDeleted + result = mustRehash(t.max + 1, counter2) + proc nextTry(h, maxHash: Hash, perturb: var Hash): Hash {.inline.} = # FACTOR with hashcommon.nextTry const PERTURB_SHIFT = 5 @@ -96,7 +102,7 @@ proc intSetPut(t: var IntSet, key: int): PTrunk = if t.data[h].key == key: return t.data[h] h = nextTry(h, t.max, perturb) - if mustRehash(t.max + 1, t.counter): intSetEnlarge(t) + if mustRehash2(t): intSetEnlarge(t) inc(t.counter) h = key and t.max perturb = key @@ -109,6 +115,7 @@ proc intSetPut(t: var IntSet, key: int): PTrunk = t.data[h] = result proc bitincl(s: var IntSet, key: int) {.inline.} = + var ret: PTrunk var t = intSetPut(s, `shr`(key, TrunkShift)) var u = key and TrunkMask t.bits[u shr IntShift] = t.bits[u shr IntShift] or diff --git a/lib/pure/collections/sets.nim b/lib/pure/collections/sets.nim index f27c955ee15df..6edcd65abf282 100644 --- a/lib/pure/collections/sets.nim +++ b/lib/pure/collections/sets.nim @@ -252,7 +252,7 @@ iterator items*[A](s: HashSet[A]): A = ## echo b ## # --> {(a: 1, b: 3), (a: 0, b: 4)} for h in 0 .. high(s.data): - if isFilled(s.data[h].hcode): yield s.data[h].key + if isFilledAndValid(s.data[h].hcode): yield s.data[h].key proc containsOrIncl*[A](s: var HashSet[A], key: A): bool = ## Includes `key` in the set `s` and tells if `key` was already in `s`. @@ -344,7 +344,7 @@ proc pop*[A](s: var HashSet[A]): A = doAssertRaises(KeyError, echo s.pop) for h in 0 .. 
high(s.data): - if isFilled(s.data[h].hcode): + if isFilledAndValid(s.data[h].hcode): result = s.data[h].key excl(s, result) return result @@ -636,7 +636,7 @@ template forAllOrderedPairs(yieldStmt: untyped) {.dirty.} = var idx = 0 while h >= 0: var nxt = s.data[h].next - if isFilled(s.data[h].hcode): + if isFilledAndValid(s.data[h].hcode): yieldStmt inc(idx) h = nxt @@ -870,7 +870,7 @@ proc `==`*[A](s, t: OrderedSet[A]): bool = while h >= 0 and g >= 0: var nxh = s.data[h].next var nxg = t.data[g].next - if isFilled(s.data[h].hcode) and isFilled(t.data[g].hcode): + if isFilledAndValid(s.data[h].hcode) and isFilledAndValid(t.data[g].hcode): if s.data[h].key == t.data[g].key: inc compared else: diff --git a/lib/pure/collections/sharedtables.nim b/lib/pure/collections/sharedtables.nim index ae623790bfa0a..2a48e60f03559 100644 --- a/lib/pure/collections/sharedtables.nim +++ b/lib/pure/collections/sharedtables.nim @@ -25,6 +25,7 @@ type SharedTable*[A, B] = object ## generic hash SharedTable data: KeyValuePairSeq[A, B] counter, dataLen: int + countDeleted: int lock: Lock template maxHash(t): untyped = t.dataLen-1 @@ -49,9 +50,10 @@ proc enlarge[A, B](t: var SharedTable[A, B]) = for i in 0.. Date: Tue, 18 Feb 2020 15:30:48 -0800 Subject: [PATCH 11/19] fix allValues --- lib/pure/collections/tables.nim | 24 +++++++++--------------- lib/pure/testutils.nim | 5 +++++ tests/collections/ttables.nim | 18 ++++++++++++++++++ 3 files changed, 32 insertions(+), 15 deletions(-) diff --git a/lib/pure/collections/tables.nim b/lib/pure/collections/tables.nim index b71ce90b1e188..29340dc9e8f6d 100644 --- a/lib/pure/collections/tables.nim +++ b/lib/pure/collections/tables.nim @@ -786,19 +786,13 @@ iterator allValues*[A, B](t: Table[A, B]; key: A): B = ## Used if you have a table with duplicate keys (as a result of using ## `add proc<#add,Table[A,B],A,B>`_). ## - ## **Examples:** - ## - ## .. code-block:: - ## var a = {'a': 3, 'b': 5}.toTable - ## for i in 1..3: - ## a.add('z', 10*i) - ## echo a # {'a': 3, 'b': 5, 'z': 10, 'z': 20, 'z': 30} - ## - ## for v in a.allValues('z'): - ## echo v - ## # 10 - ## # 20 - ## # 30 + runnableExamples: + import testutils + var a = {'a': 3, 'b': 5}.toTable + for i in 1..3: a.add('z', 10*i) + doAssert a.sortedPairs == @[('a', 3), ('b', 5), ('z', 10), ('z', 20), ('z', 30)] + doAssert sortedItems(a.allValues('z')) == @[10, 20, 30] + let hc = genHash(key) var h: Hash = hc and high(t.data) let L = len(t) @@ -806,8 +800,8 @@ iterator allValues*[A, B](t: Table[A, B]; key: A): B = var num = 0 var cache: seq[Hash] - while isFilled(t.data[h].hcode): # CHECKME isFilledAndValid ?? - if t.data[h].key == key: + while isFilled(t.data[h].hcode): # `isFilledAndValid` would be incorrect, see test for `allValues` + if t.data[h].hcode == hc and t.data[h].key == key: if not hasKeyOrPutCache(cache, h): yield t.data[h].val assert(len(t) == L, "the length of the table changed while iterating over it") diff --git a/lib/pure/testutils.nim b/lib/pure/testutils.nim index 7e56064975372..ddbbc7dec1075 100644 --- a/lib/pure/testutils.nim +++ b/lib/pure/testutils.nim @@ -6,3 +6,8 @@ proc sortedPairs*[T](t: T): auto = ## helps when writing tests involving tables in a way that's robust to ## changes in hashing functions / table implementation. toSeq(t.pairs).sorted + +template sortedItems*(t: untyped): untyped = + ## helps when writing tests involving tables in a way that's robust to + ## changes in hashing functions / table implementation. 
+ sorted(toSeq(t)) diff --git a/tests/collections/ttables.nim b/tests/collections/ttables.nim index 304e99795b000..7459b3f137c4b 100644 --- a/tests/collections/ttables.nim +++ b/tests/collections/ttables.nim @@ -193,6 +193,23 @@ block ttables2: run1() echo "2" +block allValues: + var t: Table[int, string] + var key = 0 + let n = 1000 + for i in 0.. Date: Tue, 18 Feb 2020 19:12:33 -0800 Subject: [PATCH 12/19] fixup --- lib/pure/collections/tables.nim | 2 +- lib/pure/hashes.nim | 21 ++------------------- 2 files changed, 3 insertions(+), 20 deletions(-) diff --git a/lib/pure/collections/tables.nim b/lib/pure/collections/tables.nim index 29340dc9e8f6d..07058d4990695 100644 --- a/lib/pure/collections/tables.nim +++ b/lib/pure/collections/tables.nim @@ -2250,7 +2250,7 @@ proc ctRawInsert[A](t: CountTable[A], data: var seq[tuple[key: A, val: int]], key: A, val: int) = var perturb = hash(key) var h: Hash = perturb and high(data) - while data[h].val != 0: h = nextTry(h, high(data), perturb) # TODO: hadnle deletedMarker + while data[h].val != 0: h = nextTry(h, high(data), perturb) # TODO: handle deletedMarker data[h].key = key data[h].val = val diff --git a/lib/pure/hashes.nim b/lib/pure/hashes.nim index 3f805c937048a..b55a7865dd909 100644 --- a/lib/pure/hashes.nim +++ b/lib/pure/hashes.nim @@ -112,30 +112,13 @@ proc hash*[T: proc](x: T): Hash {.inline.} = else: result = hash(pointer(x)) -proc hashUInt32*(x: uint32): Hash {.inline.} = - ## for internal use; user code should prefer `hash` overloads - # calling `hashUInt64(x)` would perform 1.736 worse, see thashes_perf toInt32 - when nimvm: # in vmops - doAssert false - else: - # inspired from https://gist.github.com/badboy/6267743 - var x = x xor ((x shr 20) xor (x shr 12)) - result = cast[Hash](x xor (x shr 7) xor (x shr 4)) - -const prime = uint(11) proc hash*(x: int|int64|uint|uint64|char|Ordinal): Hash {.inline.} = ## Efficient hashing of integers. - when sizeof(x) < 4: - cast[Hash](x) - elif sizeof(x) == 4: - cast[Hash](cast[uint](x) * prime) - # hashUInt32(x) - else: - cast[Hash](cast[uint](x) * prime) + cast[Hash](ord(x)) proc hash*(x: float): Hash {.inline.} = ## Efficient hashing of floats. - var y = x + 0.0 + var y = x + 0.0 # for denormalization result = hash(cast[ptr Hash](addr(y))[]) # Forward declarations before methods that hash containers. This allows From a77e64679dc4e54fdae0e65e0c62d338b9b39aaa Mon Sep 17 00:00:00 2001 From: Timothee Cour Date: Tue, 18 Feb 2020 21:11:27 -0800 Subject: [PATCH 13/19] use translateBits optimization that further helps with #13393 --- lib/pure/collections/hashcommon.nim | 31 +++++++++++++++++++++------ lib/pure/collections/intsets.nim | 2 ++ lib/pure/collections/sharedtables.nim | 2 +- lib/pure/collections/tableimpl.nim | 2 +- lib/pure/collections/tables.nim | 10 ++++----- 5 files changed, 33 insertions(+), 14 deletions(-) diff --git a/lib/pure/collections/hashcommon.nim b/lib/pure/collections/hashcommon.nim index d7b0e73e397de..13fb73a6dca0c 100644 --- a/lib/pure/collections/hashcommon.nim +++ b/lib/pure/collections/hashcommon.nim @@ -10,9 +10,13 @@ # An ``include`` file which contains common code for # hash sets and tables. 
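
The `hashes.nim` simplification in the previous patch makes integer hashing the identity (the table's masking and perturbation now do the mixing) and keeps the float path consistent with `==` by normalising the value first, so `-0.0` and `0.0` hash alike. A small sketch of both ideas, assuming a 64-bit platform; `toyHash` is illustrative and not the stdlib proc.

import hashes

proc toyHash(x: int): Hash = cast[Hash](x)   # identity: masking + perturb do the mixing
proc toyHash(x: float): Hash =
  var y = x + 0.0                            # folds -0.0 into +0.0 so equal floats hash equally
  toyHash(cast[ptr int](addr y)[])

doAssert toyHash(-0.0) == toyHash(0.0)
doAssert toyHash(42) == 42
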
+# type Hash = uint + const growthFactor = 2 +from bitops import fastLog2 + when not defined(nimHasDefault): template default[T](t: typedesc[T]): T = var v: T @@ -29,13 +33,19 @@ proc isFilledAndValid(hcode: Hash): bool {.inline.} = proc isFilled(hcode: Hash): bool {.inline.} = result = hcode != 0 +type UHash* = uint -proc nextTry(h, maxHash: Hash, perturb: var Hash): Hash {.inline.} = - const PERTURB_SHIFT = 5 - # TODO: make perturb (maybe even Hash) unsigned everywhere to avoid back and forth conversions - var perturb2 = cast[uint](perturb) shr PERTURB_SHIFT - perturb = cast[Hash](perturb2) - result = ((5*h) + 1 + perturb) and maxHash +proc translateBits(a: UHash, numBitsMask: int): UHash {.inline.} = + result = (a shr numBitsMask) or (a shl (UHash.sizeof * 8 - numBitsMask)) + +proc nextTry(h, maxHash: Hash, perturb: var UHash): Hash {.inline.} = + # an optimization would be to use `(h + 1) and maxHash` for a few iterations + # and then switch to the formula below, to get "best of both worlds": good + # cache locality, except when a collision cluster is detected (ie, large number + # of iterations). + const PERTURB_SHIFT = 5 # consider using instead `numBitsMask=fastLog2(t.dataLen)` + result = cast[Hash]((5*cast[uint](h) + 1 + perturb) and cast[uint](maxHash)) + perturb = perturb shr PERTURB_SHIFT proc mustRehash(length, counter: int): bool {.inline.} = assert(length > counter) @@ -45,11 +55,18 @@ proc mustRehash2[T](t: T): bool {.inline.} = let counter2 = t.counter + t.countDeleted result = mustRehash(t.dataLen, counter2) +template getPerturb*(t: typed, hc: Hash): UHash = + let numBitsMask=fastLog2(dataLen(t)) # TODO: cache/store in t if needed + # this makes a major difference for cases like #13393; it causes the bits + # that were masked out in 1st position so they'll be masked in instead, and + # influence the recursion in nextTry earlier rather than later. + translateBits(cast[uint](hc), numBitsMask) + template rawGetKnownHCImpl() {.dirty.} = if t.dataLen == 0: return -1 var h: Hash = hc and maxHash(t) # start with real hash value - var perturb = hc + var perturb = t.getPerturb(hc) var deletedIndex = -1 while true: if isFilledAndValid(t.data[h].hcode): diff --git a/lib/pure/collections/intsets.nim b/lib/pure/collections/intsets.nim index f6aa2b0c27a65..a8b804be55607 100644 --- a/lib/pure/collections/intsets.nim +++ b/lib/pure/collections/intsets.nim @@ -22,6 +22,8 @@ import hashes +# from hashcommon import UHash + type BitScalar = uint diff --git a/lib/pure/collections/sharedtables.nim b/lib/pure/collections/sharedtables.nim index 2a48e60f03559..f25267e109250 100644 --- a/lib/pure/collections/sharedtables.nim +++ b/lib/pure/collections/sharedtables.nim @@ -50,7 +50,7 @@ proc enlarge[A, B](t: var SharedTable[A, B]) = for i in 0.. 
Date: Tue, 18 Feb 2020 21:26:44 -0800 Subject: [PATCH 14/19] fixup --- lib/pure/collections/tables.nim | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/lib/pure/collections/tables.nim b/lib/pure/collections/tables.nim index bbd704fc265de..0502f615cf75f 100644 --- a/lib/pure/collections/tables.nim +++ b/lib/pure/collections/tables.nim @@ -2248,8 +2248,9 @@ type proc ctRawInsert[A](t: CountTable[A], data: var seq[tuple[key: A, val: int]], key: A, val: int) = - var perturb = t.getPerturb(hash(key)) - var h: Hash = perturb and high(data) + let hc = hash(key) + var perturb = t.getPerturb(hc) + var h: Hash = hc and high(data) while data[h].val != 0: h = nextTry(h, high(data), perturb) # TODO: handle deletedMarker data[h].key = key data[h].val = val @@ -2277,8 +2278,9 @@ proc remove[A](t: var CountTable[A], key: A) = proc rawGet[A](t: CountTable[A], key: A): int = if t.data.len == 0: return -1 - var perturb = t.getPerturb(hash(key)) - var h: Hash = perturb and high(t.data) # start with real hash value + let hc = hash(key) + var perturb = t.getPerturb(hc) + var h: Hash = hc and high(t.data) # start with real hash value while t.data[h].val != 0: # TODO: may need to handle t.data[h].hcode == deletedMarker? if t.data[h].key == key: return h h = nextTry(h, high(t.data), perturb) From 5b639730490266191d96cd1b8694557394223970 Mon Sep 17 00:00:00 2001 From: Timothee Cour Date: Tue, 18 Feb 2020 22:04:31 -0800 Subject: [PATCH 15/19] _ --- lib/pure/collections/hashcommon.nim | 9 ++++----- lib/pure/collections/intsets.nim | 2 -- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/lib/pure/collections/hashcommon.nim b/lib/pure/collections/hashcommon.nim index 13fb73a6dca0c..672f03e371c30 100644 --- a/lib/pure/collections/hashcommon.nim +++ b/lib/pure/collections/hashcommon.nim @@ -10,13 +10,9 @@ # An ``include`` file which contains common code for # hash sets and tables. -# type Hash = uint - const growthFactor = 2 -from bitops import fastLog2 - when not defined(nimHasDefault): template default[T](t: typedesc[T]): T = var v: T @@ -55,8 +51,11 @@ proc mustRehash2[T](t: T): bool {.inline.} = let counter2 = t.counter + t.countDeleted result = mustRehash(t.dataLen, counter2) +# from bitops import fastLog2 # PRTEMP + template getPerturb*(t: typed, hc: Hash): UHash = - let numBitsMask=fastLog2(dataLen(t)) # TODO: cache/store in t if needed + # let numBitsMask = fastLog2(dataLen(t)) # TODO: cache/store in t if needed + let numBitsMask = 16 # PRTEMP can't use fastLog2 # this makes a major difference for cases like #13393; it causes the bits # that were masked out in 1st position so they'll be masked in instead, and # influence the recursion in nextTry earlier rather than later. 
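
`getPerturb` feeds `nextTry` a rotated copy of the hash: `translateBits` rotates right by `numBitsMask`, so the bits that `hc and maxHash` throws away become the low bits of `perturb` and steer the very first probes, which is the point of the #13393 fix. A standalone illustration of the rotation, assuming a 64-bit `uint`; the constants are made up for the example.

# Sketch of the `translateBits` rotation used by `getPerturb` above (illustrative).
proc rotatedPerturb(hc: uint, numBitsMask: int): uint =
  (hc shr numBitsMask) or (hc shl (sizeof(uint) * 8 - numBitsMask))

when isMainModule:
  let mask = 0xFF'u            # pretend the table has 256 slots
  let a = 0x1234_5600'u        # two hashes that collide: identical low 8 bits ...
  let b = 0xABCD_EF00'u
  doAssert (a and mask) == (b and mask)
  # ... but their rotated perturbs differ immediately, so nextTry separates them fast.
  doAssert rotatedPerturb(a, 8) != rotatedPerturb(b, 8)
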
diff --git a/lib/pure/collections/intsets.nim b/lib/pure/collections/intsets.nim index a8b804be55607..f6aa2b0c27a65 100644 --- a/lib/pure/collections/intsets.nim +++ b/lib/pure/collections/intsets.nim @@ -22,8 +22,6 @@ import hashes -# from hashcommon import UHash - type BitScalar = uint From d52d5e3f3bb3678c38bb3ed3109f66021e0e2969 Mon Sep 17 00:00:00 2001 From: Timothee Cour Date: Tue, 18 Feb 2020 22:49:01 -0800 Subject: [PATCH 16/19] use sizeof(Hash) * 4 instead of fastLog2(dataLen(t)) --- lib/pure/collections/hashcommon.nim | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/lib/pure/collections/hashcommon.nim b/lib/pure/collections/hashcommon.nim index 672f03e371c30..748fa84e083cd 100644 --- a/lib/pure/collections/hashcommon.nim +++ b/lib/pure/collections/hashcommon.nim @@ -39,7 +39,7 @@ proc nextTry(h, maxHash: Hash, perturb: var UHash): Hash {.inline.} = # and then switch to the formula below, to get "best of both worlds": good # cache locality, except when a collision cluster is detected (ie, large number # of iterations). - const PERTURB_SHIFT = 5 # consider using instead `numBitsMask=fastLog2(t.dataLen)` + const PERTURB_SHIFT = 5 # consider tying this to `numBitsMask = fastLog2(t.dataLen)` result = cast[Hash]((5*cast[uint](h) + 1 + perturb) and cast[uint](maxHash)) perturb = perturb shr PERTURB_SHIFT @@ -51,11 +51,10 @@ proc mustRehash2[T](t: T): bool {.inline.} = let counter2 = t.counter + t.countDeleted result = mustRehash(t.dataLen, counter2) -# from bitops import fastLog2 # PRTEMP - template getPerturb*(t: typed, hc: Hash): UHash = - # let numBitsMask = fastLog2(dataLen(t)) # TODO: cache/store in t if needed - let numBitsMask = 16 # PRTEMP can't use fastLog2 + # we can't use `fastLog2(dataLen(t))` because importing `bitops` would cause codegen errors + # so we use a practical value of half the bit width (eg 64 / 2 = 32 on 64bit machines) + let numBitsMask = sizeof(Hash) * 4 # ie, sizeof(Hash) * 8 / 2 # this makes a major difference for cases like #13393; it causes the bits # that were masked out in 1st position so they'll be masked in instead, and # influence the recursion in nextTry earlier rather than later. From 3f956aa3812cfc2322ba11b16b3977e4039d014d Mon Sep 17 00:00:00 2001 From: Timothee Cour Date: Tue, 18 Feb 2020 23:24:08 -0800 Subject: [PATCH 17/19] fixup --- lib/pure/collections/hashcommon.nim | 5 +++-- lib/pure/collections/tableimpl.nim | 5 +---- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/lib/pure/collections/hashcommon.nim b/lib/pure/collections/hashcommon.nim index 748fa84e083cd..b2b4b57d66c7a 100644 --- a/lib/pure/collections/hashcommon.nim +++ b/lib/pure/collections/hashcommon.nim @@ -24,7 +24,8 @@ const deletedMarker = -1 # hcode for real keys cannot be zero. hcode==0 signifies an empty slot. These # two procs retain clarity of that encoding without the space cost of an enum. proc isFilledAndValid(hcode: Hash): bool {.inline.} = - result = hcode != 0 and hcode != deletedMarker # SPEED: could improve w bit magic + result = hcode != 0 and hcode != deletedMarker + # performance: we could use bit magic if needed proc isFilled(hcode: Hash): bool {.inline.} = result = hcode != 0 @@ -72,7 +73,7 @@ template rawGetKnownHCImpl() {.dirty.} = # zero ==key's for missing (e.g.inserts) and exactly one ==key for present. # It does slow down succeeding lookups by one extra Hash cmp&and..usually # just a few clock cycles, generally worth it for any non-integer-like A. 
- # TODO: optimize this: depending on type(key), skip hc comparison + # performance: we optimize this: depending on type(key), skip hc comparison if t.data[h].hcode == hc and t.data[h].key == key: return h h = nextTry(h, maxHash(t), perturb) diff --git a/lib/pure/collections/tableimpl.nim b/lib/pure/collections/tableimpl.nim index c9ab975f08a5d..df05c3bb87b77 100644 --- a/lib/pure/collections/tableimpl.nim +++ b/lib/pure/collections/tableimpl.nim @@ -92,10 +92,7 @@ template delImplIdx(t, i) = t.data[i].hcode = deletedMarker t.data[i].key = default(type(t.data[i].key)) t.data[i].val = default(type(t.data[i].val)) - # maybe not needed because counter+countDeleted doesn't change, unless formula is not a simple + - # if mustRehash2(t): - # enlarge(t) # RENAME enlarge... - # i = rawGetKnownHC(t, key, hc) # TODO + # mustRehash2 + enlarge not needed because counter+countDeleted doesn't change template delImpl() {.dirty.} = var hc: Hash From f45e1f53cbde9cb1d0b206cc540f3ae399228e30 Mon Sep 17 00:00:00 2001 From: Timothee Cour Date: Wed, 19 Feb 2020 01:18:21 -0800 Subject: [PATCH 18/19] address comments --- lib/pure/collections/hashcommon.nim | 3 ++- lib/pure/testutils.nim | 4 +++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/lib/pure/collections/hashcommon.nim b/lib/pure/collections/hashcommon.nim index b2b4b57d66c7a..95ecd8447067b 100644 --- a/lib/pure/collections/hashcommon.nim +++ b/lib/pure/collections/hashcommon.nim @@ -21,6 +21,8 @@ when not defined(nimHasDefault): const freeMarker = 0 const deletedMarker = -1 +type UHash = uint + # hcode for real keys cannot be zero. hcode==0 signifies an empty slot. These # two procs retain clarity of that encoding without the space cost of an enum. proc isFilledAndValid(hcode: Hash): bool {.inline.} = @@ -30,7 +32,6 @@ proc isFilledAndValid(hcode: Hash): bool {.inline.} = proc isFilled(hcode: Hash): bool {.inline.} = result = hcode != 0 -type UHash* = uint proc translateBits(a: UHash, numBitsMask: int): UHash {.inline.} = result = (a shr numBitsMask) or (a shl (UHash.sizeof * 8 - numBitsMask)) diff --git a/lib/pure/testutils.nim b/lib/pure/testutils.nim index ddbbc7dec1075..bdf90b61635e2 100644 --- a/lib/pure/testutils.nim +++ b/lib/pure/testutils.nim @@ -1,4 +1,6 @@ -## utilities to help writing tests +## Utilities to help writing tests +## +## Unstable, experimental API. 
import std/[sequtils, algorithm] From 8e0bf1c9ffe0e24b5e2ff9a9325480ce6102df3a Mon Sep 17 00:00:00 2001 From: Timothee Cour Date: Wed, 19 Feb 2020 05:05:37 -0800 Subject: [PATCH 19/19] refactorings: remove a mustRehash overload + other small refactorings --- lib/pure/collections/hashcommon.nim | 26 ++++++++++++++++------- lib/pure/collections/intsets.nim | 17 +++++++-------- lib/pure/collections/setimpl.nim | 4 ++-- lib/pure/collections/sets.nim | 30 ++++++++++++--------------- lib/pure/collections/sharedtables.nim | 2 +- lib/pure/collections/tableimpl.nim | 8 +++---- lib/pure/collections/tables.nim | 11 ---------- 7 files changed, 45 insertions(+), 53 deletions(-) diff --git a/lib/pure/collections/hashcommon.nim b/lib/pure/collections/hashcommon.nim index 95ecd8447067b..bfc09ec3cc92e 100644 --- a/lib/pure/collections/hashcommon.nim +++ b/lib/pure/collections/hashcommon.nim @@ -37,6 +37,7 @@ proc translateBits(a: UHash, numBitsMask: int): UHash {.inline.} = result = (a shr numBitsMask) or (a shl (UHash.sizeof * 8 - numBitsMask)) proc nextTry(h, maxHash: Hash, perturb: var UHash): Hash {.inline.} = + # FACTOR between hashcommon.nextTry, intsets.nextTry # an optimization would be to use `(h + 1) and maxHash` for a few iterations # and then switch to the formula below, to get "best of both worlds": good # cache locality, except when a collision cluster is detected (ie, large number @@ -45,15 +46,24 @@ proc nextTry(h, maxHash: Hash, perturb: var UHash): Hash {.inline.} = result = cast[Hash]((5*cast[uint](h) + 1 + perturb) and cast[uint](maxHash)) perturb = perturb shr PERTURB_SHIFT -proc mustRehash(length, counter: int): bool {.inline.} = - assert(length > counter) - result = (length * 2 < counter * 3) or (length - counter < 4) # synchronize with `rightSize` - -proc mustRehash2[T](t: T): bool {.inline.} = +proc mustRehash[T](t: T): bool {.inline.} = + # FACTOR between hashcommon.mustRehash, intsets.mustRehash let counter2 = t.counter + t.countDeleted - result = mustRehash(t.dataLen, counter2) - -template getPerturb*(t: typed, hc: Hash): UHash = + let length = t.dataLen + assert(length > counter2) + result = (length * 2 < counter2 * 3) or (length - counter2 < 4) # synchronize with `rightSize` + +proc rightSize*(count: Natural): int {.inline.} = + ## Return the value of `initialSize` to support `count` items. + ## + ## If more items are expected to be added, simply add that + ## expected extra amount to the parameter before calling this. + ## + ## Internally, we want `mustRehash(t) == false` for t that was just resized. 
+ # Make sure to synchronize with `mustRehash` + result = nextPowerOfTwo(count * 3 div 2 + 4) + +template getPerturb(t: typed, hc: Hash): UHash = # we can't use `fastLog2(dataLen(t))` because importing `bitops` would cause codegen errors # so we use a practical value of half the bit width (eg 64 / 2 = 32 on 64bit machines) let numBitsMask = sizeof(Hash) * 4 # ie, sizeof(Hash) * 8 / 2 diff --git a/lib/pure/collections/intsets.nim b/lib/pure/collections/intsets.nim index f6aa2b0c27a65..7ca3797835a39 100644 --- a/lib/pure/collections/intsets.nim +++ b/lib/pure/collections/intsets.nim @@ -51,18 +51,15 @@ type data: TrunkSeq a: array[0..33, int] # profiling shows that 34 elements are enough -proc mustRehash(length, counter: int): bool {.inline.} = - # FACTOR with hashcommon.mustRehash - assert(length > counter) - result = (length * 2 < counter * 3) or (length - counter < 4) - -proc mustRehash2[T](t: T): bool {.inline.} = - # FACTOR with hashcommon.mustRehash2 +proc mustRehash[T](t: T): bool {.inline.} = + # FACTOR between hashcommon.mustRehash, intsets.mustRehash let counter2 = t.counter + t.countDeleted - result = mustRehash(t.max + 1, counter2) + let length = t.max + 1 + assert length > counter2 + result = (length * 2 < counter2 * 3) or (length - counter2 < 4) proc nextTry(h, maxHash: Hash, perturb: var Hash): Hash {.inline.} = - # FACTOR with hashcommon.nextTry + # FACTOR between hashcommon.nextTry, intsets.nextTry const PERTURB_SHIFT = 5 var perturb2 = cast[uint](perturb) shr PERTURB_SHIFT perturb = cast[Hash](perturb2) @@ -102,7 +99,7 @@ proc intSetPut(t: var IntSet, key: int): PTrunk = if t.data[h].key == key: return t.data[h] h = nextTry(h, t.max, perturb) - if mustRehash2(t): intSetEnlarge(t) + if mustRehash(t): intSetEnlarge(t) inc(t.counter) h = key and t.max perturb = key diff --git a/lib/pure/collections/setimpl.nim b/lib/pure/collections/setimpl.nim index e372f3d4c1e6d..f8950e3546b3e 100644 --- a/lib/pure/collections/setimpl.nim +++ b/lib/pure/collections/setimpl.nim @@ -48,7 +48,7 @@ template inclImpl() {.dirty.} = var hc: Hash var index = rawGet(s, key, hc) if index < 0: - if mustRehash(len(s.data), s.counter): + if mustRehash(s): enlarge(s) index = rawGetKnownHC(s, key, hc) rawInsert(s, s.data, key, hc, -1 - index) @@ -62,7 +62,7 @@ template containsOrInclImpl() {.dirty.} = if index >= 0: result = true else: - if mustRehash(len(s.data), s.counter): + if mustRehash(s): enlarge(s) index = rawGetKnownHC(s, key, hc) rawInsert(s, s.data, key, hc, -1 - index) diff --git a/lib/pure/collections/sets.nim b/lib/pure/collections/sets.nim index 6edcd65abf282..9f800e6d1f702 100644 --- a/lib/pure/collections/sets.nim +++ b/lib/pure/collections/sets.nim @@ -88,9 +88,6 @@ const include setimpl -proc rightSize*(count: Natural): int {.inline.} - - # --------------------------------------------------------------------- # ------------------------------ HashSet ------------------------------ # --------------------------------------------------------------------- @@ -595,16 +592,6 @@ proc `$`*[A](s: HashSet[A]): string = ## # --> {no, esc'aping, is " provided} dollarImpl() -proc rightSize*(count: Natural): int {.inline.} = - ## Return the value of `initialSize` to support `count` items. - ## - ## If more items are expected to be added, simply add that - ## expected extra amount to the parameter before calling this. - ## - ## Internally, we want `mustRehash(rightSize(x), x) == false`. 
- result = nextPowerOfTwo(count * 3 div 2 + 4) - - proc initSet*[A](initialSize = defaultInitialSize): HashSet[A] {.deprecated: "Deprecated since v0.20, use 'initHashSet'".} = initHashSet[A](initialSize) @@ -1148,10 +1135,19 @@ when isMainModule and not defined(release): b.incl(2) assert b.len == 1 - for i in 0 .. 32: - var s = rightSize(i) - if s <= i or mustRehash(s, i): - echo "performance issue: rightSize() will not elide enlarge() at ", i + block: + type FakeTable = object + dataLen: int + counter: int + countDeleted: int + + var t: FakeTable + for i in 0 .. 32: + var s = rightSize(i) + t.dataLen = s + t.counter = i + doAssert s > i and not mustRehash(t), + "performance issue: rightSize() will not elide enlarge() at: " & $i block missingOrExcl: var s = toOrderedSet([2, 3, 6, 7]) diff --git a/lib/pure/collections/sharedtables.nim b/lib/pure/collections/sharedtables.nim index f25267e109250..0fbbdb3eb0e31 100644 --- a/lib/pure/collections/sharedtables.nim +++ b/lib/pure/collections/sharedtables.nim @@ -33,7 +33,7 @@ template maxHash(t): untyped = t.dataLen-1 include tableimpl template st_maybeRehashPutImpl(enlarge) {.dirty.} = - if mustRehash2(t): + if mustRehash(t): enlarge(t) index = rawGetKnownHC(t, key, hc) index = -1 - index # important to transform for mgetOrPutImpl diff --git a/lib/pure/collections/tableimpl.nim b/lib/pure/collections/tableimpl.nim index df05c3bb87b77..aabaeeeb37968 100644 --- a/lib/pure/collections/tableimpl.nim +++ b/lib/pure/collections/tableimpl.nim @@ -44,14 +44,14 @@ template checkIfInitialized() = template addImpl(enlarge) {.dirty.} = checkIfInitialized() - if mustRehash2(t): enlarge(t) + if mustRehash(t): enlarge(t) var hc: Hash var j = rawGetDeep(t, key, hc) rawInsert(t, t.data, key, val, hc, j) inc(t.counter) template maybeRehashPutImpl(enlarge) {.dirty.} = - if mustRehash2(t): + if mustRehash(t): enlarge(t) index = rawGetKnownHC(t, key, hc) index = -1 - index # important to transform for mgetOrPutImpl @@ -92,7 +92,7 @@ template delImplIdx(t, i) = t.data[i].hcode = deletedMarker t.data[i].key = default(type(t.data[i].key)) t.data[i].val = default(type(t.data[i].val)) - # mustRehash2 + enlarge not needed because counter+countDeleted doesn't change + # mustRehash + enlarge not needed because counter+countDeleted doesn't change template delImpl() {.dirty.} = var hc: Hash @@ -117,7 +117,7 @@ template initImpl(result: typed, size: int) = template insertImpl() = # for CountTable checkIfInitialized() - if mustRehash2(t): enlarge(t) + if mustRehash(t): enlarge(t) ctRawInsert(t, t.data, key, val) inc(t.counter) diff --git a/lib/pure/collections/tables.nim b/lib/pure/collections/tables.nim index 0502f615cf75f..bf08eaa92c446 100644 --- a/lib/pure/collections/tables.nim +++ b/lib/pure/collections/tables.nim @@ -251,8 +251,6 @@ template dataLen(t): untyped = len(t.data) include tableimpl -proc rightSize*(count: Natural): int {.inline.} - template get(t, key): untyped = ## retrieves the value at ``t[key]``. The value can be modified. ## If ``key`` is not in ``t``, the ``KeyError`` exception is raised. @@ -582,15 +580,6 @@ proc `==`*[A, B](s, t: Table[A, B]): bool = equalsImpl(s, t) -proc rightSize*(count: Natural): int {.inline.} = - ## Return the value of ``initialSize`` to support ``count`` items. - ## - ## If more items are expected to be added, simply add that - ## expected extra amount to the parameter before calling this. - ## - ## Internally, we want mustRehash(rightSize(x), x) == false. 
- result = nextPowerOfTwo(count * 3 div 2 + 4) - proc indexBy*[A, B, C](collection: A, index: proc(x: B): C): Table[C, B] = ## Index the collection with the proc provided. # TODO: As soon as supported, change collection: A to collection: A[B]
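
Note: below is a minimal standalone Nim sketch, not part of the patch series above, illustrating the probing recurrence and the rightSize/mustRehash invariant that the patches rely on. The names mirror hashcommon.nim, but seeding `perturb` directly from the raw hash value (rather than the rotated value computed by `getPerturb`/`translateBits`) and the fixed example key and table size are simplifications assumed only for illustration.

import std/[math, hashes]

const PERTURB_SHIFT = 5

proc nextTry(h, maxHash: Hash, perturb: var uint): Hash {.inline.} =
  # same recurrence as hashcommon.nextTry: the high hash bits folded in via
  # `perturb` break up the collision clusters that plain `(h + 1) and maxHash`
  # probing suffers from
  result = cast[Hash]((5 * cast[uint](h) + 1 + perturb) and cast[uint](maxHash))
  perturb = perturb shr PERTURB_SHIFT

proc mustRehash(dataLen, counter: int): bool {.inline.} =
  # grow when the table is about 2/3 full or nearly out of free slots
  (dataLen * 2 < counter * 3) or (dataLen - counter < 4)

proc rightSize(count: Natural): int {.inline.} =
  # sized so that a table holding `count` items does not immediately want to grow
  nextPowerOfTwo(count * 3 div 2 + 4)

when isMainModule:
  # follow the probe sequence of one key in a table sized for 5 items (16 slots)
  let key = 12345                  # arbitrary example key
  let hc = hash(key)
  let maxHash = rightSize(5) - 1
  var perturb = cast[uint](hc)     # simplified seeding; see note above
  var h = hc and maxHash
  for _ in 0 ..< 4:
    echo "probe slot ", h
    h = nextTry(h, maxHash, perturb)

  # the invariant rightSize and mustRehash are kept synchronized for:
  for n in 0 .. 32:
    doAssert not mustRehash(rightSize(n), n)

In the patches themselves, getPerturb additionally rotates the hash by sizeof(Hash) * 4 bits via translateBits before seeding perturb, so the bits masked out of the initial slot index influence the probe sequence from the first iterations rather than only after several PERTURB_SHIFT steps.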