nim-lang · timotheecour · Jul 17, 2019 · Jul 17, 2019 · Jul 18, 2019 · Jul 18, 2019
diff --git a/compiler/vmops.nim b/compiler/vmops.nim
@@ -16,6 +16,7 @@ from math import sqrt, ln, log10, log2, exp, round, arccos, arcsin,
 from os import getEnv, existsEnv, dirExists, fileExists, putEnv, walkDir, getAppFilename
 from md5 import getMD5
 from sighashes import symBodyDigest
+from std/hashes import hashBiggestInt
 
 template mathop(op) {.dirty.} =
   registerCallback(c, "stdlib.math." & astToStr(op), `op Wrapper`)
@@ -144,3 +145,6 @@ proc registerAdditionalOps*(c: PCtx) =
     let n = getNode(a, 0)
     if n.kind != nkSym: raise newException(ValueError, "node is not a symbol")
     setResult(a, $symBodyDigest(c.graph, n.sym))
+
+  registerCallback c, "stdlib.hashes.hashBiggestIntVM", proc (a: VmArgs) {.nimcall.} =
+    a.setResult hashBiggestInt(getInt(a, 0))
diff --git a/lib/pure/hashes.nim b/lib/pure/hashes.nim
@@ -60,6 +60,7 @@ proc `!&`*(h: Hash, val: int): Hash {.inline.} =
   ## Mixes a hash value `h` with `val` to produce a new hash value.
   ##
   ## This is only needed if you need to implement a hash proc for a new datatype.
+  ## Uses Jenkins hash: https://en.wikipedia.org/wiki/Jenkins_hash_function
   let h = cast[uint](h)
   let val = cast[uint](val)
   var res = h + val
@@ -79,6 +80,9 @@ proc `!$`*(h: Hash): Hash {.inline.} =
 
 proc hashData*(data: pointer, size: int): Hash =
   ## Hashes an array of bytes of size `size`.
+  # this should probably be merged/refactored with
+  # `proc hash*[A](aBuf: openArray[A], sPos, ePos: int): Hash =` to avoid
+  # using 2 different algorithms
   var h: Hash = 0
   when defined(js):
     var p: cstring
@@ -93,6 +97,43 @@ proc hashData*(data: pointer, size: int): Hash =
     dec(s)
   result = !$h
 
+proc hashBiggestIntVM(x: BiggestInt): Hash = discard # stub: the VM callback "stdlib.hashes.hashBiggestIntVM" registered in compiler/vmops.nim supplies the result
+
+proc hashBiggestInt*(x: BiggestInt): Hash {.inline.} =
+  ## Hashes a `BiggestInt`. For internal use; user code should prefer the `hash` overloads.
+  when nimvm: hashBiggestIntVM(x) # compile-time path: delegate to the VM stub
+  else: hashData(cast[pointer](unsafeAddr x), type(x).sizeof) # run-time path: hash the raw bytes of `x`
+
+proc hash*[T: SomeNumber | Ordinal | char](x: T): Hash {.inline.} =
+  ## Efficient hashing of numbers, ordinals (e.g. enums) and `char`.
+  when T.sizeof >= 4:
+    # fix #11764: `ord(x)`, `toU32(x)` or similar are up to 4X faster to compute
+    # than the Jenkins-based `hashData`, but produce very poor hashes, leading to
+    # collisions; this can cause slowdowns of several orders of magnitude (e.g.
+    # 1e3) when used in hash tables, so we prefer good hashes that are slower to
+    # compute here. Murmur3 would improve the speed of the hash computation.
+    when T is SomeFloat:
+      # 0.0 and -0.0 should map to the same hash to avoid weird behavior.
+      # The only non-NaN value that can cause such a clash is 0, according to
+      # https://stackoverflow.com/questions/31087915/are-there-denormalized-floats-that-evaluate-to-the-same-value-apart-from-0-0
+      # bugfix: the previous code was using `x = x + 1.0` (presumably for
+      # handling negative 0); however, this leads to collisions for small x due
+      # to finite floating-point precision.
+      let x: BiggestInt =
+        if x == 0: 0.BiggestInt
+        else:
+          when sizeof(BiggestInt) == sizeof(T):
+            cast[BiggestInt](x)
+          else: # sizes differ, e.g. float32 (original note: "for nimvm")
+            cast[int32](x).BiggestInt
+    else:
+      let x = x.BiggestInt
+    hashBiggestInt(x)
+  else:
+    # empirically better for small types; the collision risk is limited anyway
+    # because the cardinality is at most 2^16=65536
+    ord(x)
+
 when defined(js):
   var objectID = 0
 
@@ -110,7 +151,10 @@ proc hash*(x: pointer): Hash {.inline.} =
       }
     """
   else:
-    result = cast[Hash](cast[uint](x) shr 3) # skip the alignment
+    # 2 bug fixes: s/cast[Hash]()/hash()/ (#11764); and also s/uint/ByteAddress/
+    # note that we can't use unsigned because nimscript doesn't have `$`(uint)
+    result = hash(cast[ByteAddress](x) shr 3) # skip the alignment
+      # CHECKME: why? isn't that the responsibility of the caller if they need this behavior?
 
 when not defined(booting):
   proc hash*[T: proc](x: T): Hash {.inline.} =
@@ -120,36 +164,6 @@ when not defined(booting):
     else:
       result = hash(pointer(x))
 
-proc hash*(x: int): Hash {.inline.} =
-  ## Efficient hashing of integers.
-  result = x
-
-proc hash*(x: int64): Hash {.inline.} =
-  ## Efficient hashing of `int64` integers.
-  result = toU32(x)
-
-proc hash*(x: uint): Hash {.inline.} =
-  ## Efficient hashing of unsigned integers.
-  result = cast[int](x)
-
-proc hash*(x: uint64): Hash {.inline.} =
-  ## Efficient hashing of `uint64` integers.
-  result = toU32(cast[int](x))
-
-proc hash*(x: char): Hash {.inline.} =
-  ## Efficient hashing of characters.
-  result = ord(x)
-
-proc hash*[T: Ordinal](x: T): Hash {.inline.} =
-  ## Efficient hashing of other ordinal types (e.g. enums).
-  result = ord(x)
-
-proc hash*(x: float): Hash {.inline.} =
-  ## Efficient hashing of floats.
-  var y = x + 1.0
-  result = cast[ptr Hash](addr(y))[]
-
-
 # Forward declarations before methods that hash containers. This allows
 # containers to contain other containers
 proc hash*[A](x: openArray[A]): Hash

diff --git a/tests/collections/ttables.nim b/tests/collections/ttables.nim
@@ -165,9 +165,10 @@ block tableconstr:
 block ttables2:
   proc TestHashIntInt() =
     var tab = initTable[int,int]()
-    for i in 1..1_000_000:
+    const n = 1_000_000 # bottleneck: 50 seconds on OSX in debug mode
+    for i in 1..n:
       tab[i] = i
-    for i in 1..1_000_000:
+    for i in 1..n:
       var x = tab[i]
       if x != i : echo "not found ", i
 
@@ -233,7 +234,7 @@ block tablesref:
       for y in 0..1:
         assert t[(x,y)] == $x & $y
     assert($t ==
-      "{(x: 0, y: 1): \"01\", (x: 0, y: 0): \"00\", (x: 1, y: 0): \"10\", (x: 1, y: 1): \"11\"}")
+      """{(x: 0, y: 1): "01", (x: 1, y: 0): "10", (x: 0, y: 0): "00", (x: 1, y: 1): "11"}""")
 
   block tableTest2:
     var t = newTable[string, float]()

diff --git a/tests/collections/ttablesthreads.nim b/tests/collections/ttablesthreads.nim
@@ -48,7 +48,7 @@ block tableTest1:
     for y in 0..1:
       assert t[(x,y)] == $x & $y
   assert($t ==
-    "{(x: 0, y: 1): \"01\", (x: 0, y: 0): \"00\", (x: 1, y: 0): \"10\", (x: 1, y: 1): \"11\"}")
+    """{(x: 0, y: 0): "00", (x: 1, y: 0): "10", (x: 0, y: 1): "01", (x: 1, y: 1): "11"}""")
 
 block tableTest2:
   var t = initTable[string, float]()

diff --git a/tests/vm/tcompiletimetable.nim b/tests/vm/tcompiletimetable.nim
@@ -47,3 +47,29 @@ addStuff("Hey"): echo "Hey"
 addStuff("Hi"): echo "Hi"
 dump()
 
+import std/hashes
+block:
+  # check that compile time (CT) and run time (RT) produce the same results for Table
+  template callFun(T) =
+    block:
+      proc fun(): string =
+        var t: Table[T, string]
+        let n = 10
+        for i in 0..<n:
+          let i2 = when T.sizeof == type(i).sizeof: i else: i.int32 # match the size of T before casting
+          let k = cast[T](i2)
+            # the cast (rather than a conversion) is intentional for regression
+            # testing: it reinterprets the bits, producing small values
+          doAssert k notin t
+          t[k] = $(i, k)
+          doAssert k in t
+        $t
+      const s1 = fun() # evaluated at compile time
+      let s2 = fun() # evaluated at run time
+      # echo s1 # for debugging
+      doAssert s1 == s2
+      doAssert s1 == s2 # NOTE(review): repeats the previous assertion
+      doAssert hash(0.0) == hash(-0.0) # positive and negative zero must hash identically
+  callFun(float)
+  callFun(float32)
+  callFun(int64)