Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[superseded] Different Hashing to avoid Collisions. #11767

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions compiler/vmops.nim
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ from math import sqrt, ln, log10, log2, exp, round, arccos, arcsin,
from os import getEnv, existsEnv, dirExists, fileExists, putEnv, walkDir, getAppFilename
from md5 import getMD5
from sighashes import symBodyDigest
from std/hashes import hashBiggestInt

template mathop(op) {.dirty.} =
registerCallback(c, "stdlib.math." & astToStr(op), `op Wrapper`)
Expand Down Expand Up @@ -144,3 +145,6 @@ proc registerAdditionalOps*(c: PCtx) =
let n = getNode(a, 0)
if n.kind != nkSym: raise newException(ValueError, "node is not a symbol")
setResult(a, $symBodyDigest(c.graph, n.sym))

registerCallback c, "stdlib.hashes.hashBiggestIntVM", proc (a: VmArgs) {.nimcall.} =
a.setResult hashBiggestInt(getInt(a, 0))
76 changes: 45 additions & 31 deletions lib/pure/hashes.nim
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ proc `!&`*(h: Hash, val: int): Hash {.inline.} =
## Mixes a hash value `h` with `val` to produce a new hash value.
##
## This is only needed if you need to implement a hash proc for a new datatype.
## Uses Jenkins hash: https://en.wikipedia.org/wiki/Jenkins_hash_function
let h = cast[uint](h)
let val = cast[uint](val)
var res = h + val
Expand All @@ -79,6 +80,9 @@ proc `!$`*(h: Hash): Hash {.inline.} =

proc hashData*(data: pointer, size: int): Hash =
## Hashes an array of bytes of size `size`.
# this should probably be merged/refactored with
# `proc hash*[A](aBuf: openArray[A], sPos, ePos: int): Hash =` to avoid
# using 2 different algorithms
var h: Hash = 0
when defined(js):
var p: cstring
Expand All @@ -93,6 +97,43 @@ proc hashData*(data: pointer, size: int): Hash =
dec(s)
result = !$h

proc hashBiggestIntVM(x: BiggestInt): Hash =
  ## Compile-time counterpart of `hashBiggestInt`.
  ##
  ## The body is intentionally empty: the compiler registers the real
  ## implementation as the VM callback `stdlib.hashes.hashBiggestIntVM`
  ## (in vmops).
  discard

proc hashBiggestInt*(x: BiggestInt): Hash {.inline.} =
  ## Hashes a `BiggestInt`.
  ##
  ## For internal use; user code should prefer the `hash` overloads.
  when nimvm:
    # Inside the compile-time VM, delegate to the VM-specific proc.
    hashBiggestIntVM(x)
  else:
    # At run time, feed the raw bytes of `x` to `hashData`.
    hashData(cast[pointer](x.unsafeAddr), sizeof(x))

proc hash*[T: SomeNumber | Ordinal | char](x: T): Hash {.inline.} =
  ## Efficient hashing of numbers, ordinals (e.g. enums) and `char`.
  ##
  ## Values whose type is at least 4 bytes wide are widened to `BiggestInt`
  ## and hashed with `hashBiggestInt`; smaller types hash to `ord(x)`.
  ## For floats, `0.0` and `-0.0` hash to the same value.
  when T.sizeof >= 4:
    # fix #11764: `ord(x)`, `toU32(x)` or similar are up to 4X faster to
    # compute than the Jenkins-based `hashData`, but they produce very poor
    # hashes and therefore collisions; this can cause slowdowns of several
    # orders of magnitude (e.g. 1e3) when used in hash tables, so we prefer
    # good hashes that are slower to compute. Murmur3 would improve the speed
    # of hash computation.
    when T is SomeFloat:
      # 0.0 and -0.0 should map to the same hash to avoid weird behavior.
      # The only non-NaN value that can cause a clash is 0 according to
      # https://stackoverflow.com/questions/31087915/are-there-denormalized-floats-that-evaluate-to-the-same-value-apart-from-0-0
      # bugfix: the previous code used `x = x + 1.0` (presumably to handle
      # negative 0), but this leads to collisions for small x due to finite
      # floating-point precision.
      let x: BiggestInt =
        if x == 0: 0.BiggestInt
        else:
          when sizeof(BiggestInt) == sizeof(T):
            cast[BiggestInt](x)
          else:
            # `T` differs in size from `BiggestInt` (e.g. a 4-byte float);
            # the original note marked this branch as needed for nimvm.
            cast[int32](x).BiggestInt
    elif T is SomeUnsignedInt:
      # Reinterpret the bits instead of converting the value: an unsigned
      # input above `high(BiggestInt)` is not representable, so a value
      # conversion would fail instead of producing a hash. Inputs that do fit
      # yield the same `BiggestInt` as a plain conversion.
      let x = cast[BiggestInt](uint64(x))
    else:
      let x = x.BiggestInt
    # `when` opens no scope, so `x` here is the widened value bound above.
    hashBiggestInt(x)
  else:
    # empirically better for small types; the collision risk is limited anyway
    # because the cardinality is at most 2^16 = 65536
    ord(x)

when defined(js):
var objectID = 0

Expand All @@ -110,7 +151,10 @@ proc hash*(x: pointer): Hash {.inline.} =
}
"""
else:
result = cast[Hash](cast[uint](x) shr 3) # skip the alignment
# 2 bug fixes: s/cast[Hash]()/hash()/ (#11764); and also s/uint/ByteAddress/
# note that we can't use an unsigned type because nimscript doesn't have `$`(uint)
result = hash(cast[ByteAddress](x) shr 3) # skip the alignment
# CHECKME: why? Isn't that the responsibility of the caller if they need this behavior?

when not defined(booting):
proc hash*[T: proc](x: T): Hash {.inline.} =
Expand All @@ -120,36 +164,6 @@ when not defined(booting):
else:
result = hash(pointer(x))

proc hash*(x: int): Hash {.inline.} =
  ## Hashes an integer; the hash is the integer itself.
  x

proc hash*(x: int64): Hash {.inline.} =
  ## Efficient hashing of `int64` integers.
  ##
  ## Only the low 32 bits of `x` contribute to the result.
  # `cast[int32]` keeps the low 32 bits, which is what `toU32` did; it is
  # used here in place of that helper.
  result = cast[int32](x)

proc hash*(x: uint): Hash {.inline.} =
  ## Hashes an unsigned integer by reinterpreting its bits as an `int`.
  cast[int](x)

proc hash*(x: uint64): Hash {.inline.} =
  ## Efficient hashing of `uint64` integers.
  ##
  ## Only the low 32 bits of `x` contribute to the result.
  # `cast[int32]` keeps the low 32 bits, matching the former
  # `toU32(cast[int](x))`; it is used here in place of that helper.
  result = cast[int32](x)

proc hash*(x: char): Hash {.inline.} =
  ## Hashes a character to its ordinal value.
  x.ord

proc hash*[T: Ordinal](x: T): Hash {.inline.} =
  ## Hashes any other ordinal value (e.g. an enum) to its ordinal number.
  x.ord

proc hash*(x: float): Hash {.inline.} =
  ## Efficient hashing of floats.
  ##
  ## `0.0` and `-0.0` hash to the same value.
  # Adding 0.0 turns -0.0 into 0.0 and leaves every other value unchanged.
  # The former `x + 1.0` rounded all sufficiently small `x` to exactly 1.0,
  # so distinct small inputs collided.
  var y = x + 0.0
  # Read the leading `sizeof(Hash)` bytes of `y` as the hash.
  result = cast[ptr Hash](addr(y))[]


# Forward declarations before methods that hash containers. This allows
# containers to contain other containers
proc hash*[A](x: openArray[A]): Hash
Expand Down
7 changes: 4 additions & 3 deletions tests/collections/ttables.nim
Original file line number Diff line number Diff line change
Expand Up @@ -165,9 +165,10 @@ block tableconstr:
block ttables2:
proc TestHashIntInt() =
var tab = initTable[int,int]()
for i in 1..1_000_000:
const n = 1_000_000 # bottleneck: 50 seconds on OSX in debug mode
for i in 1..n:
tab[i] = i
for i in 1..1_000_000:
for i in 1..n:
var x = tab[i]
if x != i : echo "not found ", i

Expand Down Expand Up @@ -233,7 +234,7 @@ block tablesref:
for y in 0..1:
assert t[(x,y)] == $x & $y
assert($t ==
"{(x: 0, y: 1): \"01\", (x: 0, y: 0): \"00\", (x: 1, y: 0): \"10\", (x: 1, y: 1): \"11\"}")
"""{(x: 0, y: 1): "01", (x: 1, y: 0): "10", (x: 0, y: 0): "00", (x: 1, y: 1): "11"}""")

block tableTest2:
var t = newTable[string, float]()
Expand Down
2 changes: 1 addition & 1 deletion tests/collections/ttablesthreads.nim
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ block tableTest1:
for y in 0..1:
assert t[(x,y)] == $x & $y
assert($t ==
"{(x: 0, y: 1): \"01\", (x: 0, y: 0): \"00\", (x: 1, y: 0): \"10\", (x: 1, y: 1): \"11\"}")
"""{(x: 0, y: 0): "00", (x: 1, y: 0): "10", (x: 0, y: 1): "01", (x: 1, y: 1): "11"}""")

block tableTest2:
var t = initTable[string, float]()
Expand Down
26 changes: 26 additions & 0 deletions tests/vm/tcompiletimetable.nim
Original file line number Diff line number Diff line change
Expand Up @@ -47,3 +47,29 @@ addStuff("Hey"): echo "Hey"
addStuff("Hi"): echo "Hi"
dump()

import std/hashes
block:
  # Check that filling a Table at compile time (CT) and at run time (RT)
  # produces the same string representation.
  template callFun(T) =
    block:
      proc fun(): string =
        var t: Table[T, string]
        let n = 10
        for i in 0..<n:
          # Use `i` as is when `T` has the same size, else narrow it to int32.
          let i2 = when T.sizeof == type(i).sizeof: i else: i.int32
          # cast intentional for regression testing,
          # producing small values
          let k = cast[T](i2)
          doAssert k notin t
          t[k] = $(i, k)
          doAssert k in t
        $t
      const s1 = fun() # constant, so evaluated at compile time
      let s2 = fun()   # evaluated at run time
      # echo s1 # for debugging
      doAssert s1 == s2
  doAssert hash(0.0) == hash(-0.0)
  callFun(float)
  callFun(float32)
  callFun(int64)