bytes/hash: add hashing package for bytes and strings

Fixes #28322 R=go1.14 RELNOTE=yes Change-Id: Ic29f8b587c8c77472260836a5c3e13edaded13fa Reviewed-on: https://go-review.googlesource.com/c/go/+/186877 Reviewed-by: Alan Donovan <adonovan@google.com>
golang · Aug 29, 2019 · bf36219 · bf36219
1 parent fbfb41e
commit bf36219
Show file tree

Hide file tree

Showing 4 changed files with 764 additions and 50 deletions.
diff --git a/src/bytes/hash/hash.go b/src/bytes/hash/hash.go
@@ -0,0 +1,185 @@
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package bytes/hash provides hash functions on byte sequences. These
+// hash functions are intended to be used to implement hash tables or
+// other data structures that need to map arbitrary strings or byte
+// sequences to a uniform distribution of integers. The hash functions
+// are collision-resistant but are not cryptographically secure (use
+// one of the hash functions in crypto/* if you need that).
+//
+// The produced hashes depend only on the sequence of bytes provided
+// to the Hash object, not on the way in which they are provided. For
+// example, the calls
+//     h.AddString("foo")
+//     h.AddBytes([]byte{'f','o','o'})
+//     h.AddByte('f'); h.AddByte('o'); h.AddByte('o')
+// will all have the same effect.
+//
+// Two Hash instances in the same process using the same seed
+// behave identically.
+//
+// Two Hash instances with the same seed in different processes are
+// not guaranteed to behave identically, even if the processes share
+// the same binary.
+//
+// Hashes are intended to be collision-resistant, even for situations
+// where an adversary controls the byte sequences being hashed.
+// All bits of the Hash result are close to uniformly and
+// independently distributed, so can be safely restricted to a range
+// using bit masking, shifting, or modular arithmetic.
+package hash
+
+import (
+	"unsafe"
+)
+
+// A Seed controls the behavior of a Hash.  Two Hash objects with the
+// same seed in the same process will behave identically.  Two Hash
+// objects with different seeds will very likely behave differently.
+type Seed struct {
+	s uint64
+}
+
+// A Hash object is used to compute the hash of a byte sequence.
+type Hash struct {
+	seed  Seed     // initial seed used for this hash
+	state Seed     // current hash of all flushed bytes
+	buf   [64]byte // unflushed byte buffer
+	n     int      // number of unflushed bytes
+}
+
+// AddByte adds b to the sequence of bytes hashed by h.
+func (h *Hash) AddByte(b byte) {
+	if h.n == len(h.buf) {
+		h.flush()
+	}
+	h.buf[h.n] = b
+	h.n++
+}
+
+// AddBytes adds b to the sequence of bytes hashed by h.
+func (h *Hash) AddBytes(b []byte) {
+	for h.n+len(b) > len(h.buf) {
+		k := copy(h.buf[h.n:], b)
+		h.n = len(h.buf)
+		b = b[k:]
+		h.flush()
+	}
+	h.n += copy(h.buf[h.n:], b)
+}
+
+// AddString adds the bytes of s to the sequence of bytes hashed by h.
+func (h *Hash) AddString(s string) {
+	for h.n+len(s) > len(h.buf) {
+		k := copy(h.buf[h.n:], s)
+		h.n = len(h.buf)
+		s = s[k:]
+		h.flush()
+	}
+	h.n += copy(h.buf[h.n:], s)
+}
+
+// Seed returns the seed value specified in the most recent call to
+// SetSeed, or the initial seed if SetSeed was never called.
+func (h *Hash) Seed() Seed {
+	return h.seed
+}
+
+// SetSeed sets the seed used by h. Two Hash objects with the same
+// seed in the same process will behave identically.  Two Hash objects
+// with different seeds will very likely behave differently.  Any
+// bytes added to h previous to this call will be discarded.
+func (h *Hash) SetSeed(seed Seed) {
+	h.seed = seed
+	h.state = seed
+	h.n = 0
+}
+
+// Reset discards all bytes added to h.
+// (The seed remains the same.)
+func (h *Hash) Reset() {
+	h.state = h.seed
+	h.n = 0
+}
+
+// precondition: buffer is full.
+func (h *Hash) flush() {
+	if h.n != len(h.buf) {
+		panic("flush of partially full buffer")
+	}
+	h.state.s = rthash(h.buf[:], h.state.s)
+	h.n = 0
+}
+
+// Hash returns a value which depends on h's seed and the sequence of
+// bytes added to h (since the last call to Reset or SetSeed).
+func (h *Hash) Hash() uint64 {
+	return rthash(h.buf[:h.n], h.state.s)
+}
+
+// MakeSeed returns a Seed initialized using the bits in s.
+// Two seeds generated with the same s are guaranteed to be equal.
+// Two seeds generated with different s are very likely to be different.
+// TODO: disallow this? See Alan's comment in the issue.
+func MakeSeed(s uint64) Seed {
+	return Seed{s: s}
+}
+
+// New returns a new Hash object. Different hash objects allocated by
+// this function will very likely have different seeds.
+func New() *Hash {
+	seed := Seed{s: uint64(runtime_fastrand())}
+	return &Hash{
+		seed:  seed,
+		state: seed,
+	}
+}
+
+//go:linkname runtime_fastrand runtime.fastrand
+func runtime_fastrand() uint32
+
+func rthash(b []byte, seed uint64) uint64 {
+	if len(b) == 0 {
+		return seed
+	}
+	// The runtime hasher only works on uintptr. For 64-bit
+	// architectures, we use the hasher directly. Otherwise,
+	// we use two parallel hashers on the lower and upper 32 bits.
+	if unsafe.Sizeof(uintptr(0)) == 8 {
+		return uint64(runtime_memhash(unsafe.Pointer(&b[0]), uintptr(seed), uintptr(len(b))))
+	}
+	lo := runtime_memhash(unsafe.Pointer(&b[0]), uintptr(seed), uintptr(len(b)))
+	hi := runtime_memhash(unsafe.Pointer(&b[0]), uintptr(seed>>32), uintptr(len(b)))
+	// TODO: mix lo/hi? Get 64 bits some other way?
+	return uint64(hi)<<32 | uint64(lo)
+}
+
+//go:linkname runtime_memhash runtime.memhash
+func runtime_memhash(p unsafe.Pointer, seed, s uintptr) uintptr
+
+// Wrapper functions so that a bytes/hash.Hash implements
+// the hash.Hash and hash.Hash64 interfaces.
+
+func (h *Hash) Write(b []byte) (int, error) {
+	h.AddBytes(b)
+	return len(b), nil
+}
+func (h *Hash) Sum(b []byte) []byte {
+	x := h.Hash()
+	return append(b,
+		byte(x>>0),
+		byte(x>>8),
+		byte(x>>16),
+		byte(x>>24),
+		byte(x>>32),
+		byte(x>>40),
+		byte(x>>48),
+		byte(x>>56))
+}
+func (h *Hash) Sum64() uint64 {
+	return h.Hash()
+}
+func (h *Hash) Size() int      { return 8 }
+func (h *Hash) BlockSize() int { return len(h.buf) }
diff --git a/src/bytes/hash/hash_test.go b/src/bytes/hash/hash_test.go
@@ -0,0 +1,66 @@
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package hash_test
+
+import (
+	"bytes/hash"
+	basehash "hash"
+	"testing"
+)
+
+func TestUnseededHash(t *testing.T) {
+	m := map[uint64]struct{}{}
+	for i := 0; i < 1000; i++ {
+		h := hash.New()
+		m[h.Hash()] = struct{}{}
+	}
+	if len(m) < 900 {
+		t.Errorf("empty hash not sufficiently random: got %d, want 1000", len(m))
+	}
+}
+
+func TestSeededHash(t *testing.T) {
+	s := hash.MakeSeed(1234)
+	m := map[uint64]struct{}{}
+	for i := 0; i < 1000; i++ {
+		h := hash.New()
+		h.SetSeed(s)
+		m[h.Hash()] = struct{}{}
+	}
+	if len(m) != 1 {
+		t.Errorf("seeded hash is random: got %d, want 1", len(m))
+	}
+}
+
+func TestHashGrouping(t *testing.T) {
+	b := []byte("foo")
+	h1 := hash.New()
+	h2 := hash.New()
+	h2.SetSeed(h1.Seed())
+	h1.AddBytes(b)
+	for _, x := range b {
+		h2.AddByte(x)
+	}
+	if h1.Hash() != h2.Hash() {
+		t.Errorf("hash of \"foo\" and \"f\",\"o\",\"o\" not identical")
+	}
+}
+
+func TestHashBytesVsString(t *testing.T) {
+	s := "foo"
+	b := []byte(s)
+	h1 := hash.New()
+	h2 := hash.New()
+	h2.SetSeed(h1.Seed())
+	h1.AddString(s)
+	h2.AddBytes(b)
+	if h1.Hash() != h2.Hash() {
+		t.Errorf("hash of string and byts not identical")
+	}
+}
+
+// Make sure a Hash implements the hash.Hash and hash.Hash64 interfaces.
+var _ basehash.Hash = &hash.Hash{}
+var _ basehash.Hash64 = &hash.Hash{}