matchfinder: add MultiHash

andybalholm · Jan 9, 2024 · 578645e · 578645e
1 parent 24b2bfa
commit 578645e
Show file tree

Hide file tree

Showing 2 changed files with 251 additions and 0 deletions.
diff --git a/brotli_test.go b/brotli_test.go
@@ -693,3 +693,27 @@ func BenchmarkEncodeM4Chain64(b *testing.B) {
+// BenchmarkEncodeM4Chain128 runs the shared benchmark helper on the Opticks
+// sample text with an M4 match finder configured for a 1 MiB (1<<20)
+// MaxDistance, ChainLength 128, HashLen 5, and matchScore as the scorer.
 func BenchmarkEncodeM4Chain128(b *testing.B) {
 	benchmark(b, "testdata/Isaac.Newton-Opticks.txt", &matchfinder.M4{MaxDistance: 1 << 20, ChainLength: 128, HashLen: 5, Score: matchScore}, 1<<16)
 }
+
+// TestEncodeMultiHash6 runs the shared test helper on the Opticks sample
+// text with a MultiHash match finder that uses a single 6-byte hash and a
+// 256 KiB (1<<18) MaxDistance.
+func TestEncodeMultiHash6(t *testing.T) {
+	finder := &matchfinder.MultiHash{
+		MaxDistance: 1 << 18,
+		Score:       matchScore,
+		HashLengths: []int{6},
+	}
+	test(t, "testdata/Isaac.Newton-Opticks.txt", finder, 1<<16)
+}
+
+// TestEncodeMultiHash6_8 runs the shared test helper on the Opticks sample
+// text with a MultiHash match finder that combines 6-byte and 8-byte hashes
+// and a 256 KiB (1<<18) MaxDistance.
+func TestEncodeMultiHash6_8(t *testing.T) {
+	finder := &matchfinder.MultiHash{
+		MaxDistance: 1 << 18,
+		Score:       matchScore,
+		HashLengths: []int{6, 8},
+	}
+	test(t, "testdata/Isaac.Newton-Opticks.txt", finder, 1<<16)
+}
+
+// BenchmarkEncodeMultiHash6 runs the shared benchmark helper on the Opticks
+// sample text with a MultiHash match finder that uses a single 6-byte hash
+// and a 1 MiB (1<<20) MaxDistance.
+func BenchmarkEncodeMultiHash6(b *testing.B) {
+	finder := &matchfinder.MultiHash{
+		MaxDistance: 1 << 20,
+		Score:       matchScore,
+		HashLengths: []int{6},
+	}
+	benchmark(b, "testdata/Isaac.Newton-Opticks.txt", finder, 1<<16)
+}
+
+// BenchmarkEncodeMultiHash5_8 runs the shared benchmark helper on the
+// Opticks sample text with a MultiHash match finder that combines 5-byte and
+// 8-byte hashes and a 1 MiB (1<<20) MaxDistance.
+func BenchmarkEncodeMultiHash5_8(b *testing.B) {
+	finder := &matchfinder.MultiHash{
+		MaxDistance: 1 << 20,
+		Score:       matchScore,
+		HashLengths: []int{5, 8},
+	}
+	benchmark(b, "testdata/Isaac.Newton-Opticks.txt", finder, 1<<16)
+}
+
+// BenchmarkEncodeMultiHash5_7_9 runs the shared benchmark helper on the
+// Opticks sample text with a MultiHash match finder that combines 5-, 7-,
+// and 9-byte hashes and a 1 MiB (1<<20) MaxDistance.
+func BenchmarkEncodeMultiHash5_7_9(b *testing.B) {
+	finder := &matchfinder.MultiHash{
+		MaxDistance: 1 << 20,
+		Score:       matchScore,
+		HashLengths: []int{5, 7, 9},
+	}
+	benchmark(b, "testdata/Isaac.Newton-Opticks.txt", finder, 1<<16)
+}
+
+// BenchmarkEncodeMultiHash5_6_7_9 runs the shared benchmark helper on the
+// Opticks sample text with a MultiHash match finder that combines 5-, 6-,
+// 7-, and 9-byte hashes and a 1 MiB (1<<20) MaxDistance.
+func BenchmarkEncodeMultiHash5_6_7_9(b *testing.B) {
+	finder := &matchfinder.MultiHash{
+		MaxDistance: 1 << 20,
+		Score:       matchScore,
+		HashLengths: []int{5, 6, 7, 9},
+	}
+	benchmark(b, "testdata/Isaac.Newton-Opticks.txt", finder, 1<<16)
+}
diff --git a/matchfinder/multihash.go b/matchfinder/multihash.go
@@ -0,0 +1,227 @@
+package matchfinder
+
+import (
+	"encoding/binary"
+	"sort"
+)
+
+// MultiHash is an implementation of the MatchFinder
+// interface that uses multiple hashes of different lengths.
+//
+// Zero-valued MaxDistance, MinLength, TableBits, and Score are replaced with
+// their defaults by FindMatches. HashLengths has no default and must be set
+// by the caller before the first call to FindMatches.
+type MultiHash struct {
+	// MaxDistance is the maximum distance (in bytes) to look back for
+	// a match. The default is 65535.
+	MaxDistance int
+
+	// MinLength is the length of the shortest match to return.
+	// The default is 4.
+	// Note: in its non-overlapping search, FindMatches only accepts a
+	// candidate whose length is strictly greater than MinLength.
+	MinLength int
+
+	// HashLengths is a list of the hashes to use, with the number of
+	// bytes to use for each. For example, to use 4-byte, 7-byte, and
+	// 10-byte hashes, set HashLengths to []int{4, 7, 10}.
+	// The minimum length is 4; FindMatches does not validate this.
+	// The slice must not be empty, and FindMatches sorts it in place.
+	HashLengths []int
+
+	// TableBits is the number of bits in the hash table indexes.
+	// The default is 17 (128K entries).
+	TableBits int
+
+	// Score is the rating function used to choose the best match.
+	// The default is the length of the match.
+	Score func(AbsoluteMatch) int
+
+	// tables holds one hash table per entry in HashLengths. Each slot stores
+	// a position in history; 0 is treated as "no entry" by FindMatches.
+	tables [][]uint32
+
+	// history is the buffer of input that matches are searched in.
+	// FindMatches appends each src to it and trims it once it grows past
+	// twice MaxDistance.
+	history []byte
+}
+
+// Reset discards the match finder's accumulated state: the history buffer
+// is truncated to zero length (its backing array is kept) and every slot of
+// every hash table is set to zero. Configuration fields are left untouched.
+func (q *MultiHash) Reset() {
+	q.history = q.history[:0]
+	for n := range q.tables {
+		table := q.tables[n]
+		for slot := range table {
+			table[slot] = 0
+		}
+	}
+}
+
+// FindMatches appends src to the internal history buffer, searches the newly
+// added bytes for matches against the history, and returns the match list
+// that the matchEmitter builds starting from dst.
+//
+// On every call, zero-valued configuration fields are replaced with their
+// defaults and q.HashLengths is sorted in place (ascending). HashLengths must
+// not be empty: its last element is read unconditionally to find the longest
+// hash length, which panics on an empty slice.
+//
+// If bytes remain after the last emitted match, they are reported as a final
+// Match with only its Unmatched field set.
+func (q *MultiHash) FindMatches(dst []Match, src []byte) []Match {
+	// Fill in defaults for configuration fields left at their zero value.
+	if q.MaxDistance == 0 {
+		q.MaxDistance = 65535
+	}
+	if q.MinLength == 0 {
+		q.MinLength = 4
+	}
+	if q.TableBits == 0 {
+		q.TableBits = 17
+	}
+	// Allocate one table per hash length. The check only compares counts, so
+	// existing tables are kept even if TableBits has changed since they were built.
+	if len(q.tables) < len(q.HashLengths) {
+		q.tables = make([][]uint32, len(q.HashLengths))
+		for i := range q.tables {
+			q.tables[i] = make([]uint32, 1<<q.TableBits)
+		}
+	}
+	if q.Score == nil {
+		q.Score = func(m AbsoluteMatch) int {
+			return m.End - m.Start
+		}
+	}
+	// The incremental hashing below extends each hash from the previous,
+	// shorter one, so the lengths must be in ascending order.
+	sort.Ints(q.HashLengths)
+	maxHashLen := q.HashLengths[len(q.HashLengths)-1]
+
+	e := matchEmitter{Dst: dst}
+
+	if len(q.history) > q.MaxDistance*2 {
+		// Trim down the history buffer, keeping only the last MaxDistance bytes.
+		delta := len(q.history) - q.MaxDistance
+		copy(q.history, q.history[delta:])
+		q.history = q.history[:q.MaxDistance]
+
+		// Shift every stored position by the same amount; positions that fall
+		// off the front are clamped to 0, which the search treats as "no entry".
+		for _, t := range q.tables {
+			for i, v := range t {
+				newV := int(v) - delta
+				if newV < 0 {
+					newV = 0
+				}
+				t[i] = uint32(newV)
+			}
+		}
+	}
+
+	// Append src to the history buffer. From here on, src refers to the whole
+	// history and all positions are indexes into it.
+	e.NextEmit = len(q.history)
+	q.history = append(q.history, src...)
+	src = q.history
+
+	// matches stores the matches that have been found but not emitted,
+	// in reverse order. (matches[0] is the most recent one.)
+	var matches [3]AbsoluteMatch
+
+	// candidates[j] is the previous position stored under the hash of length
+	// HashLengths[j] at the current position.
+	candidates := make([]int, len(q.HashLengths))
+
+	// Stop early enough that the longest hash can always read maxHashLen bytes.
+	for i := e.NextEmit; i < len(src)-maxHashLen; i++ {
+		if matches[0] != (AbsoluteMatch{}) && i >= matches[0].End {
+			// We have found some matches, and we're far enough along that we probably
+			// won't find overlapping matches, so we might as well emit them.
+			if matches[1] != (AbsoluteMatch{}) {
+				e.trim(matches[1], matches[0].Start, q.MinLength)
+			}
+			e.emit(matches[0])
+			matches = [3]AbsoluteMatch{}
+		}
+
+		// Calculate and store the hashes.
+		// h and nb carry over between hash lengths, so each longer hash
+		// continues from the shorter one instead of starting again
+		// (xor-then-multiply, i.e. FNV-1a order).
+		h := uint32(0x811c9dc5) // FNV-32 offset basis
+		nb := 0
+		for j, hashLen := range q.HashLengths {
+			for nb < hashLen {
+				h ^= uint32(src[i+nb])
+				h *= 0x01000193 // FNV-32 prime
+				nb++
+			}
+			// Use the top TableBits bits of the hash as the table index.
+			index := h >> (32 - q.TableBits)
+			candidates[j] = int(q.tables[j][index])
+			q.tables[j][index] = uint32(i)
+		}
+
+		// Look for a match.
+		var currentMatch AbsoluteMatch
+
+		if i < matches[0].End {
+			// If we're looking for an overlapping match, we only need to check the
+			// hash that ends 2 bytes after the end of the previous match.
+			for j, candidate := range candidates {
+				if i+q.HashLengths[j] != matches[0].End+2 {
+					continue
+				}
+				// A candidate of 0 means the slot was empty.
+				if candidate == 0 || i-candidate > q.MaxDistance {
+					break
+				}
+				// Cheap 4-byte comparison to reject hash collisions before extending.
+				if binary.LittleEndian.Uint32(src[candidate:]) != binary.LittleEndian.Uint32(src[i:]) {
+					break
+				}
+				m := extendMatch2(src, i, candidate, e.NextEmit)
+				if m.End-m.Start >= q.HashLengths[j] {
+					currentMatch = m
+				}
+			}
+		} else {
+			for j, candidate := range candidates {
+				// A candidate of 0 means the slot was empty.
+				if candidate == 0 || i-candidate > q.MaxDistance {
+					break
+				}
+				if i-candidate == matches[0].Start-matches[0].Match {
+					// Don't bother to check for the same match we already have.
+					continue
+				}
+				if currentMatch.End-currentMatch.Start > q.HashLengths[j] {
+					// Don't bother with hashes that are shorter than the current match.
+					continue
+				}
+				// Cheap 4-byte comparison to reject hash collisions before extending.
+				if binary.LittleEndian.Uint32(src[candidate:]) != binary.LittleEndian.Uint32(src[i:]) {
+					break
+				}
+				m := extendMatch2(src, i, candidate, e.NextEmit)
+				// Keep the candidate only if it is longer than MinLength and
+				// scores strictly better than the best one found at this position.
+				if m.End-m.Start > q.MinLength && q.Score(m) > q.Score(currentMatch) {
+					currentMatch = m
+				}
+			}
+		}
+
+		// Ignore the new match unless it scores strictly better than the most
+		// recent pending one.
+		if currentMatch == (AbsoluteMatch{}) || q.Score(currentMatch) <= q.Score(matches[0]) {
+			continue
+		}
+
+		// Push the new match onto the front of the pending list.
+		matches = [3]AbsoluteMatch{
+			currentMatch,
+			matches[0],
+			matches[1],
+		}
+
+		if matches[2] == (AbsoluteMatch{}) {
+			continue
+		}
+
+		// We have three matches, so it's time to emit one and/or eliminate one.
+		// (matches[2] is the oldest, matches[0] the newest.)
+		switch {
+		case matches[0].Start < matches[2].End:
+			// The first and third matches overlap; discard the one in between.
+			matches = [3]AbsoluteMatch{
+				matches[0],
+				matches[2],
+				AbsoluteMatch{},
+			}
+
+		case matches[0].Start < matches[2].End+q.MinLength:
+			// The first and third matches don't overlap, but there's no room for
+			// another match between them. Emit the first match and discard the second.
+			e.emit(matches[2])
+			matches = [3]AbsoluteMatch{
+				matches[0],
+				AbsoluteMatch{},
+				AbsoluteMatch{},
+			}
+
+		default:
+			// Emit the first match, shortening it if necessary to avoid overlap with the second.
+			e.trim(matches[2], matches[1].Start, q.MinLength)
+			matches[2] = AbsoluteMatch{}
+		}
+	}
+
+	// We've found all the matches now; emit the remaining ones.
+	if matches[1] != (AbsoluteMatch{}) {
+		e.trim(matches[1], matches[0].Start, q.MinLength)
+	}
+	if matches[0] != (AbsoluteMatch{}) {
+		e.emit(matches[0])
+	}
+
+	// Report any bytes after the last emitted match as a trailing
+	// unmatched-only entry.
+	dst = e.Dst
+	if e.NextEmit < len(src) {
+		dst = append(dst, Match{
+			Unmatched: len(src) - e.NextEmit,
+		})
+	}
+
+	return dst
+}