-
Notifications
You must be signed in to change notification settings - Fork 55
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
24b2bfa
commit 578645e
Showing 2 changed files with 251 additions and 0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,227 @@ | ||
package matchfinder | ||
|
||
import ( | ||
"encoding/binary" | ||
"sort" | ||
) | ||
|
||
// MultiHash is an implementation of the MatchFinder | ||
// interface that uses multiple hashes of different lengths. | ||
type MultiHash struct { | ||
// MaxDistance is the maximum distance (in bytes) to look back for | ||
// a match. The default is 65535. | ||
MaxDistance int | ||
|
||
// MinLength is the length of the shortest match to return. | ||
// The default is 4. | ||
MinLength int | ||
|
||
// HashLengths is a list of the hashes to use, with the number of | ||
// bytes to use for each. For example, to to use 4-byte, 7-byte, and | ||
// 10-byte hashes, set HashLengths to []int{4, 7, 10}. | ||
// The minimum length is 4. | ||
HashLengths []int | ||
|
||
// TableBits is the number of bits in the hash table indexes. | ||
// The default is 17 (128K entries). | ||
TableBits int | ||
|
||
// Score is the rating function used to choose the best match. | ||
// The default is the length of the match. | ||
Score func(AbsoluteMatch) int | ||
|
||
tables [][]uint32 | ||
|
||
history []byte | ||
} | ||
|
||
func (q *MultiHash) Reset() { | ||
for _, t := range q.tables { | ||
for i := range t { | ||
t[i] = 0 | ||
} | ||
} | ||
q.history = q.history[:0] | ||
} | ||
|
||
func (q *MultiHash) FindMatches(dst []Match, src []byte) []Match { | ||
if q.MaxDistance == 0 { | ||
q.MaxDistance = 65535 | ||
} | ||
if q.MinLength == 0 { | ||
q.MinLength = 4 | ||
} | ||
if q.TableBits == 0 { | ||
q.TableBits = 17 | ||
} | ||
if len(q.tables) < len(q.HashLengths) { | ||
q.tables = make([][]uint32, len(q.HashLengths)) | ||
for i := range q.tables { | ||
q.tables[i] = make([]uint32, 1<<q.TableBits) | ||
} | ||
} | ||
if q.Score == nil { | ||
q.Score = func(m AbsoluteMatch) int { | ||
return m.End - m.Start | ||
} | ||
} | ||
sort.Ints(q.HashLengths) | ||
maxHashLen := q.HashLengths[len(q.HashLengths)-1] | ||
|
||
e := matchEmitter{Dst: dst} | ||
|
||
if len(q.history) > q.MaxDistance*2 { | ||
// Trim down the history buffer. | ||
delta := len(q.history) - q.MaxDistance | ||
copy(q.history, q.history[delta:]) | ||
q.history = q.history[:q.MaxDistance] | ||
|
||
for _, t := range q.tables { | ||
for i, v := range t { | ||
newV := int(v) - delta | ||
if newV < 0 { | ||
newV = 0 | ||
} | ||
t[i] = uint32(newV) | ||
} | ||
} | ||
} | ||
|
||
// Append src to the history buffer. | ||
e.NextEmit = len(q.history) | ||
q.history = append(q.history, src...) | ||
src = q.history | ||
|
||
// matches stores the matches that have been found but not emitted, | ||
// in reverse order. (matches[0] is the most recent one.) | ||
var matches [3]AbsoluteMatch | ||
|
||
candidates := make([]int, len(q.HashLengths)) | ||
|
||
for i := e.NextEmit; i < len(src)-maxHashLen; i++ { | ||
if matches[0] != (AbsoluteMatch{}) && i >= matches[0].End { | ||
// We have found some matches, and we're far enough along that we probably | ||
// won't find overlapping matches, so we might as well emit them. | ||
if matches[1] != (AbsoluteMatch{}) { | ||
e.trim(matches[1], matches[0].Start, q.MinLength) | ||
} | ||
e.emit(matches[0]) | ||
matches = [3]AbsoluteMatch{} | ||
} | ||
|
||
// Calculate and store the hashes. | ||
h := uint32(0x811c9dc5) // FNV-32 offset basis | ||
nb := 0 | ||
for j, hashLen := range q.HashLengths { | ||
for nb < hashLen { | ||
h ^= uint32(src[i+nb]) | ||
h *= 0x01000193 // FNV-32 prime | ||
nb++ | ||
} | ||
index := h >> (32 - q.TableBits) | ||
candidates[j] = int(q.tables[j][index]) | ||
q.tables[j][index] = uint32(i) | ||
} | ||
|
||
// Look for a match. | ||
var currentMatch AbsoluteMatch | ||
|
||
if i < matches[0].End { | ||
// If we're looking for an overlapping match, we only need to check the | ||
// hash that ends 2 bytes after the end of the previous match. | ||
for j, candidate := range candidates { | ||
if i+q.HashLengths[j] != matches[0].End+2 { | ||
continue | ||
} | ||
if candidate == 0 || i-candidate > q.MaxDistance { | ||
break | ||
} | ||
if binary.LittleEndian.Uint32(src[candidate:]) != binary.LittleEndian.Uint32(src[i:]) { | ||
break | ||
} | ||
m := extendMatch2(src, i, candidate, e.NextEmit) | ||
if m.End-m.Start >= q.HashLengths[j] { | ||
currentMatch = m | ||
} | ||
} | ||
} else { | ||
for j, candidate := range candidates { | ||
if candidate == 0 || i-candidate > q.MaxDistance { | ||
break | ||
} | ||
if i-candidate == matches[0].Start-matches[0].Match { | ||
// Don't bother to check for the same match we already have. | ||
continue | ||
} | ||
if currentMatch.End-currentMatch.Start > q.HashLengths[j] { | ||
// Don't bother with hashes that are shorter than the current match. | ||
continue | ||
} | ||
if binary.LittleEndian.Uint32(src[candidate:]) != binary.LittleEndian.Uint32(src[i:]) { | ||
break | ||
} | ||
m := extendMatch2(src, i, candidate, e.NextEmit) | ||
if m.End-m.Start > q.MinLength && q.Score(m) > q.Score(currentMatch) { | ||
currentMatch = m | ||
} | ||
} | ||
} | ||
|
||
if currentMatch == (AbsoluteMatch{}) || q.Score(currentMatch) <= q.Score(matches[0]) { | ||
continue | ||
} | ||
|
||
matches = [3]AbsoluteMatch{ | ||
currentMatch, | ||
matches[0], | ||
matches[1], | ||
} | ||
|
||
if matches[2] == (AbsoluteMatch{}) { | ||
continue | ||
} | ||
|
||
// We have three matches, so it's time to emit one and/or eliminate one. | ||
switch { | ||
case matches[0].Start < matches[2].End: | ||
// The first and third matches overlap; discard the one in between. | ||
matches = [3]AbsoluteMatch{ | ||
matches[0], | ||
matches[2], | ||
AbsoluteMatch{}, | ||
} | ||
|
||
case matches[0].Start < matches[2].End+q.MinLength: | ||
// The first and third matches don't overlap, but there's no room for | ||
// another match between them. Emit the first match and discard the second. | ||
e.emit(matches[2]) | ||
matches = [3]AbsoluteMatch{ | ||
matches[0], | ||
AbsoluteMatch{}, | ||
AbsoluteMatch{}, | ||
} | ||
|
||
default: | ||
// Emit the first match, shortening it if necessary to avoid overlap with the second. | ||
e.trim(matches[2], matches[1].Start, q.MinLength) | ||
matches[2] = AbsoluteMatch{} | ||
} | ||
} | ||
|
||
// We've found all the matches now; emit the remaining ones. | ||
if matches[1] != (AbsoluteMatch{}) { | ||
e.trim(matches[1], matches[0].Start, q.MinLength) | ||
} | ||
if matches[0] != (AbsoluteMatch{}) { | ||
e.emit(matches[0]) | ||
} | ||
|
||
dst = e.Dst | ||
if e.NextEmit < len(src) { | ||
dst = append(dst, Match{ | ||
Unmatched: len(src) - e.NextEmit, | ||
}) | ||
} | ||
|
||
return dst | ||
} |