-
Notifications
You must be signed in to change notification settings - Fork 4.7k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add minimal perfect hash domain matcher (#743)
* rename to HybridDomainMatcher & convert domain to lowercase * refactor code & add open hashing for rolling hash map * fix lint errors * update app/dns/dns.go * convert domain to lowercase in `strmatcher.go` * keep the original matcher behavior * add mph domain matcher & convert domain names to lowercase when matching * fix lint errors * fix lint errors
- Loading branch information
Showing
5 changed files
with
308 additions
and
104 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,297 @@ | ||
package strmatcher | ||
|
||
import ( | ||
"math/bits" | ||
"regexp" | ||
"sort" | ||
"strings" | ||
"unsafe" | ||
) | ||
|
||
// PrimeRK is the prime base used in Rabin-Karp algorithm.
// RollingHash, MphMatcherGroup.AddFullOrDomainPattern and
// MphMatcherGroup.Match multiply the running 32-bit hash by it before adding
// the next byte.
const PrimeRK = 16777619
|
||
// calculate the rolling murmurHash of given string | ||
func RollingHash(s string) uint32 { | ||
h := uint32(0) | ||
for i := len(s) - 1; i >= 0; i-- { | ||
h = h*PrimeRK + uint32(s[i]) | ||
} | ||
return h | ||
} | ||
|
||
// A MphMatcherGroup is divided into three parts:
// 1. `full` and `domain` patterns are matched by Rabin-Karp algorithm and minimal perfect hash table;
// 2. `substr` patterns are matched by ac automaton;
// 3. `regex` patterns are matched with the regex library.
type MphMatcherGroup struct {
	// ac matches `substr` patterns; it stays nil until AddPattern receives
	// the first Substr pattern.
	ac *ACAutomaton
	// otherMatchers holds the compiled `regex` matchers appended by AddPattern.
	otherMatchers []matcherEntry
	// rules lists the `full`/`domain` rule strings; it is filled by Build and
	// indexed by the values stored in level1.
	rules []string
	// level0 stores, per first-level bucket, the seed chosen by Build.
	// It is indexed by the rolling hash masked with level0Mask.
	level0 []uint32
	// level0Mask is len(level0)-1 (set by Build).
	level0Mask int
	// level1 stores indices into rules. It is indexed by the seeded string
	// hash masked with level1Mask.
	level1 []uint32
	// level1Mask is len(level1)-1 (set by Build).
	level1Mask int
	// count is the id returned by AddPattern and attached to regex matchers;
	// NewMphMatcherGroup initialises it to 1.
	count uint32
	// ruleMap collects rule string -> rolling hash until Build consumes it
	// and resets it to nil.
	ruleMap *map[string]uint32
}
|
||
func (g *MphMatcherGroup) AddFullOrDomainPattern(pattern string, t Type) { | ||
h := RollingHash(pattern) | ||
switch t { | ||
case Domain: | ||
(*g.ruleMap)["."+pattern] = h*PrimeRK + uint32('.') | ||
fallthrough | ||
case Full: | ||
(*g.ruleMap)[pattern] = h | ||
default: | ||
} | ||
} | ||
|
||
func NewMphMatcherGroup() *MphMatcherGroup { | ||
return &MphMatcherGroup{ | ||
ac: nil, | ||
otherMatchers: nil, | ||
rules: nil, | ||
level0: nil, | ||
level0Mask: 0, | ||
level1: nil, | ||
level1Mask: 0, | ||
count: 1, | ||
ruleMap: &map[string]uint32{}, | ||
} | ||
} | ||
|
||
// AddPattern adds a pattern to MphMatcherGroup | ||
func (g *MphMatcherGroup) AddPattern(pattern string, t Type) (uint32, error) { | ||
switch t { | ||
case Substr: | ||
if g.ac == nil { | ||
g.ac = NewACAutomaton() | ||
} | ||
g.ac.Add(pattern, t) | ||
case Full, Domain: | ||
pattern = strings.ToLower(pattern) | ||
g.AddFullOrDomainPattern(pattern, t) | ||
case Regex: | ||
r, err := regexp.Compile(pattern) | ||
if err != nil { | ||
return 0, err | ||
} | ||
g.otherMatchers = append(g.otherMatchers, matcherEntry{ | ||
m: ®exMatcher{pattern: r}, | ||
id: g.count, | ||
}) | ||
default: | ||
panic("Unknown type") | ||
} | ||
return g.count, nil | ||
} | ||
|
||
// Build builds a minimal perfect hash table and ac automaton from insert rules | ||
func (g *MphMatcherGroup) Build() { | ||
if g.ac != nil { | ||
g.ac.Build() | ||
} | ||
keyLen := len(*g.ruleMap) | ||
g.level0 = make([]uint32, nextPow2(keyLen/4)) | ||
g.level0Mask = len(g.level0) - 1 | ||
g.level1 = make([]uint32, nextPow2(keyLen)) | ||
g.level1Mask = len(g.level1) - 1 | ||
var sparseBuckets = make([][]int, len(g.level0)) | ||
var ruleIdx int | ||
for rule, hash := range *g.ruleMap { | ||
n := int(hash) & g.level0Mask | ||
g.rules = append(g.rules, rule) | ||
sparseBuckets[n] = append(sparseBuckets[n], ruleIdx) | ||
ruleIdx++ | ||
} | ||
g.ruleMap = nil | ||
var buckets []indexBucket | ||
for n, vals := range sparseBuckets { | ||
if len(vals) > 0 { | ||
buckets = append(buckets, indexBucket{n, vals}) | ||
} | ||
} | ||
sort.Sort(bySize(buckets)) | ||
|
||
occ := make([]bool, len(g.level1)) | ||
var tmpOcc []int | ||
for _, bucket := range buckets { | ||
var seed = uint32(0) | ||
for { | ||
findSeed := true | ||
tmpOcc = tmpOcc[:0] | ||
for _, i := range bucket.vals { | ||
n := int(strhashFallback(unsafe.Pointer(&g.rules[i]), uintptr(seed))) & g.level1Mask | ||
if occ[n] { | ||
for _, n := range tmpOcc { | ||
occ[n] = false | ||
} | ||
seed++ | ||
findSeed = false | ||
break | ||
} | ||
occ[n] = true | ||
tmpOcc = append(tmpOcc, n) | ||
g.level1[n] = uint32(i) | ||
} | ||
if findSeed { | ||
g.level0[bucket.n] = seed | ||
break | ||
} | ||
} | ||
} | ||
} | ||
|
||
// nextPow2 returns the smallest power of two that is greater than or equal to
// v. Values of v that are 1 or less (including negatives) yield 1.
//
// A value that already is a power of two is returned unchanged, so hash tables
// sized with it are not needlessly doubled.
func nextPow2(v int) int {
	if v <= 1 {
		return 1
	}
	// bits.Len(v-1) is the number of bits needed to represent v-1, which is
	// exactly the exponent of the smallest power of two >= v.
	return 1 << bits.Len(uint(v-1))
}
|
||
// Lookup searches for s in t and returns its index and whether it was found. | ||
func (g *MphMatcherGroup) Lookup(h uint32, s string) bool { | ||
i0 := int(h) & g.level0Mask | ||
seed := g.level0[i0] | ||
i1 := int(strhashFallback(unsafe.Pointer(&s), uintptr(seed))) & g.level1Mask | ||
n := g.level1[i1] | ||
return s == g.rules[int(n)] | ||
} | ||
|
||
// Match implements IndexMatcher.Match. | ||
func (g *MphMatcherGroup) Match(pattern string) []uint32 { | ||
result := []uint32{} | ||
hash := uint32(0) | ||
for i := len(pattern) - 1; i >= 0; i-- { | ||
hash = hash*PrimeRK + uint32(pattern[i]) | ||
if pattern[i] == '.' { | ||
if g.Lookup(hash, pattern[i:]) { | ||
result = append(result, 1) | ||
return result | ||
} | ||
} | ||
} | ||
if g.Lookup(hash, pattern) { | ||
result = append(result, 1) | ||
return result | ||
} | ||
if g.ac != nil && g.ac.Match(pattern) { | ||
result = append(result, 1) | ||
return result | ||
} | ||
for _, e := range g.otherMatchers { | ||
if e.m.Match(pattern) { | ||
result = append(result, e.id) | ||
return result | ||
} | ||
} | ||
return nil | ||
} | ||
|
||
// indexBucket is one non-empty first-level bucket built by
// MphMatcherGroup.Build.
type indexBucket struct {
	// n is the bucket's slot in level0.
	n int
	// vals holds the indices (into MphMatcherGroup.rules) of the rules whose
	// rolling hash falls into slot n.
	vals []int
}
|
||
type bySize []indexBucket | ||
|
||
func (s bySize) Len() int { return len(s) } | ||
func (s bySize) Less(i, j int) bool { return len(s[i].vals) > len(s[j].vals) } | ||
func (s bySize) Swap(i, j int) { s[i], s[j] = s[j], s[i] } | ||
|
||
// stringStruct is used by strhashFallback to read a string's data pointer and
// length through an unsafe.Pointer to the string value. It is assumed to have
// the same memory layout as a Go string header.
type stringStruct struct {
	// str points to the first byte of the string data.
	str unsafe.Pointer
	// len is the length of the string in bytes.
	len int
}
|
||
func strhashFallback(a unsafe.Pointer, h uintptr) uintptr { | ||
x := (*stringStruct)(a) | ||
return memhashFallback(x.str, h, uintptr(x.len)) | ||
} | ||
|
||
// Multipliers used by the mixing steps of memhashFallback.
const (
	// Constants for multiplication: four random odd 64-bit numbers.
	m1 = 16877499708836156737
	m2 = 2820277070424839065
	m3 = 9497967016996688599
	m4 = 15839092249703872147
)
|
||
// hashkey holds the key words mixed into the initial state of memhashFallback.
// All four are fixed to 1 here, so a hash depends only on the input bytes and
// the seed.
var hashkey = [4]uintptr{1, 1, 1, 1}
|
||
// memhashFallback hashes the s bytes starting at p, seeded with seed, and
// returns the result.
//
// The input length selects one of several mixing paths. Inputs of up to 32
// bytes are read with a few (possibly overlapping) loads anchored at the start
// and the end of the buffer. Longer inputs are consumed 32 bytes at a time
// into four accumulators, after which the remaining tail of fewer than 32
// bytes is processed by jumping back to the short-input paths. A final
// shift/multiply sequence mixes the state before it is returned.
//
// p is not dereferenced when s is 0.
// NOTE(review): this looks adapted from the Go runtime's portable memhash —
// confirm against the upstream source before changing any constant or step.
func memhashFallback(p unsafe.Pointer, seed, s uintptr) uintptr {
	// Initial state combines the seed and the input length.
	h := uint64(seed + s*hashkey[0])
tail:
	switch {
	case s == 0:
		// Nothing (left) to read.
	case s < 4:
		// 1-3 bytes: first, middle and last byte (they may coincide).
		h ^= uint64(*(*byte)(p))
		h ^= uint64(*(*byte)(add(p, s>>1))) << 8
		h ^= uint64(*(*byte)(add(p, s-1))) << 16
		h = rotl31(h*m1) * m2
	case s <= 8:
		// 4-8 bytes: first and last 32-bit words (they may overlap).
		h ^= uint64(readUnaligned32(p))
		h ^= uint64(readUnaligned32(add(p, s-4))) << 32
		h = rotl31(h*m1) * m2
	case s <= 16:
		// 9-16 bytes: first and last 64-bit words (they may overlap).
		h ^= readUnaligned64(p)
		h = rotl31(h*m1) * m2
		h ^= readUnaligned64(add(p, s-8))
		h = rotl31(h*m1) * m2
	case s <= 32:
		// 17-32 bytes: the first two and the last two 64-bit words.
		h ^= readUnaligned64(p)
		h = rotl31(h*m1) * m2
		h ^= readUnaligned64(add(p, 8))
		h = rotl31(h*m1) * m2
		h ^= readUnaligned64(add(p, s-16))
		h = rotl31(h*m1) * m2
		h ^= readUnaligned64(add(p, s-8))
		h = rotl31(h*m1) * m2
	default:
		// More than 32 bytes: four accumulators, each fed 8 bytes per round.
		v1 := h
		v2 := uint64(seed * hashkey[1])
		v3 := uint64(seed * hashkey[2])
		v4 := uint64(seed * hashkey[3])
		for s >= 32 {
			v1 ^= readUnaligned64(p)
			v1 = rotl31(v1*m1) * m2
			p = add(p, 8)
			v2 ^= readUnaligned64(p)
			v2 = rotl31(v2*m2) * m3
			p = add(p, 8)
			v3 ^= readUnaligned64(p)
			v3 = rotl31(v3*m3) * m4
			p = add(p, 8)
			v4 ^= readUnaligned64(p)
			v4 = rotl31(v4*m4) * m1
			p = add(p, 8)
			s -= 32
		}
		h = v1 ^ v2 ^ v3 ^ v4
		// p and s now describe the unread tail (fewer than 32 bytes).
		goto tail
	}

	// Final mixing of the state.
	h ^= h >> 29
	h *= m3
	h ^= h >> 32
	return uintptr(h)
}
// add returns the pointer p advanced by x bytes.
// The conversion to uintptr, the addition and the conversion back are kept in
// a single expression, as required by the unsafe.Pointer arithmetic rules.
func add(p unsafe.Pointer, x uintptr) unsafe.Pointer {
	return unsafe.Pointer(uintptr(p) + x)
}
// readUnaligned32 reads the four bytes at p and assembles them into a uint32
// in little-endian order (the byte at p is the least significant). p needs no
// particular alignment because the value is built byte by byte.
func readUnaligned32(p unsafe.Pointer) uint32 {
	b := (*[4]byte)(p)
	low := uint32(b[0]) | uint32(b[1])<<8
	high := uint32(b[2]) | uint32(b[3])<<8
	return low | high<<16
}
|
||
func rotl31(x uint64) uint64 { | ||
return (x << 31) | (x >> (64 - 31)) | ||
} | ||
// readUnaligned64 reads the eight bytes at p and assembles them into a uint64
// in little-endian order (the byte at p is the least significant). p needs no
// particular alignment because the value is built byte by byte.
func readUnaligned64(p unsafe.Pointer) uint64 {
	b := (*[8]byte)(p)
	low := uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24
	high := uint64(b[4]) | uint64(b[5])<<8 | uint64(b[6])<<16 | uint64(b[7])<<24
	return low | high<<32
}
Oops, something went wrong.