Skip to content

Commit

Permalink
Adding methods for prefix filtering
Browse files Browse the repository at this point in the history
  • Loading branch information
Dynom committed Dec 13, 2019
1 parent 7773d83 commit 25c2285
Show file tree
Hide file tree
Showing 5 changed files with 397 additions and 89 deletions.
2 changes: 1 addition & 1 deletion .golangci.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

[linters-settings]
[linters-settings.gocyclo]
min-complexity = 10
min-complexity = 12

[linters-settings.goconst]
min-len = 2
Expand Down
99 changes: 86 additions & 13 deletions finder/find.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"context"
"errors"
"math"
"strings"
"sync"
)

Expand All @@ -15,7 +16,7 @@ type Finder struct {
Alg Algorithm
LengthTolerance float64 // A number between 0.0-1.0 (percentage) to allow for length miss-match, anything outside this is considered not similar. Set to 0 to disable.
lock sync.RWMutex
enableBuckets bool
bucketChars uint // @todo figure out what (type of) bucket approach to take. Prefix or perhaps using an ngram/trie approach
}

// Errors
Expand Down Expand Up @@ -55,7 +56,8 @@ func (t *Finder) Refresh(list []string) {
for _, r := range list {
rm[r] = struct{}{}

if t.enableBuckets {
// @todo make the bucket prefix length configurable
if t.bucketChars > 0 {
l := rune(r[0])
if _, ok := rb[l]; !ok {
rb[l] = make([]string, 0, 128)
Expand Down Expand Up @@ -86,32 +88,88 @@ func (t *Finder) FindCtx(ctx context.Context, input string) (string, float64, bo

// FindTopRankingCtx returns every reference that shares the same "best" score for input (the list always
// holds at least one element), together with that score and a flag telling whether the match was exact.
// It delegates to findTopRankingCtx with a prefix length of 0; the error of that call is discarded.
func (t *Finder) FindTopRankingCtx(ctx context.Context, input string) ([]string, float64, bool) {
	matches, score, exact, _ := t.findTopRankingCtx(ctx, input, 0)

	return matches, score, exact
}

// FindTopRankingPrefixCtx is the prefix-restricted variant of the top-ranking lookup: references are required
// to have an exact prefix match with input, where prefixLength controls how many leading characters are compared.
// prefixLength cannot exceed the length of input.
// The score computed by the underlying search is not part of the result.
func (t *Finder) FindTopRankingPrefixCtx(ctx context.Context, input string, prefixLength uint) (list []string, exact bool, err error) {
	matches, _, isExact, findErr := t.findTopRankingCtx(ctx, input, prefixLength)

	return matches, isExact, findErr
}

// getRefList returns the list of references that input should be compared against.
// When a bucket exists for the first byte of input that bucket is returned; otherwise (including for an
// empty input, which has no first byte) the complete reference list is returned.
// getRefList does not deal with locks!
func (t *Finder) getRefList(input string) []string {
	// Indexing an empty string panics, and an empty input can never select a bucket.
	if len(input) == 0 {
		return t.reference
	}

	// Buckets are keyed on the first byte of the string, converted to a rune.
	if bucket, ok := t.referenceBucket[rune(input[0])]; ok {
		return bucket
	}

	return t.reference
}

// GetMatchingPrefix returns up to max references that start with prefix.
// A max of 0 means there is no limit. An empty prefix matches every reference.
// When ctx is done before the scan completes, the matches collected so far are returned together
// with ctx.Err().
func (t *Finder) GetMatchingPrefix(ctx context.Context, prefix string, max uint) ([]string, error) {
	t.lock.RLock()
	defer t.lock.RUnlock()

	// An empty prefix has no first character to select a bucket with, so scan the complete list.
	list := t.reference
	if prefix != "" {
		list = t.getRefList(prefix)
	}

	// max is caller supplied and may be arbitrarily large: never reserve more room than there are
	// candidates. Without a limit the slice simply grows on demand.
	var capacity uint
	if max > 0 {
		capacity = max
		if n := uint(len(list)); n < capacity {
			capacity = n
		}
	}

	result := make([]string, 0, capacity)

	for _, ref := range list {
		// Non-blocking cancellation check, once per candidate.
		select {
		case <-ctx.Done():
			return result, ctx.Err()
		default:
		}

		if !strings.HasPrefix(ref, prefix) {
			continue
		}

		result = append(result, ref)

		// Stop as soon as the requested number of matches has been collected.
		if max > 0 && uint(len(result)) == max {
			return result, nil
		}
	}

	return result, nil
}

func (t *Finder) findTopRankingCtx(ctx context.Context, input string, prefixLength uint) ([]string, float64, bool, error) {
var hs = WorstScoreValue

if prefixLength > 0 && uint(len(input)) < prefixLength {
return []string{input}, WorstScoreValue, false, errors.New("prefix length exceeds input length")
}

t.lock.RLock()
defer t.lock.RUnlock()

// Exact matches
if _, exists := t.referenceMap[input]; exists || len(input) == 0 {
return []string{input}, BestScoreValue, true
return []string{input}, BestScoreValue, true, nil
}

var list []string
r := rune(input[0])
if l, ok := t.referenceBucket[r]; ok {
list = l
} else {
list = t.reference
}
var (
list = t.getRefList(input)
sameScore = []string{input}
)

var sameScore = []string{input}
for _, ref := range list {
select {
case <-ctx.Done():
return []string{input}, WorstScoreValue, false
return []string{input}, WorstScoreValue, false, ctx.Err()
default:
}

if !meetsPrefixLengthMatch(prefixLength, input, ref) {
continue
}

// Test if the input length differs too much from the reference, making it an unlikely typo.
if !meetsLengthTolerance(t.LengthTolerance, input, ref) {
continue
Expand All @@ -126,7 +184,22 @@ func (t *Finder) FindTopRankingCtx(ctx context.Context, input string) ([]string,
}
}

return sameScore, hs, false
return sameScore, hs, false, nil
}

// meetsPrefixLengthMatch tests whether input and reference are identical in their first `length` bytes.
// A length of 0 disables the check and always returns true. When either string is shorter than length
// a full prefix match is impossible, so false is returned.
func meetsPrefixLengthMatch(length uint, input, reference string) bool {
	if length == 0 {
		return true
	}

	if uint(len(input)) < length || uint(len(reference)) < length {
		return false
	}

	// The upper bound of a slice expression is exclusive, so [:length] covers exactly `length` bytes.
	return input[:length] == reference[:length]
}

// meetsLengthTolerance checks if the input meets the length tolerance criteria. The percentage is based on `input`
Expand Down
107 changes: 107 additions & 0 deletions finder/find_benchmarks_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
package finder

import (
"math"
"math/rand"
"testing"
)

// Package level result holders for BenchmarkCeilOrNoCeil. Writing the results here is intended to keep
// the compiler from optimising the benchmarked expressions away.
var (
	ceilA int
	ceilB int
)

// BenchmarkCeilOrNoCeil compares two ways of turning inputLen*threshold into an int: adding a constant
// before truncating versus calling math.Ceil. Once both sub-benchmarks have run, it reports an error
// when the two approaches ended up with different values.
func BenchmarkCeilOrNoCeil(b *testing.B) {
	var (
		inputLen  = 64
		threshold = 0.195
	)

	withoutCeil := func(b *testing.B) {
		for n := b.N; n > 0; n-- {
			ceilA = int((float64(inputLen) * threshold) + 0.555)
		}
	}

	withCeil := func(b *testing.B) {
		for n := b.N; n > 0; n-- {
			ceilB = int(math.Ceil(float64(inputLen) * threshold))
		}
	}

	b.Run("No Ceil", withoutCeil)
	b.Run("Ceil", withCeil)

	if ceilA == ceilB {
		return
	}

	b.Errorf("Implementation failure, a:%d != b:%d", ceilA, ceilB)
}

// BenchmarkSliceOrMap compares a map lookup against walking a slice holding the same number of elements.
// Original author's observation: with sets of more than 20 elements, maps become more efficient.
// (Not including setup costs)
func BenchmarkSliceOrMap(b *testing.B) {
	const size = 20

	var (
		hashMap = make(map[int]int, size)
		list    = make([]int, size)
	)

	// Populate every slot, including index 0, so both containers really hold `size` elements.
	for i := 0; i < size; i++ {
		hashMap[i] = i
		list[i] = i
	}

	b.Run("Map", func(b *testing.B) {
		for i := 0; i < b.N; i++ {
			// Wrap the key so each lookup hits an existing entry instead of missing once i >= size.
			_ = hashMap[i%size]
		}
	})

	b.Run("List", func(b *testing.B) {
		for i := 0; i < b.N; i++ {
			// Full walk of the slice, i.e. the cost of a lookup that has to visit every element.
			for _, v := range list {
				_ = v
			}
		}
	})
}

// BenchmarkFindWithBucket measures Find on a set of 1000 random references of 20 characters, once with
// prefix buckets and once without, so that the two configurations can be compared.
func BenchmarkFindWithBucket(b *testing.B) {
	refs := generateRefs(1000, 20)
	alg := NewJaroWinkler(.7, 4)
	testRef := generateRef(20)

	// run registers one sub-benchmark; buckets is handed to WithPrefixBuckets.
	// NOTE(review): assumes WithPrefixBuckets(true) is what enables bucketing - confirm against the option.
	run := func(name string, buckets bool) {
		b.Run(name, func(b *testing.B) {
			// NOTE(review): the second return value of New is ignored, as before.
			f, _ := New(refs,
				WithAlgorithm(alg),
				WithLengthTolerance(0),
				WithPrefixBuckets(buckets),
			)

			// Allocation reporting has to be requested on the sub-benchmark itself;
			// asking for it on the parent does not carry over.
			b.ReportAllocs()
			b.ResetTimer()
			for i := 0; i < b.N; i++ {
				f.Find(testRef)
			}
		})
	}

	// The labels now describe the configuration that is actually being measured.
	run("find with bucket", true)
	run("find without bucket", false)
}

// generateRefs builds a slice of refNum references, each produced by generateRef with the given length.
func generateRefs(refNum, length uint64) []string {
	out := make([]string, refNum)
	for idx := range out {
		out[idx] = generateRef(length)
	}

	return out
}

// generateRef returns a string of `length` bytes, each picked via math/rand from the
// ASCII letters (lower and upper case) and digits.
func generateRef(length uint64) string {
	const alnum = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"

	buf := make([]byte, length)
	for pos := range buf {
		buf[pos] = alnum[rand.Intn(len(alnum))]
	}

	return string(buf)
}
Loading

0 comments on commit 25c2285

Please sign in to comment.