Adding methods for prefix filtering

Dynom · Dec 13, 2019 · 25c2285 · 25c2285
1 parent 7773d83
commit 25c2285
Show file tree

Hide file tree

Showing 5 changed files with 397 additions and 89 deletions.
diff --git a/.golangci.toml b/.golangci.toml
@@ -4,7 +4,7 @@
 
 [linters-settings]
     [linters-settings.gocyclo]
-        min-complexity = 10
+        min-complexity = 12
 
     [linters-settings.goconst]
         min-len = 2

diff --git a/finder/find.go b/finder/find.go
@@ -4,6 +4,7 @@ import (
 	"context"
 	"errors"
 	"math"
+	"strings"
 	"sync"
 )
 
@@ -15,7 +16,7 @@ type Finder struct {
 	Alg             Algorithm
 	LengthTolerance float64 // A number between 0.0-1.0 (percentage) to allow for length miss-match, anything outside this is considered not similar. Set to 0 to disable.
 	lock            sync.RWMutex
-	enableBuckets   bool
+	bucketChars     uint // @todo figure out what (type of) bucket approach to take. Prefix or perhaps using an ngram/trie approach
 }
 
 // Errors
@@ -55,7 +56,8 @@ func (t *Finder) Refresh(list []string) {
 	for _, r := range list {
 		rm[r] = struct{}{}
 
-		if t.enableBuckets {
+		// @todo make the bucket prefix length configurable
+		if t.bucketChars > 0 {
 			l := rune(r[0])
 			if _, ok := rb[l]; !ok {
 				rb[l] = make([]string, 0, 128)
@@ -86,32 +88,88 @@ func (t *Finder) FindCtx(ctx context.Context, input string) (string, float64, bo
 
 // FindTopRankingCtx returns a list (of at least one element) of references with the same "best" score
 func (t *Finder) FindTopRankingCtx(ctx context.Context, input string) ([]string, float64, bool) {
+	// A prefix length of 0 switches the prefix check off.
+	matches, score, exact, _ := t.findTopRankingCtx(ctx, input, 0)
+	return matches, score, exact
+}
+
+// FindTopRankingPrefixCtx runs the same search as FindTopRankingCtx, but only considers references that pass the
+// prefix check against input for prefixLength. The score is not part of the result. An error is returned when
+// prefixLength exceeds the length of input, or when ctx is done before the search completes.
+func (t *Finder) FindTopRankingPrefixCtx(ctx context.Context, input string, prefixLength uint) (list []string, exact bool, err error) {
+	matches, _, isExact, findErr := t.findTopRankingCtx(ctx, input, prefixLength)
+	return matches, isExact, findErr
+}
+
+// getRefList returns the references to scan for input: the bucket keyed by input's first byte when one exists,
+// otherwise t.reference. An empty input has no first byte and therefore always yields t.reference.
+// getRefList does not deal with locks!
+func (t *Finder) getRefList(input string) []string {
+	if len(input) == 0 {
+		return t.reference
+	}
+
+	// Fetch the bucket and its presence with a single map lookup.
+	if bucket, ok := t.referenceBucket[rune(input[0])]; ok {
+		return bucket
+	}
+
+	return t.reference
+}
+
+// GetMatchingPrefix returns up to max ref's that start with the prefix argument. A max of 0 means no limit.
+// When ctx is done before the scan finishes, the matches collected so far are returned together with ctx.Err().
+func (t *Finder) GetMatchingPrefix(ctx context.Context, prefix string, max uint) ([]string, error) {
+	t.lock.RLock()
+	defer t.lock.RUnlock()
+
+	// An empty prefix has no first byte to pick a bucket with, so scan t.reference directly.
+	list := t.reference
+	if len(prefix) > 0 {
+		list = t.getRefList(prefix)
+	}
+
+	// max is caller supplied, so never reserve more room than there are candidates to return.
+	capacity := uint(len(list))
+	if max < capacity {
+		capacity = max
+	}
+	result := make([]string, 0, capacity)
+
+	for _, ref := range list {
+		select {
+		case <-ctx.Done():
+			return result, ctx.Err()
+		default:
+		}
+
+		if strings.HasPrefix(ref, prefix) {
+			result = append(result, ref)
+
+			// The limit can only be reached right after an append.
+			if max > 0 && max == uint(len(result)) {
+				return result, nil
+			}
+		}
+	}
+
+	return result, nil
+}
+
+func (t *Finder) findTopRankingCtx(ctx context.Context, input string, prefixLength uint) ([]string, float64, bool, error) {
 	var hs = WorstScoreValue
 
+	if prefixLength > 0 && uint(len(input)) < prefixLength {
+		return []string{input}, WorstScoreValue, false, errors.New("prefix length exceeds input length")
+	}
+
 	t.lock.RLock()
 	defer t.lock.RUnlock()
 
 	// Exact matches
 	if _, exists := t.referenceMap[input]; exists || len(input) == 0 {
-		return []string{input}, BestScoreValue, true
+		return []string{input}, BestScoreValue, true, nil
 	}
 
-	var list []string
-	r := rune(input[0])
-	if l, ok := t.referenceBucket[r]; ok {
-		list = l
-	} else {
-		list = t.reference
-	}
+	var (
+		list      = t.getRefList(input)
+		sameScore = []string{input}
+	)
 
-	var sameScore = []string{input}
 	for _, ref := range list {
 		select {
 		case <-ctx.Done():
-			return []string{input}, WorstScoreValue, false
+			return []string{input}, WorstScoreValue, false, ctx.Err()
 		default:
 		}
 
+		if !meetsPrefixLengthMatch(prefixLength, input, ref) {
+			continue
+		}
+
 		// Test if the input length differs too much from the reference, making it an unlikely typo.
 		if !meetsLengthTolerance(t.LengthTolerance, input, ref) {
 			continue
@@ -126,7 +184,22 @@ func (t *Finder) FindTopRankingCtx(ctx context.Context, input string) ([]string,
 		}
 	}
 
-	return sameScore, hs, false
+	return sameScore, hs, false, nil
+}
+
+// meetsPrefixLengthMatch reports whether input and reference share the same first `length` bytes.
+// A length of 0 disables the check and always returns true. A string shorter than length never matches.
+func meetsPrefixLengthMatch(length uint, input, reference string) bool {
+	if length == 0 {
+		return true
+	}
+
+	// Guard both operands so the slice expressions below cannot go out of range.
+	if uint(len(input)) < length || uint(len(reference)) < length {
+		return false
+	}
+
+	// Compare the whole prefix: [:length] covers exactly `length` bytes.
+	return input[:length] == reference[:length]
 }
 
 // meetsLengthTolerance checks if the input meets the length tolerance criteria. The percentage is based on `input`

diff --git a/finder/find_benchmarks_test.go b/finder/find_benchmarks_test.go
@@ -0,0 +1,107 @@
+package finder
+
+import (
+	"math"
+	"math/rand"
+	"testing"
+)
+
+// Package-level sinks: the benchmarks store their results here so the compiler cannot discard the measured work
+var ceilA, ceilB int
+
+// BenchmarkCeilOrNoCeil compares two ways of turning inputLen*threshold into an int: adding 0.555 and truncating,
+// versus calling math.Ceil. Each variant writes to its own package-level variable (ceilA / ceilB).
+func BenchmarkCeilOrNoCeil(b *testing.B) {
+	inputLen := 64
+	threshold := 0.195
+	b.Run("No Ceil", func(b *testing.B) {
+		for i := 0; i < b.N; i++ {
+			// 64 * 0.195 = 12.48; adding 0.555 and truncating gives 13 for these inputs.
+			ceilA = int((float64(inputLen) * threshold) + 0.555)
+		}
+	})
+
+	b.Run("Ceil", func(b *testing.B) {
+		for i := 0; i < b.N; i++ {
+			ceilB = int(math.Ceil(float64(inputLen) * threshold))
+		}
+	})
+
+	// Sanity check: both variants must have produced the same value.
+	// NOTE(review): presumably this fails when only one of the sub-benchmarks is selected, since the other
+	// variable then keeps its zero value — confirm.
+	if ceilA != ceilB {
+		b.Errorf("Implementation failure, a:%d != b:%d", ceilA, ceilB)
+	}
+}
+
+// BenchmarkSliceOrMap compares a single map lookup with a full pass over a slice of the same number of elements.
+func BenchmarkSliceOrMap(b *testing.B) {
+	// With sets of more than 20 elements, maps become more efficient. (Not including setup costs)
+	size := 20
+	var hashMap = make(map[int]int, size)
+	var list = make([]int, size)
+
+	// Fill every index, including 0, so both containers really hold `size` elements.
+	for i := 0; i < size; i++ {
+		hashMap[i] = i
+		list[i] = i
+	}
+
+	b.Run("Map", func(b *testing.B) {
+		for i := 0; i < b.N; i++ {
+			// NOTE(review): for i >= size the key is absent, so most lookups are misses — confirm this is intended.
+			_ = hashMap[i]
+		}
+	})
+	b.Run("List", func(b *testing.B) {
+		for i := 0; i < b.N; i++ {
+			for _, v := range list {
+				_ = v
+			}
+		}
+	})
+}
+
+// BenchmarkFindWithBucket measures Find over 1000 random references of 20 bytes each, once with prefix buckets
+// requested and once without. Both runs share the same references, algorithm and search term.
+func BenchmarkFindWithBucket(b *testing.B) {
+	refs := generateRefs(1000, 20)
+	alg := NewJaroWinkler(.7, 4)
+
+	testRef := generateRef(20)
+	b.Run("find with bucket", func(b *testing.B) {
+		f, err := New(refs,
+			WithAlgorithm(alg),
+			WithLengthTolerance(0),
+			WithPrefixBuckets(true),
+		)
+		if err != nil {
+			b.Fatal(err)
+		}
+
+		// Requested per sub-benchmark so the allocation stats are reported for this run.
+		b.ReportAllocs()
+		b.ResetTimer()
+		for i := 0; i < b.N; i++ {
+			f.Find(testRef)
+		}
+	})
+
+	b.Run("find without bucket", func(b *testing.B) {
+		f, err := New(refs,
+			WithAlgorithm(alg),
+			WithLengthTolerance(0),
+			WithPrefixBuckets(false),
+		)
+		if err != nil {
+			b.Fatal(err)
+		}
+
+		b.ReportAllocs()
+		b.ResetTimer()
+		for i := 0; i < b.N; i++ {
+			f.Find(testRef)
+		}
+	})
+}
+
+// generateRefs builds refNum references by calling generateRef(length) once per element.
+func generateRefs(refNum, length uint64) []string {
+	out := make([]string, refNum)
+	for idx := range out {
+		out[idx] = generateRef(length)
+	}
+
+	return out
+}
+
+// generateRef returns a string of length bytes, each picked with rand.Intn from the letters a-z, A-Z and the
+// digits 0-9.
+func generateRef(length uint64) string {
+	const alnum = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"
+
+	buf := make([]byte, length)
+	for pos := range buf {
+		buf[pos] = alnum[rand.Intn(len(alnum))]
+	}
+
+	return string(buf)
+}