From 25c22854a1a052278aa4169c377d986eca499ecc Mon Sep 17 00:00:00 2001
From: Mark van der Velden <mark@dynom.nl>
Date: Fri, 13 Dec 2019 16:20:09 +0100
Subject: [PATCH] Adding methods for prefix filtering

---
 .golangci.toml                 |   2 +-
 finder/find.go                 |  99 ++++++++++--
 finder/find_benchmarks_test.go | 107 +++++++++++++
 finder/find_test.go            | 270 ++++++++++++++++++++++++---------
 finder/option.go               |   8 +-
 5 files changed, 397 insertions(+), 89 deletions(-)
 create mode 100644 finder/find_benchmarks_test.go

diff --git a/.golangci.toml b/.golangci.toml
index 39581f6..4d4f7a6 100644
--- a/.golangci.toml
+++ b/.golangci.toml
@@ -4,7 +4,7 @@
 
 [linters-settings]
     [linters-settings.gocyclo]
-        min-complexity = 10
+        min-complexity = 12
 
     [linters-settings.goconst]
         min-len = 2
diff --git a/finder/find.go b/finder/find.go
index 4c3a2b2..e8eeea9 100644
--- a/finder/find.go
+++ b/finder/find.go
@@ -4,6 +4,7 @@ import (
 	"context"
 	"errors"
 	"math"
+	"strings"
 	"sync"
 )
 
@@ -15,7 +16,7 @@ type Finder struct {
 	Alg             Algorithm
 	LengthTolerance float64 // A number between 0.0-1.0 (percentage) to allow for length miss-match, anything outside this is considered not similar. Set to 0 to disable.
 	lock            sync.RWMutex
-	enableBuckets   bool
+	bucketChars     uint // @todo figure out what (type of) bucket approach to take. Prefix or perhaps using an ngram/trie approach
 }
 
 // Errors
@@ -55,7 +56,8 @@ func (t *Finder) Refresh(list []string) {
 	for _, r := range list {
 		rm[r] = struct{}{}
 
-		if t.enableBuckets {
+		// @todo make the bucket prefix length configurable
+		if t.bucketChars > 0 {
 			l := rune(r[0])
 			if _, ok := rb[l]; !ok {
 				rb[l] = make([]string, 0, 128)
@@ -86,32 +88,88 @@ func (t *Finder) FindCtx(ctx context.Context, input string) (string, float64, bo
 
 // FindTopRankingCtx returns a list (of at least one element) of references with the same "best" score
 func (t *Finder) FindTopRankingCtx(ctx context.Context, input string) ([]string, float64, bool) {
+	r, s, e, _ := t.findTopRankingCtx(ctx, input, 0)
+	return r, s, e
+}
+
+// FindTopRankingPrefixCtx requires the references to have an exact prefix match on N characters of the input.
+// prefixLength cannot exceed length of input
+func (t *Finder) FindTopRankingPrefixCtx(ctx context.Context, input string, prefixLength uint) (list []string, exact bool, err error) {
+	list, _, exact, err = t.findTopRankingCtx(ctx, input, prefixLength)
+	return
+}
+
+// getRefList returns the appropriate list of references. getRefList does not deal with locks!
+func (t *Finder) getRefList(input string) []string {
+	r := rune(input[0])
+	if _, ok := t.referenceBucket[r]; ok {
+		return t.referenceBucket[r]
+	}
+
+	return t.reference
+}
+
+// GetMatchingPrefix returns up to max ref's, that start with the prefix argument
+func (t *Finder) GetMatchingPrefix(ctx context.Context, prefix string, max uint) ([]string, error) {
+
+	t.lock.RLock()
+	defer t.lock.RUnlock()
+
+	var (
+		list   = t.getRefList(prefix)
+		result = make([]string, 0, max)
+	)
+
+	for _, ref := range list {
+		select {
+		case <-ctx.Done():
+			return result, ctx.Err()
+		default:
+		}
+
+		if strings.HasPrefix(ref, prefix) {
+			result = append(result, ref)
+		}
+
+		if max > 0 && max == uint(len(result)) {
+			return result, nil
+		}
+	}
+
+	return result, nil
+}
+
+func (t *Finder) findTopRankingCtx(ctx context.Context, input string, prefixLength uint) ([]string, float64, bool, error) {
 	var hs = WorstScoreValue
 
+	if prefixLength > 0 && uint(len(input)) < prefixLength {
+		return []string{input}, WorstScoreValue, false, errors.New("prefix length exceeds input length")
+	}
+
 	t.lock.RLock()
 	defer t.lock.RUnlock()
 
 	// Exact matches
 	if _, exists := t.referenceMap[input]; exists || len(input) == 0 {
-		return []string{input}, BestScoreValue, true
+		return []string{input}, BestScoreValue, true, nil
 	}
 
-	var list []string
-	r := rune(input[0])
-	if l, ok := t.referenceBucket[r]; ok {
-		list = l
-	} else {
-		list = t.reference
-	}
+	var (
+		list      = t.getRefList(input)
+		sameScore = []string{input}
+	)
 
-	var sameScore = []string{input}
 	for _, ref := range list {
 		select {
 		case <-ctx.Done():
-			return []string{input}, WorstScoreValue, false
+			return []string{input}, WorstScoreValue, false, ctx.Err()
 		default:
 		}
 
+		if !meetsPrefixLengthMatch(prefixLength, input, ref) {
+			continue
+		}
+
 		// Test if the input length differs too much from the reference, making it an unlikely typo.
 		if !meetsLengthTolerance(t.LengthTolerance, input, ref) {
 			continue
@@ -126,7 +184,22 @@ func (t *Finder) FindTopRankingCtx(ctx context.Context, input string) ([]string,
 		}
 	}
 
-	return sameScore, hs, false
+	return sameScore, hs, false, nil
+}
+
+// meetsPrefixLengthMatch tests is the strings both match until the specified length. A 0 length returns true
+func meetsPrefixLengthMatch(length uint, input, reference string) bool {
+	if length > 0 {
+		if uint(len(reference)) < length {
+			return false
+		}
+
+		if pi := length - 1; input[0:pi] != reference[0:pi] {
+			return false
+		}
+	}
+
+	return true
 }
 
 // meetsLengthTolerance checks if the input meets the length tolerance criteria. The percentage is based on `input`
diff --git a/finder/find_benchmarks_test.go b/finder/find_benchmarks_test.go
new file mode 100644
index 0000000..90763cd
--- /dev/null
+++ b/finder/find_benchmarks_test.go
@@ -0,0 +1,107 @@
+package finder
+
+import (
+	"math"
+	"math/rand"
+	"testing"
+)
+
+// Preventing the compiler to inline
+var ceilA, ceilB int
+
+func BenchmarkCeilOrNoCeil(b *testing.B) {
+	inputLen := 64
+	threshold := 0.195
+	b.Run("No Ceil", func(b *testing.B) {
+		for i := 0; i < b.N; i++ {
+			ceilA = int((float64(inputLen) * threshold) + 0.555)
+		}
+	})
+
+	b.Run("Ceil", func(b *testing.B) {
+		for i := 0; i < b.N; i++ {
+			ceilB = int(math.Ceil(float64(inputLen) * threshold))
+		}
+	})
+
+	if ceilA != ceilB {
+		b.Errorf("Implementation failure, a:%d != b:%d", ceilA, ceilB)
+	}
+}
+
+func BenchmarkSliceOrMap(b *testing.B) {
+	// With sets of more than 20 elements, maps become more efficient. (Not including setup costs)
+	size := 20
+	var hashMap = make(map[int]int, size)
+	var list = make([]int, size)
+
+	for i := size - 1; i > 0; i-- {
+		hashMap[i] = i
+		list[i] = i
+	}
+
+	b.Run("Map", func(b *testing.B) {
+		for i := 0; i < b.N; i++ {
+			_ = hashMap[i]
+		}
+	})
+	b.Run("List", func(b *testing.B) {
+		for i := 0; i < b.N; i++ {
+			for _, v := range list {
+				_ = v
+			}
+		}
+	})
+}
+
+func BenchmarkFindWithBucket(b *testing.B) {
+	refs := generateRefs(1000, 20)
+	alg := NewJaroWinkler(.7, 4)
+
+	testRef := generateRef(20)
+	b.ReportAllocs()
+	b.Run("find with bucket", func(b *testing.B) {
+		f, _ := New(refs,
+			WithAlgorithm(alg),
+			WithLengthTolerance(0),
+			WithPrefixBuckets(false),
+		)
+
+		b.ResetTimer()
+		for i := 0; i < b.N; i++ {
+			f.Find(testRef)
+		}
+	})
+
+	b.Run("find without bucket", func(b *testing.B) {
+		f, _ := New(refs,
+			WithAlgorithm(alg),
+			WithLengthTolerance(0),
+			WithPrefixBuckets(true),
+		)
+
+		b.ResetTimer()
+		for i := 0; i < b.N; i++ {
+			f.Find(testRef)
+		}
+	})
+}
+
+func generateRefs(refNum, length uint64) []string {
+	refs := make([]string, refNum)
+	for i := uint64(0); i < refNum; i++ {
+		refs[i] = generateRef(length)
+	}
+
+	return refs
+}
+
+func generateRef(length uint64) string {
+	const alnum = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"
+
+	var b = make([]byte, length)
+	for i := uint64(0); i < length; i++ {
+		b[i] = alnum[rand.Intn(len(alnum))]
+	}
+	return string(b)
+}
diff --git a/finder/find_test.go b/finder/find_test.go
index 1b0efdc..8d34ae6 100644
--- a/finder/find_test.go
+++ b/finder/find_test.go
@@ -2,8 +2,7 @@ package finder
 
 import (
 	"context"
-	"math"
-	"math/rand"
+	"reflect"
 	"testing"
 	"time"
 )
@@ -152,102 +151,227 @@ func TestMeetsLengthTolerance(t *testing.T) {
 	}
 }
 
-// Preventing the compiler to inline
-var ceilA, ceilB int
+func TestFinder_FindTopRankingPrefixCtx(t *testing.T) {
+	refs := []string{
+		"abcdef",
+		"bcdef",
+	}
 
-func BenchmarkCeilOrNoCeil(b *testing.B) {
-	inputLen := 64
-	threshold := 0.195
-	b.Run("No Ceil", func(b *testing.B) {
-		for i := 0; i < b.N; i++ {
-			ceilA = int((float64(inputLen) * threshold) + 0.555)
-		}
-	})
+	type args struct {
+		input        string
+		prefixLength uint
+	}
+	tests := []struct {
+		name     string
+		args     args
+		wantList []string
+		wantErr  bool
+	}{
 
-	b.Run("Ceil", func(b *testing.B) {
-		for i := 0; i < b.N; i++ {
-			ceilB = int(math.Ceil(float64(inputLen) * threshold))
-		}
-	})
+		// match
+		{name: "prefix full size", args: args{input: "abcdef", prefixLength: 6}, wantList: refs[0:1]},
+		{name: "prefix partial", args: args{input: "abcdef", prefixLength: 2}, wantList: refs[0:1]},
+
+		// no match
+		{name: "prefix miss-match", args: args{input: "monkey", prefixLength: 6}, wantList: []string{"monkey"}},
 
-	if ceilA != ceilB {
-		b.Errorf("Implementation failure, a:%d != b:%d", ceilA, ceilB)
+		// errors
+		{wantErr: true, name: "len exceeds input", args: args{input: "abc", prefixLength: 6}},
 	}
-}
+	for _, tt := range tests {
+		t.Run(tt.name, func(t1 *testing.T) {
+			finder, _ := New(refs, func(sug *Finder) {
+				sug.Alg = func(a, b string) float64 {
+					return 1
+				}
+			})
+
+			ctx := context.Background()
+
+			gotList, _, err := finder.FindTopRankingPrefixCtx(ctx, tt.args.input, tt.args.prefixLength)
+			if (err != nil) != tt.wantErr {
+				t1.Errorf("FindTopRankingPrefixCtx() error = %v, wantErr %v", err, tt.wantErr)
+				return
+			}
 
-func BenchmarkSliceOrMap(b *testing.B) {
-	// With sets of more than 20 elements, maps become more efficient. (Not including setup costs)
-	size := 20
-	var hashMap = make(map[int]int, size)
-	var list = make([]int, size)
+			// On failure the input is returned.
+			if tt.wantErr {
+				tt.wantList = []string{tt.args.input}
+			}
+
+			if !reflect.DeepEqual(gotList, tt.wantList) {
+				t1.Errorf("FindTopRankingPrefixCtx() gotList = %v, want %v", gotList, tt.wantList)
+			}
+		})
+	}
+}
 
-	for i := size - 1; i > 0; i-- {
-		hashMap[i] = i
-		list[i] = i
+func TestFinder_RefreshWithBuckets(t *testing.T) {
+	refs := []string{
+		"aabb",
+		"aabbcc",
+		"aabbccdd",
+		"aabcd",
+
+		"bbcc",
+		"bbccdd",
+		"bbccddee",
+		"bbcde",
 	}
 
-	b.Run("Map", func(b *testing.B) {
-		for i := 0; i < b.N; i++ {
-			_ = hashMap[i]
+	finder, _ := New(
+		refs,
+		WithPrefixBuckets(true),
+		WithAlgorithm(func(a, b string) float64 {
+			return BestScoreValue
+		}),
+	)
+
+	t.Run("bucket size", func(t1 *testing.T) {
+		if bl := len(finder.referenceBucket); bl != 2 {
+			t.Errorf("Expecting two buckets got: %d, want: %d", bl, 2)
+
+			for chr := range finder.referenceBucket {
+				t.Logf("Bucket chars: %c", chr)
+			}
+
+			return
 		}
 	})
-	b.Run("List", func(b *testing.B) {
-		for i := 0; i < b.N; i++ {
-			for _, v := range list {
-				_ = v
+
+	t.Run("testing bucket contents", func(t *testing.T) {
+		if finder.bucketChars != 1 {
+			t.Errorf("Expecting only single rune buckets, instead it's %d", finder.bucketChars)
+			return
+		}
+
+		const want = "bbccddee"
+		list := finder.referenceBucket[rune(want[0])]
+		var match bool
+		for _, v := range list {
+			if v == want {
+				match = true
+				break
 			}
 		}
-	})
-}
 
-func BenchmarkFindWithBucket(b *testing.B) {
-	refs := generateRefs(1000, 20)
-	alg := NewJaroWinkler(.7, 4)
-
-	testRef := generateRef(20)
-	b.ReportAllocs()
-	b.Run("find with bucket", func(b *testing.B) {
-		f, _ := New(refs,
-			WithAlgorithm(alg),
-			WithLengthTolerance(0),
-			WithBuckets(false),
-		)
-
-		b.ResetTimer()
-		for i := 0; i < b.N; i++ {
-			f.Find(testRef)
+		if !match {
+			t.Errorf("Expected to find %q in the reference bucket", want)
 		}
 	})
 
-	b.Run("find without bucket", func(b *testing.B) {
-		f, _ := New(refs,
-			WithAlgorithm(alg),
-			WithLengthTolerance(0),
-			WithBuckets(true),
-		)
+	t.Run("testing bucket similarity", func(t *testing.T) {
+		if finder.bucketChars != 1 {
+			t.Errorf("Expecting only single rune buckets, instead it's %d", finder.bucketChars)
+			return
+		}
 
-		b.ResetTimer()
-		for i := 0; i < b.N; i++ {
-			f.Find(testRef)
+		input := "beer"
+		bucketRune := rune(input[0])
+
+		// making the test a bit more robust
+		var want = make([]string, 0)
+		for _, v := range refs {
+			if rune(v[0]) == bucketRune {
+				want = append(want, v)
+			}
+		}
+
+		// due to the very liberal "algorithm", anything matches, as long as the bucket prefix is respected
+		got, _, _, _ := finder.findTopRankingCtx(context.Background(), input, 0)
+		if !reflect.DeepEqual(got, want) {
+			t.Errorf("Expected the reference bucket to be %+v, instead got: %+v", want, got)
 		}
 	})
 }
 
-func generateRefs(refNum, length uint64) []string {
-	refs := make([]string, refNum)
-	for i := uint64(0); i < refNum; i++ {
-		refs[i] = generateRef(length)
+func TestFinder_GetMatchingPrefix(t *testing.T) {
+	refs := []string{
+		"ada lovelace",
+		"grace hopper",
+		"ida rhodes",
+		"sophie wilson",
+		"aminata sana congo",
+		"mary lou jepsen",
+		"shafi goldwasser",
+	}
+
+	type args struct {
+		prefix string
+		max    uint
 	}
+	tests := []struct {
+		name    string
+		args    args
+		want    []string
+		wantErr bool
+	}{
+		{name: "single", args: args{prefix: "a", max: 1}, want: []string{"ada lovelace"}},
+		{name: "multiple", args: args{prefix: "a", max: 2}, want: []string{"ada lovelace", "aminata sana congo"}},
+		{name: "no max == all", args: args{prefix: "a", max: 0}, want: []string{"ada lovelace", "aminata sana congo"}},
+	}
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+
+			sug, _ := New(refs, WithAlgorithm(exampleAlgorithm))
+			got, err := sug.GetMatchingPrefix(context.Background(), tt.args.prefix, tt.args.max)
+			if (err != nil) != tt.wantErr {
+				t.Errorf("GetMatchingPrefix() error = %v, wantErr %v", err, tt.wantErr)
+				return
+			}
+			if !reflect.DeepEqual(got, tt.want) {
+				t.Errorf("GetMatchingPrefix() got = %v, want %v", got, tt.want)
+			}
+		})
+	}
+
+	t.Run("Testing context cancellation", func(t *testing.T) {
+		ctx, cancel := context.WithCancel(context.Background())
+		cancel()
+
+		sug, _ := New(refs, WithPrefixBuckets(true), WithAlgorithm(exampleAlgorithm))
+		list, err := sug.GetMatchingPrefix(ctx, "a", 2)
 
-	return refs
+		if err == nil {
+			t.Errorf("GetMatchingPrefix() error = %v", err)
+		}
+
+		t.Logf("list: %+v", list)
+	})
 }
 
-func generateRef(length uint64) string {
-	const alnum = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"
+func TestFinder_getRefList(t *testing.T) {
 
-	var b = make([]byte, length)
-	for i := uint64(0); i < length; i++ {
-		b[i] = alnum[rand.Intn(len(alnum))]
+	refs := []string{
+		"balloon",
+		"basketball",
+		"sea lion",
+		"celebration",
+		"sunshine",
+	}
+
+	tests := []struct {
+		name  string
+		input string
+		want  uint
+	}{
+		// input exists in ref list. With buckets enabled we should see 2 results starting with the same letter
+		{name: "Selecting bucket ref list", input: "balloon", want: 2},
+
+		// no match, the entire list should be returned
+		{name: "Selecting full ref list", input: "lion", want: 5},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			sug, err := New(refs, WithPrefixBuckets(true), WithAlgorithm(exampleAlgorithm))
+			if err != nil {
+				t.Errorf("b00m headshot %+v", err)
+			}
+
+			if got := sug.getRefList(tt.input); uint(len(got)) != tt.want {
+				t.Errorf("getRefList() = %v, want %v", got, tt.want)
+			}
+		})
 	}
-	return string(b)
 }
diff --git a/finder/option.go b/finder/option.go
index 50ebbb5..c711db6 100644
--- a/finder/option.go
+++ b/finder/option.go
@@ -19,8 +19,12 @@ func WithLengthTolerance(t float64) Option {
 	}
 }
 
-func WithBuckets(enable bool) Option {
+// WithPrefixBuckets splits the reference list into buckets by their first letter. Assuming the first letter is correct this
+// can significantly improve the performance by reducing the size of the list to scan through.
+func WithPrefixBuckets(enable bool) Option {
 	return func(s *Finder) {
-		s.enableBuckets = enable
+		if enable {
+			s.bucketChars = 1
+		}
 	}
 }