From 25c22854a1a052278aa4169c377d986eca499ecc Mon Sep 17 00:00:00 2001 From: Mark van der Velden Date: Fri, 13 Dec 2019 16:20:09 +0100 Subject: [PATCH] Adding methods for prefix filtering --- .golangci.toml | 2 +- finder/find.go | 99 ++++++++++-- finder/find_benchmarks_test.go | 107 +++++++++++++ finder/find_test.go | 270 ++++++++++++++++++++++++--------- finder/option.go | 8 +- 5 files changed, 397 insertions(+), 89 deletions(-) create mode 100644 finder/find_benchmarks_test.go diff --git a/.golangci.toml b/.golangci.toml index 39581f6..4d4f7a6 100644 --- a/.golangci.toml +++ b/.golangci.toml @@ -4,7 +4,7 @@ [linters-settings] [linters-settings.gocyclo] - min-complexity = 10 + min-complexity = 12 [linters-settings.goconst] min-len = 2 diff --git a/finder/find.go b/finder/find.go index 4c3a2b2..e8eeea9 100644 --- a/finder/find.go +++ b/finder/find.go @@ -4,6 +4,7 @@ import ( "context" "errors" "math" + "strings" "sync" ) @@ -15,7 +16,7 @@ type Finder struct { Alg Algorithm LengthTolerance float64 // A number between 0.0-1.0 (percentage) to allow for length miss-match, anything outside this is considered not similar. Set to 0 to disable. lock sync.RWMutex - enableBuckets bool + bucketChars uint // @todo figure out what (type of) bucket approach to take. Prefix or perhaps using an ngram/trie approach } // Errors @@ -55,7 +56,8 @@ func (t *Finder) Refresh(list []string) { for _, r := range list { rm[r] = struct{}{} - if t.enableBuckets { + // @todo make the bucket prefix length configurable + if t.bucketChars > 0 { l := rune(r[0]) if _, ok := rb[l]; !ok { rb[l] = make([]string, 0, 128) @@ -86,32 +88,88 @@ func (t *Finder) FindCtx(ctx context.Context, input string) (string, float64, bo // FindTopRankingCtx returns a list (of at least one element) of references with the same "best" score func (t *Finder) FindTopRankingCtx(ctx context.Context, input string) ([]string, float64, bool) { + r, s, e, _ := t.findTopRankingCtx(ctx, input, 0) + return r, s, e +} + +// FindTopRankingPrefixCtx requires the references to have an exact prefix match on N characters of the input. +// prefixLength cannot exceed length of input +func (t *Finder) FindTopRankingPrefixCtx(ctx context.Context, input string, prefixLength uint) (list []string, exact bool, err error) { + list, _, exact, err = t.findTopRankingCtx(ctx, input, prefixLength) + return +} + +// getRefList returns the appropriate list of references. getRefList does not deal with locks! +func (t *Finder) getRefList(input string) []string { + r := rune(input[0]) + if _, ok := t.referenceBucket[r]; ok { + return t.referenceBucket[r] + } + + return t.reference +} + +// GetMatchingPrefix returns up to max ref's, that start with the prefix argument +func (t *Finder) GetMatchingPrefix(ctx context.Context, prefix string, max uint) ([]string, error) { + + t.lock.RLock() + defer t.lock.RUnlock() + + var ( + list = t.getRefList(prefix) + result = make([]string, 0, max) + ) + + for _, ref := range list { + select { + case <-ctx.Done(): + return result, ctx.Err() + default: + } + + if strings.HasPrefix(ref, prefix) { + result = append(result, ref) + } + + if max > 0 && max == uint(len(result)) { + return result, nil + } + } + + return result, nil +} + +func (t *Finder) findTopRankingCtx(ctx context.Context, input string, prefixLength uint) ([]string, float64, bool, error) { var hs = WorstScoreValue + if prefixLength > 0 && uint(len(input)) < prefixLength { + return []string{input}, WorstScoreValue, false, errors.New("prefix length exceeds input length") + } + t.lock.RLock() defer t.lock.RUnlock() // Exact matches if _, exists := t.referenceMap[input]; exists || len(input) == 0 { - return []string{input}, BestScoreValue, true + return []string{input}, BestScoreValue, true, nil } - var list []string - r := rune(input[0]) - if l, ok := t.referenceBucket[r]; ok { - list = l - } else { - list = t.reference - } + var ( + list = t.getRefList(input) + sameScore = []string{input} + ) - var sameScore = []string{input} for _, ref := range list { select { case <-ctx.Done(): - return []string{input}, WorstScoreValue, false + return []string{input}, WorstScoreValue, false, ctx.Err() default: } + if !meetsPrefixLengthMatch(prefixLength, input, ref) { + continue + } + // Test if the input length differs too much from the reference, making it an unlikely typo. if !meetsLengthTolerance(t.LengthTolerance, input, ref) { continue @@ -126,7 +184,22 @@ func (t *Finder) FindTopRankingCtx(ctx context.Context, input string) ([]string, } } - return sameScore, hs, false + return sameScore, hs, false, nil +} + +// meetsPrefixLengthMatch tests is the strings both match until the specified length. A 0 length returns true +func meetsPrefixLengthMatch(length uint, input, reference string) bool { + if length > 0 { + if uint(len(reference)) < length { + return false + } + + if pi := length - 1; input[0:pi] != reference[0:pi] { + return false + } + } + + return true } // meetsLengthTolerance checks if the input meets the length tolerance criteria. The percentage is based on `input` diff --git a/finder/find_benchmarks_test.go b/finder/find_benchmarks_test.go new file mode 100644 index 0000000..90763cd --- /dev/null +++ b/finder/find_benchmarks_test.go @@ -0,0 +1,107 @@ +package finder + +import ( + "math" + "math/rand" + "testing" +) + +// Preventing the compiler to inline +var ceilA, ceilB int + +func BenchmarkCeilOrNoCeil(b *testing.B) { + inputLen := 64 + threshold := 0.195 + b.Run("No Ceil", func(b *testing.B) { + for i := 0; i < b.N; i++ { + ceilA = int((float64(inputLen) * threshold) + 0.555) + } + }) + + b.Run("Ceil", func(b *testing.B) { + for i := 0; i < b.N; i++ { + ceilB = int(math.Ceil(float64(inputLen) * threshold)) + } + }) + + if ceilA != ceilB { + b.Errorf("Implementation failure, a:%d != b:%d", ceilA, ceilB) + } +} + +func BenchmarkSliceOrMap(b *testing.B) { + // With sets of more than 20 elements, maps become more efficient. (Not including setup costs) + size := 20 + var hashMap = make(map[int]int, size) + var list = make([]int, size) + + for i := size - 1; i > 0; i-- { + hashMap[i] = i + list[i] = i + } + + b.Run("Map", func(b *testing.B) { + for i := 0; i < b.N; i++ { + _ = hashMap[i] + } + }) + b.Run("List", func(b *testing.B) { + for i := 0; i < b.N; i++ { + for _, v := range list { + _ = v + } + } + }) +} + +func BenchmarkFindWithBucket(b *testing.B) { + refs := generateRefs(1000, 20) + alg := NewJaroWinkler(.7, 4) + + testRef := generateRef(20) + b.ReportAllocs() + b.Run("find with bucket", func(b *testing.B) { + f, _ := New(refs, + WithAlgorithm(alg), + WithLengthTolerance(0), + WithPrefixBuckets(false), + ) + + b.ResetTimer() + for i := 0; i < b.N; i++ { + f.Find(testRef) + } + }) + + b.Run("find without bucket", func(b *testing.B) { + f, _ := New(refs, + WithAlgorithm(alg), + WithLengthTolerance(0), + WithPrefixBuckets(true), + ) + + b.ResetTimer() + for i := 0; i < b.N; i++ { + f.Find(testRef) + } + }) +} + +func generateRefs(refNum, length uint64) []string { + refs := make([]string, refNum) + for i := uint64(0); i < refNum; i++ { + refs[i] = generateRef(length) + } + + return refs +} + +func generateRef(length uint64) string { + const alnum = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789" + + var b = make([]byte, length) + for i := uint64(0); i < length; i++ { + b[i] = alnum[rand.Intn(len(alnum))] + } + return string(b) +} diff --git a/finder/find_test.go b/finder/find_test.go index 1b0efdc..8d34ae6 100644 --- a/finder/find_test.go +++ b/finder/find_test.go @@ -2,8 +2,7 @@ package finder import ( "context" - "math" - "math/rand" + "reflect" "testing" "time" ) @@ -152,102 +151,227 @@ func TestMeetsLengthTolerance(t *testing.T) { } } -// Preventing the compiler to inline -var ceilA, ceilB int +func TestFinder_FindTopRankingPrefixCtx(t *testing.T) { + refs := []string{ + "abcdef", + "bcdef", + } -func BenchmarkCeilOrNoCeil(b *testing.B) { - inputLen := 64 - threshold := 0.195 - b.Run("No Ceil", func(b *testing.B) { - for i := 0; i < b.N; i++ { - ceilA = int((float64(inputLen) * threshold) + 0.555) - } - }) + type args struct { + input string + prefixLength uint + } + tests := []struct { + name string + args args + wantList []string + wantErr bool + }{ - b.Run("Ceil", func(b *testing.B) { - for i := 0; i < b.N; i++ { - ceilB = int(math.Ceil(float64(inputLen) * threshold)) - } - }) + // match + {name: "prefix full size", args: args{input: "abcdef", prefixLength: 6}, wantList: refs[0:1]}, + {name: "prefix partial", args: args{input: "abcdef", prefixLength: 2}, wantList: refs[0:1]}, + + // no match + {name: "prefix miss-match", args: args{input: "monkey", prefixLength: 6}, wantList: []string{"monkey"}}, - if ceilA != ceilB { - b.Errorf("Implementation failure, a:%d != b:%d", ceilA, ceilB) + // errors + {wantErr: true, name: "len exceeds input", args: args{input: "abc", prefixLength: 6}}, } -} + for _, tt := range tests { + t.Run(tt.name, func(t1 *testing.T) { + finder, _ := New(refs, func(sug *Finder) { + sug.Alg = func(a, b string) float64 { + return 1 + } + }) + + ctx := context.Background() + + gotList, _, err := finder.FindTopRankingPrefixCtx(ctx, tt.args.input, tt.args.prefixLength) + if (err != nil) != tt.wantErr { + t1.Errorf("FindTopRankingPrefixCtx() error = %v, wantErr %v", err, tt.wantErr) + return + } -func BenchmarkSliceOrMap(b *testing.B) { - // With sets of more than 20 elements, maps become more efficient. (Not including setup costs) - size := 20 - var hashMap = make(map[int]int, size) - var list = make([]int, size) + // On failure the input is returned. + if tt.wantErr { + tt.wantList = []string{tt.args.input} + } + + if !reflect.DeepEqual(gotList, tt.wantList) { + t1.Errorf("FindTopRankingPrefixCtx() gotList = %v, want %v", gotList, tt.wantList) + } + }) + } +} - for i := size - 1; i > 0; i-- { - hashMap[i] = i - list[i] = i +func TestFinder_RefreshWithBuckets(t *testing.T) { + refs := []string{ + "aabb", + "aabbcc", + "aabbccdd", + "aabcd", + + "bbcc", + "bbccdd", + "bbccddee", + "bbcde", } - b.Run("Map", func(b *testing.B) { - for i := 0; i < b.N; i++ { - _ = hashMap[i] + finder, _ := New( + refs, + WithPrefixBuckets(true), + WithAlgorithm(func(a, b string) float64 { + return BestScoreValue + }), + ) + + t.Run("bucket size", func(t1 *testing.T) { + if bl := len(finder.referenceBucket); bl != 2 { + t.Errorf("Expecting two buckets got: %d, want: %d", bl, 2) + + for chr := range finder.referenceBucket { + t.Logf("Bucket chars: %c", chr) + } + + return } }) - b.Run("List", func(b *testing.B) { - for i := 0; i < b.N; i++ { - for _, v := range list { - _ = v + + t.Run("testing bucket contents", func(t *testing.T) { + if finder.bucketChars != 1 { + t.Errorf("Expecting only single rune buckets, instead it's %d", finder.bucketChars) + return + } + + const want = "bbccddee" + list := finder.referenceBucket[rune(want[0])] + var match bool + for _, v := range list { + if v == want { + match = true + break } } - }) -} -func BenchmarkFindWithBucket(b *testing.B) { - refs := generateRefs(1000, 20) - alg := NewJaroWinkler(.7, 4) - - testRef := generateRef(20) - b.ReportAllocs() - b.Run("find with bucket", func(b *testing.B) { - f, _ := New(refs, - WithAlgorithm(alg), - WithLengthTolerance(0), - WithBuckets(false), - ) - - b.ResetTimer() - for i := 0; i < b.N; i++ { - f.Find(testRef) + if !match { + t.Errorf("Expected to find %q in the reference bucket", want) } }) - b.Run("find without bucket", func(b *testing.B) { - f, _ := New(refs, - WithAlgorithm(alg), - WithLengthTolerance(0), - WithBuckets(true), - ) + t.Run("testing bucket similarity", func(t *testing.T) { + if finder.bucketChars != 1 { + t.Errorf("Expecting only single rune buckets, instead it's %d", finder.bucketChars) + return + } - b.ResetTimer() - for i := 0; i < b.N; i++ { - f.Find(testRef) + input := "beer" + bucketRune := rune(input[0]) + + // making the test a bit more robust + var want = make([]string, 0) + for _, v := range refs { + if rune(v[0]) == bucketRune { + want = append(want, v) + } + } + + // due to the very liberal "algorithm", anything matches, as long as the bucket prefix is respected + got, _, _, _ := finder.findTopRankingCtx(context.Background(), input, 0) + if !reflect.DeepEqual(got, want) { + t.Errorf("Expected the reference bucket to be %+v, instead got: %+v", want, got) } }) } -func generateRefs(refNum, length uint64) []string { - refs := make([]string, refNum) - for i := uint64(0); i < refNum; i++ { - refs[i] = generateRef(length) +func TestFinder_GetMatchingPrefix(t *testing.T) { + refs := []string{ + "ada lovelace", + "grace hopper", + "ida rhodes", + "sophie wilson", + "aminata sana congo", + "mary lou jepsen", + "shafi goldwasser", + } + + type args struct { + prefix string + max uint } + tests := []struct { + name string + args args + want []string + wantErr bool + }{ + {name: "single", args: args{prefix: "a", max: 1}, want: []string{"ada lovelace"}}, + {name: "multiple", args: args{prefix: "a", max: 2}, want: []string{"ada lovelace", "aminata sana congo"}}, + {name: "no max == all", args: args{prefix: "a", max: 0}, want: []string{"ada lovelace", "aminata sana congo"}}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + + sug, _ := New(refs, WithAlgorithm(exampleAlgorithm)) + got, err := sug.GetMatchingPrefix(context.Background(), tt.args.prefix, tt.args.max) + if (err != nil) != tt.wantErr { + t.Errorf("GetMatchingPrefix() error = %v, wantErr %v", err, tt.wantErr) + return + } + if !reflect.DeepEqual(got, tt.want) { + t.Errorf("GetMatchingPrefix() got = %v, want %v", got, tt.want) + } + }) + } + + t.Run("Testing context cancellation", func(t *testing.T) { + ctx, cancel := context.WithCancel(context.Background()) + cancel() + + sug, _ := New(refs, WithPrefixBuckets(true), WithAlgorithm(exampleAlgorithm)) + list, err := sug.GetMatchingPrefix(ctx, "a", 2) - return refs + if err == nil { + t.Errorf("GetMatchingPrefix() error = %v", err) + } + + t.Logf("list: %+v", list) + }) } -func generateRef(length uint64) string { - const alnum = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789" +func TestFinder_getRefList(t *testing.T) { - var b = make([]byte, length) - for i := uint64(0); i < length; i++ { - b[i] = alnum[rand.Intn(len(alnum))] + refs := []string{ + "balloon", + "basketball", + "sea lion", + "celebration", + "sunshine", + } + + tests := []struct { + name string + input string + want uint + }{ + // input exists in ref list. With buckets enabled we should see 2 results starting with the same letter + {name: "Selecting bucket ref list", input: "balloon", want: 2}, + + // no match, the entire list should be returned + {name: "Selecting full ref list", input: "lion", want: 5}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + sug, err := New(refs, WithPrefixBuckets(true), WithAlgorithm(exampleAlgorithm)) + if err != nil { + t.Errorf("b00m headshot %+v", err) + } + + if got := sug.getRefList(tt.input); uint(len(got)) != tt.want { + t.Errorf("getRefList() = %v, want %v", got, tt.want) + } + }) } - return string(b) } diff --git a/finder/option.go b/finder/option.go index 50ebbb5..c711db6 100644 --- a/finder/option.go +++ b/finder/option.go @@ -19,8 +19,12 @@ func WithLengthTolerance(t float64) Option { } } -func WithBuckets(enable bool) Option { +// WithPrefixBuckets splits the reference list into buckets by their first letter. Assuming the first letter is correct this +// can significantly improve the performance by reducing the size of the list to scan through. +func WithPrefixBuckets(enable bool) Option { return func(s *Finder) { - s.enableBuckets = enable + if enable { + s.bucketChars = 1 + } } }