From 71822941e60db914628ef3a67ef47e9ab402f3d1 Mon Sep 17 00:00:00 2001 From: Hamish Date: Sat, 24 Nov 2018 20:47:49 -0800 Subject: [PATCH 1/2] Add fuzzy prefixing tree walking --- node.go | 30 ++++++++++ pkg/levenshtein/levenshtein.go | 76 +++++++++++++++++++++++++ pkg/levenshtein/levenshtein_test.go | 87 +++++++++++++++++++++++++++++ 3 files changed, 193 insertions(+) create mode 100644 pkg/levenshtein/levenshtein.go create mode 100644 pkg/levenshtein/levenshtein_test.go diff --git a/node.go b/node.go index 7a065e7..8d00b91 100644 --- a/node.go +++ b/node.go @@ -3,6 +3,8 @@ package iradix import ( "bytes" "sort" + + "code.sajari.com/go-immutable-radix/pkg/levenshtein" ) // WalkFn is used when walking the tree. Takes a @@ -10,6 +12,10 @@ import ( // be terminated. type WalkFn func(k []byte, v interface{}) bool +// DistFn is used when walking the tree to adjust +// for the Levenshtein difference in the path prefix +type DistFn func(v interface{}, distance int) interface{} + // leafNode is used to represent a value type leafNode struct { mutateCh chan struct{} @@ -242,6 +248,30 @@ func (n *Node) WalkPrefix(prefix []byte, fn WalkFn) { } } +// WalkFuzzyPrefix is used to walk the tree under a prefix. The depth is the fuzzy +// Levenshtein distance tolerated during the recursive walk down the tree. +// This assumes the first edge char of the prefix is correct, otherwise this could be +// very, very slow. 
The DistFn is used to augment v, such that external consumers +// can modify the value of v before the WalkFn handles it +func (n *Node) WalkFuzzyPrefix(prefix []byte, depth int, dfn DistFn, fn WalkFn) { + for { + // Look for an edge (assume the first letter of the prefix is ok) + _, n = n.getEdge(prefix[0]) + if n == nil { + break + } + + recursiveWalk(n, func(k []byte, v interface{}) bool { + distance, ok := levenshtein.HasPrefix(k, prefix, depth) + if !ok { + return false + } + v = dfn(v, distance) + return fn(k, v) + }) + } +} + // WalkPath is used to walk the tree, but only visiting nodes // from the root down to a given leaf. Where WalkPrefix walks // all the entries *under* the given prefix, this walks the diff --git a/pkg/levenshtein/levenshtein.go b/pkg/levenshtein/levenshtein.go new file mode 100644 index 0000000..8a18674 --- /dev/null +++ b/pkg/levenshtein/levenshtein.go @@ -0,0 +1,76 @@ +package levenshtein + +import ( + "bytes" +) + +// newrun creates a run for computing +// distances to input string b. +func newrun(b []byte) *run { + br := []rune(string(b)) + return &run{ + b: br, + d: make([]int, len(br)+1), + } +} + +type run struct { + b []rune // should be the larger of the two comparisons + d []int +} + +// dist computes the Levenshtein distance from a to the runner string. 
+func (l *run) dist(a []byte) int { + d := l.d + b := l.b + + for j := range d { + d[j] = j + } + + for _, ca := range string(a) { + j := 1 + dj1 := d[0] + d[0]++ + for _, cb := range b { + mn := min(d[j]+1, d[j-1]+1) // delete & insert + if cb != ca { + mn = min(mn, dj1+1) // change + } else { + mn = min(mn, dj1) // matched + } + + dj1, d[j] = d[j], mn + j++ + } + } + + return d[len(d)-1] +} + +func min(a, b int) int { + if a <= b { + return a + } + return b +} + +// HasPrefix is analogous to bytes.HasPrefix except the prefix +// can fuzzy match based on the Levenshtein distance threshold +// between s and prefix +func HasPrefix(s, prefix []byte, threshold int) (int, bool) { + l := len(prefix) + if l > len(s) { + return 0, false + } + if len(s) > l { + s = s[:l] + } + if bytes.Equal(s, prefix) { + return 0, true + } + r := newrun(s) + d := r.dist(prefix) + + return d, d <= threshold +} diff --git a/pkg/levenshtein/levenshtein_test.go b/pkg/levenshtein/levenshtein_test.go new file mode 100644 index 0000000..6abe517 --- /dev/null +++ b/pkg/levenshtein/levenshtein_test.go @@ -0,0 +1,87 @@ +package levenshtein + +import "testing" + +func TestHasPrefix(t *testing.T) { + type args struct { + s []byte + prefix []byte + threshold int + } + tests := []struct { + name string + args args + want bool + }{ + { + name: "exact", + args: args{ + s: []byte("exact"), + prefix: []byte("exact"), + threshold: 1, + }, + want: true, + }, + { + name: "depth 1, same len", + args: args{ + s: []byte("exect"), + prefix: []byte("exact"), + threshold: 1, + }, + want: true, + }, + { + name: "depth 2, same len, threshold 1", + args: args{ + s: []byte("exeet"), + prefix: []byte("exact"), + threshold: 1, + }, + want: false, + }, + { + name: "depth 3, same len, threshold 3", + args: args{ + s: []byte("eeeet"), + prefix: []byte("exact"), + threshold: 3, + }, + want: true, + }, + { + name: "short string, depth 1, same len, threshold 1", + args: args{ + s: []byte("eea"), + prefix: []byte("exa"), + 
threshold: 1, + }, + want: true, + }, + { + name: "depth 1, same len, threshold 1", + args: args{ + s: []byte("greee"), + prefix: []byte("greek"), + threshold: 1, + }, + want: true, + }, + { + name: "equal strings, same len, threshold 0", + args: args{ + s: []byte("equal"), + prefix: []byte("equal"), + threshold: 0, + }, + want: true, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if _, got := HasPrefix(tt.args.s, tt.args.prefix, tt.args.threshold); got != tt.want { + t.Errorf("HasPrefix() = %v, want %v", got, tt.want) + } + }) + } +} From 22c8a9b33467b4379d87a61db8d93100f59ee04c Mon Sep 17 00:00:00 2001 From: Hamish Date: Fri, 4 Oct 2019 18:45:33 +1000 Subject: [PATCH 2/2] dist function signature change --- node.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/node.go b/node.go index 8d00b91..04cf224 100644 --- a/node.go +++ b/node.go @@ -14,7 +14,7 @@ type WalkFn func(k []byte, v interface{}) bool // DistFn is used when walking the tree to adjust // for the levenshtein difference in the path prefix -type DistFn func(v interface{}, distance int) interface{} +type DistFn func(k []byte, v interface{}, distance int) interface{} // leafNode is used to represent a value type leafNode struct { @@ -266,7 +266,7 @@ func (n *Node) WalkFuzzyPrefix(prefix []byte, depth int, dfn DistFn, fn WalkFn) if !ok { return false } - v = dfn(v, distance) + v = dfn(k, v, distance) return fn(k, v) }) }