-
Notifications
You must be signed in to change notification settings - Fork 77
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add fuzzy prefixing tree walking #23
base: master
Are you sure you want to change the base?
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3,13 +3,19 @@ package iradix | |
import ( | ||
"bytes" | ||
"sort" | ||
|
||
"code.sajari.com/go-immutable-radix/pkg/levenshtein" | ||
) | ||
|
||
// WalkFn is used when walking the tree. It takes a | ||
// key and value, and returns true if iteration | ||
// should be terminated. | ||
type WalkFn func(k []byte, v interface{}) bool | ||
|
||
// DistFn is used when walking the tree to adjust a value | ||
// for the Levenshtein distance of the path prefix. It takes | ||
// the stored value v and the distance, and returns the value | ||
// that is passed on to the WalkFn. | ||
type DistFn func(v interface{}, distance int) interface{} | ||
|
||
// leafNode is used to represent a value | ||
type leafNode struct { | ||
mutateCh chan struct{} | ||
|
@@ -242,6 +248,30 @@ func (n *Node) WalkPrefix(prefix []byte, fn WalkFn) { | |
} | ||
} | ||
|
||
// WalkPrefix is used to walk the tree under a prefix. The depth is the fuzzy | ||
// Levenshtein distance tolerated during the recursive walk down the tree. | ||
// This assumes the first edge char of the prefix is correct, otherwise this could be | ||
// very, very slow. The DistFn is used to augment v, such that external consumers | ||
// can modify the value of v before the WalkFn handles it | ||
func (n *Node) WalkFuzzyPrefix(prefix []byte, depth int, dfn DistFn, fn WalkFn) { | ||
for { | ||
// Look for an edge (assume the first letter of the prefix is ok) | ||
_, n = n.getEdge(prefix[0]) | ||
if n == nil { | ||
break | ||
} | ||
|
||
recursiveWalk(n, func(k []byte, v interface{}) bool { | ||
distance, ok := levenshtein.HasPrefix(k, prefix, depth) | ||
if !ok { | ||
return false | ||
} | ||
v = dfn(v, distance) | ||
return fn(k, v) | ||
}) | ||
} | ||
} | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The code looks good to me at a glance, but test cases to show the behaviour and edge cases seem important! There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yeah, I'm wondering how best to do this? The surface changes are pretty simple. Most of the complexity is in the levenshtein pkg, which is where I focused on the tests. Anyhow, happy to add, let me know your thoughts? |
||
// WalkPath is used to walk the tree, but only visiting nodes | ||
// from the root down to a given leaf. Where WalkPrefix walks | ||
// all the entries *under* the given prefix, this walks the | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,76 @@ | ||
package levenshtein | ||
|
||
import ( | ||
"bytes" | ||
) | ||
|
||
// newrun builds a run for measuring distances to the input string b.
// b is decoded into runes once, and the scratch row is sized to hold
// one cell per rune plus one.
func newrun(b []byte) *run {
	target := []rune(string(b))

	r := new(run)
	r.b = target
	r.d = make([]int, len(target)+1)
	return r
}

// run carries the fixed comparison string and the scratch row that
// dist works in.
type run struct {
	b []rune // should be the larger of the two comparisons
	d []int  // scratch row, len(b)+1 cells
}
|
||
// dist computes the Levenshtein distance from a to the runner string. | ||
func (l *run) dist(a []byte) int { | ||
d := l.d | ||
b := l.b | ||
|
||
for j := range d { | ||
d[j] = j | ||
} | ||
|
||
for _, ca := range string(a) { | ||
j := 1 | ||
dj1 := d[0] | ||
d[0]++ | ||
for _, cb := range b { | ||
mn := min(d[j]+1, d[j-1]+1) // delete & insert | ||
if cb != ca { | ||
mn = min(mn, dj1+1) // change | ||
} else { | ||
mn = min(mn, dj1) // matched | ||
} | ||
|
||
dj1, d[j] = d[j], mn | ||
j++ | ||
} | ||
} | ||
|
||
return d[len(d)-1] | ||
} | ||
|
||
// min returns the smaller of a and b.
func min(a, b int) int {
	if b < a {
		return b
	}
	return a
}
|
||
// HasPrefix is analogous to bytes.HasPrefix except the prefix | ||
// can fuzzy match based on the Levenshtein distance threshold | ||
// between s and prefix | ||
func HasPrefix(s, prefix []byte, threshold int) (int, bool) { | ||
l := len(prefix) | ||
if l > len(s) { | ||
return 0, false | ||
} | ||
if len(s) > l { | ||
s = s[:l] | ||
} | ||
if bytes.Equal(s, prefix) { | ||
return 0, true | ||
} | ||
r := newrun(s) | ||
d := r.dist(prefix) | ||
|
||
return d, d <= threshold | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,87 @@ | ||
package levenshtein | ||
|
||
import "testing" | ||
|
||
func TestHasPrefix(t *testing.T) { | ||
type args struct { | ||
s []byte | ||
prefix []byte | ||
threshold int | ||
} | ||
tests := []struct { | ||
name string | ||
args args | ||
want bool | ||
}{ | ||
{ | ||
name: "exact", | ||
args: args{ | ||
s: []byte("exact"), | ||
prefix: []byte("exact"), | ||
threshold: 1, | ||
}, | ||
want: true, | ||
}, | ||
{ | ||
name: "depth 1, same len", | ||
args: args{ | ||
s: []byte("exect"), | ||
prefix: []byte("exact"), | ||
threshold: 1, | ||
}, | ||
want: true, | ||
}, | ||
{ | ||
name: "depth 2, same len, threshold 1", | ||
args: args{ | ||
s: []byte("exeet"), | ||
prefix: []byte("exact"), | ||
threshold: 1, | ||
}, | ||
want: false, | ||
}, | ||
{ | ||
name: "depth 3, same len, threshold 3", | ||
args: args{ | ||
s: []byte("eeeet"), | ||
prefix: []byte("exact"), | ||
threshold: 3, | ||
}, | ||
want: true, | ||
}, | ||
{ | ||
name: "short string, depth 1, same len, threshold 1", | ||
args: args{ | ||
s: []byte("eea"), | ||
prefix: []byte("exa"), | ||
threshold: 1, | ||
}, | ||
want: true, | ||
}, | ||
{ | ||
name: "depth 1, same len, threshold 1", | ||
args: args{ | ||
s: []byte("greee"), | ||
prefix: []byte("greek"), | ||
threshold: 1, | ||
}, | ||
want: true, | ||
}, | ||
{ | ||
name: "equal strings, same len, threshold 0", | ||
args: args{ | ||
s: []byte("equal"), | ||
prefix: []byte("equal"), | ||
threshold: 0, | ||
}, | ||
want: true, | ||
}, | ||
} | ||
for _, tt := range tests { | ||
t.Run(tt.name, func(t *testing.T) { | ||
if _, got := HasPrefix(tt.args.s, tt.args.prefix, tt.args.threshold); got != tt.want { | ||
t.Errorf("HasPrefix() = %v, want %v", got, tt.want) | ||
} | ||
}) | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This looks like a goimports fail - shouldn’t it be importing the levenshtein subpackage also added here?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yeah this is my bad. I'd had a few beers and sent the PR a bit prematurely by accident.