Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add fuzzy prefixing tree walking #23

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 30 additions & 0 deletions node.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,19 @@ package iradix
import (
"bytes"
"sort"

"code.sajari.com/go-immutable-radix/pkg/levenshtein"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This looks like a goimports fail - shouldn’t it be importing the levenshtein sub-package that is also added here?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
"code.sajari.com/go-immutable-radix/pkg/levenshtein"
"github.com/hashicorp/go-immutable-radix/pkg/levenshtein"

Yeah this is my bad. I'd had a few beers and sent the PR a bit prematurely by accident.

)

// WalkFn is the callback invoked while walking the tree. It receives a
// key and its value, and returns true if iteration should be terminated.
type WalkFn func(k []byte, v interface{}) bool

// DistFn is used when walking the tree with a fuzzy prefix. It is given a
// value and the Levenshtein distance computed for that value's key against
// the prefix, and returns the value that is handed to the WalkFn in its place.
type DistFn func(v interface{}, distance int) interface{}

// leafNode is used to represent a value
type leafNode struct {
mutateCh chan struct{}
Expand Down Expand Up @@ -242,6 +248,30 @@ func (n *Node) WalkPrefix(prefix []byte, fn WalkFn) {
}
}

// WalkFuzzyPrefix is used to walk the tree under a prefix, tolerating a
// Levenshtein distance of up to depth between prefix and the leading part of
// each key.
//
// This assumes the first edge char of the prefix is correct: only the subtree
// reached through the edge labelled prefix[0] is visited, otherwise this could
// be very, very slow. An empty prefix has no first char to anchor on, so the
// walk starts at n itself.
//
// For every key accepted by levenshtein.HasPrefix, dfn is called with the
// value and the computed distance so that external consumers can modify the
// value before fn handles it. A nil dfn leaves the value untouched. fn returns
// true to terminate the walk.
func (n *Node) WalkFuzzyPrefix(prefix []byte, depth int, dfn DistFn, fn WalkFn) {
	start := n
	if len(prefix) > 0 {
		// Look for an edge (assume the first letter of the prefix is ok).
		// This is done exactly once: prefix is never consumed, so descending
		// again would revisit keys and ignore a termination request from fn.
		_, start = n.getEdge(prefix[0])
		if start == nil {
			return
		}
	}

	recursiveWalk(start, func(k []byte, v interface{}) bool {
		distance, ok := levenshtein.HasPrefix(k, prefix, depth)
		if !ok {
			// Not a match; returning false keeps the walk going.
			return false
		}
		if dfn != nil {
			v = dfn(v, distance)
		}
		return fn(k, v)
	})
}

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The code looks good to me at a glance but test cases to show the behaviour and edge cases seem important!

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, I'm wondering how best to do this. The surface changes are pretty simple; most of the complexity is in the levenshtein pkg, which is where I focused the tests. Anyhow, happy to add some here too - let me know your thoughts.

// WalkPath is used to walk the tree, but only visiting nodes
// from the root down to a given leaf. Where WalkPrefix walks
// all the entries *under* the given prefix, this walks the
Expand Down
76 changes: 76 additions & 0 deletions pkg/levenshtein/levenshtein.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
package levenshtein

import (
"bytes"
)

// newrun prepares a run for measuring distances against b. The bytes are
// decoded into runes once up front, and the scratch row is sized to hold
// one entry per rune plus one.
func newrun(b []byte) *run {
	runes := []rune(string(b))

	r := new(run)
	r.b = runes
	r.d = make([]int, len(runes)+1)
	return r
}

// run holds a reference string and the scratch space used to compute
// Levenshtein distances against it.
type run struct {
b []rune // reference string as runes; should be the larger of the two comparisons
d []int // working row of distances; newrun allocates it with len(b)+1 entries
}

// dist returns the Levenshtein distance between a and the run's reference
// string, comparing rune by rune. Only a single row of the distance table is
// kept: the run's scratch slice is reset on entry and overwritten in place.
func (r *run) dist(a []byte) int {
	row, ref := r.d, r.b

	// Row for the empty string: reaching the first i runes of ref costs i.
	for i := range row {
		row[i] = i
	}

	for _, ra := range string(a) {
		// diag is the previous row's value one column to the left.
		diag := row[0]
		row[0]++
		for i, rb := range ref {
			col := i + 1

			// Matching runes carry the diagonal over; a change costs one more.
			cost := diag
			if rb != ra {
				cost++
			}

			// row[col] is still the previous row (delete); row[col-1] is
			// already the current row (insert).
			best := min(min(row[col]+1, row[col-1]+1), cost)

			diag = row[col]
			row[col] = best
		}
	}

	return row[len(row)-1]
}

// min returns the smaller of a and b.
func min(a, b int) int {
	if b < a {
		return b
	}
	return a
}

// HasPrefix is analogous to bytes.HasPrefix except the prefix
// can fuzzy match based on the Levenshtein distance threshold
// between s and prefix.
//
// Only the leading part of s is compared: a window holding as many runes as
// prefix. The window is measured in runes rather than bytes because the
// distance itself is computed rune by rune; cutting s at a byte offset would
// compare windows of different rune counts (or split a rune) whenever either
// input contains multi-byte UTF-8.
//
// It returns the distance between that window and prefix, and whether the
// distance is within threshold. If s holds fewer runes than prefix it returns
// (0, false). An exact byte-wise prefix always returns (0, true), whatever
// the threshold.
func HasPrefix(s, prefix []byte, threshold int) (int, bool) {
	// Fast path: an exact prefix needs no distance computation.
	if bytes.HasPrefix(s, prefix) {
		return 0, true
	}

	// Number of runes the window must hold.
	want := 0
	for range string(prefix) {
		want++
	}

	// Find the byte offset in s just past its first want runes.
	end, seen := len(s), 0
	for i := range string(s) {
		if seen == want {
			end = i
			break
		}
		seen++
	}
	if seen < want {
		// s is shorter than prefix, so it cannot carry it as a prefix.
		return 0, false
	}

	d := newrun(s[:end]).dist(prefix)
	return d, d <= threshold
}
87 changes: 87 additions & 0 deletions pkg/levenshtein/levenshtein_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
package levenshtein

import "testing"

// TestHasPrefix checks both results of HasPrefix: whether the prefix is
// accepted under the given threshold, and the distance that is reported.
// Besides equal-length inputs it covers s being longer and shorter than
// prefix.
func TestHasPrefix(t *testing.T) {
	type args struct {
		s         []byte
		prefix    []byte
		threshold int
	}
	tests := []struct {
		name     string
		args     args
		wantDist int
		want     bool
	}{
		{
			name: "exact",
			args: args{
				s:         []byte("exact"),
				prefix:    []byte("exact"),
				threshold: 1,
			},
			wantDist: 0,
			want:     true,
		},
		{
			name: "depth 1, same len",
			args: args{
				s:         []byte("exect"),
				prefix:    []byte("exact"),
				threshold: 1,
			},
			wantDist: 1,
			want:     true,
		},
		{
			name: "depth 2, same len, threshold 1",
			args: args{
				s:         []byte("exeet"),
				prefix:    []byte("exact"),
				threshold: 1,
			},
			wantDist: 2,
			want:     false,
		},
		{
			name: "depth 3, same len, threshold 3",
			args: args{
				s:         []byte("eeeet"),
				prefix:    []byte("exact"),
				threshold: 3,
			},
			wantDist: 3,
			want:     true,
		},
		{
			name: "short string, depth 1, same len, threshold 1",
			args: args{
				s:         []byte("eea"),
				prefix:    []byte("exa"),
				threshold: 1,
			},
			wantDist: 1,
			want:     true,
		},
		{
			name: "depth 1, same len, threshold 1",
			args: args{
				s:         []byte("greee"),
				prefix:    []byte("greek"),
				threshold: 1,
			},
			wantDist: 1,
			want:     true,
		},
		{
			name: "equal strings, same len, threshold 0",
			args: args{
				s:         []byte("equal"),
				prefix:    []byte("equal"),
				threshold: 0,
			},
			wantDist: 0,
			want:     true,
		},
		{
			name: "exact prefix of longer s, threshold 0",
			args: args{
				s:         []byte("exactly"),
				prefix:    []byte("exact"),
				threshold: 0,
			},
			wantDist: 0,
			want:     true,
		},
		{
			name: "depth 1, s longer than prefix, threshold 1",
			args: args{
				s:         []byte("exectly"),
				prefix:    []byte("exact"),
				threshold: 1,
			},
			wantDist: 1,
			want:     true,
		},
		{
			name: "s shorter than prefix never matches",
			args: args{
				s:         []byte("exa"),
				prefix:    []byte("exact"),
				threshold: 5,
			},
			wantDist: 0,
			want:     false,
		},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			dist, got := HasPrefix(tt.args.s, tt.args.prefix, tt.args.threshold)
			if got != tt.want {
				t.Errorf("HasPrefix() = %v, want %v", got, tt.want)
			}
			if dist != tt.wantDist {
				t.Errorf("HasPrefix() distance = %d, want %d", dist, tt.wantDist)
			}
		})
	}
}