Skip to content

Commit

Permalink
Fix Levenshtein distance calculation with match function. (#4545)
Browse files Browse the repository at this point in the history
Fixes #4494.

We try to be smart about calculating the Levenshtein distance by short-circuiting the calculation based on the max-allowed distance, but that ends up returning incorrect distances.

* Remove max argument from levenshteinDistance.
* Update tests.
  • Loading branch information
danielmai authored Jan 14, 2020
1 parent 5aa3a12 commit ef2757c
Show file tree
Hide file tree
Showing 3 changed files with 23 additions and 21 deletions.
15 changes: 11 additions & 4 deletions systest/queries_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -600,7 +600,7 @@ func FuzzyMatch(t *testing.T, c *dgo.Dgraph) {
in, out, failure string
}{
{
in: `{q(func:match(term, drive, 8)) {term}}`,
in: `{q(func:match(term, drive, 0)) {term}}`,
out: `{"q":[{"term":"drive"}]}`,
},
{
Expand Down Expand Up @@ -661,12 +661,19 @@ func FuzzyMatch(t *testing.T, c *dgo.Dgraph) {
{
in: `{q(func:match(term, "carigeway", 8)) {term}}`,
out: `{"q":[
{"term": "dual carriageway"}
{"term": "highway"},
{"term": "motorway"},
{"term": "dual carriageway"},
{"term": "pathway"},
{"term": "parkway"}
]}`,
},
{
in: `{q(func:match(term, "carigeway", 4)) {term}}`,
out: `{"q":[]}`,
in: `{q(func:match(term, "carigeway", 4)) {term}}`,
out: `{"q":[
{"term": "highway"},
{"term": "parkway"}
]}`,
},
{
in: `{q(func:match(term, "dualway", 8)) {term}}`,
Expand Down
11 changes: 2 additions & 9 deletions worker/match.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ import (
// This implemention is optimized to use O(min(m,n)) space and is based on the
// optimized C version found here:
// http://en.wikibooks.org/wiki/Algorithm_implementation/Strings/Levenshtein_distance#C
func levenshteinDistance(s, t string, max int) int {
func levenshteinDistance(s, t string) int {
if len(s) > len(t) {
s, t = t, s
}
Expand All @@ -43,7 +43,6 @@ func levenshteinDistance(s, t string, max int) int {
column[y] = y
}

var minIdx int
for x := 1; x <= len(r2); x++ {
column[0] = x

Expand All @@ -56,12 +55,6 @@ func levenshteinDistance(s, t string, max int) int {
column[y] = min(column[y]+1, column[y-1]+1, lastDiag+cost)
lastDiag = oldDiag
}
if minIdx < len(r1) && column[minIdx] > column[minIdx+1] {
minIdx++
}
if column[minIdx] > max {
return column[minIdx]
}
}
return column[len(r1)]
}
Expand All @@ -81,7 +74,7 @@ func matchFuzzy(query, val string, max int) bool {
if val == "" {
return false
}
return levenshteinDistance(val, query, max) <= max
return levenshteinDistance(val, query) <= max
}

// uidsForMatch collects a list of uids that "might" match a fuzzy term based on the ngram
Expand Down
18 changes: 10 additions & 8 deletions worker/match_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,14 @@ import (
)

func TestDistance(t *testing.T) {
require.Equal(t, 0, levenshteinDistance("detour", "detour", 2))
require.Equal(t, 1, levenshteinDistance("detour", "det.our", 2))
require.Equal(t, 2, levenshteinDistance("detour", "det..our", 2))
require.Equal(t, 3, levenshteinDistance("detour", "..det..our", 2))
require.Equal(t, 2, levenshteinDistance("detour", "detour..", 2))
require.Equal(t, 3, levenshteinDistance("detour", "detour...", 2))
require.Equal(t, 3, levenshteinDistance("detour", "...detour", 2))
require.Equal(t, 3, levenshteinDistance("detour", "..detour.", 2))
require.Equal(t, 0, levenshteinDistance("detour", "detour"))
require.Equal(t, 1, levenshteinDistance("detour", "det.our"))
require.Equal(t, 2, levenshteinDistance("detour", "det..our"))
require.Equal(t, 4, levenshteinDistance("detour", "..det..our"))
require.Equal(t, 2, levenshteinDistance("detour", "detour.."))
require.Equal(t, 3, levenshteinDistance("detour", "detour..."))
require.Equal(t, 3, levenshteinDistance("detour", "...detour"))
require.Equal(t, 3, levenshteinDistance("detour", "..detour."))
require.Equal(t, 1, levenshteinDistance("detour", "detoar"))
require.Equal(t, 6, levenshteinDistance("detour", "DETOUR"))
}

0 comments on commit ef2757c

Please sign in to comment.