Skip to content

Commit

Permalink
[Fixes #2] Fix handling of unicode in ratio functions
Browse files (browse the repository at this point in the history)
  • Loading branch information
paul-mannino committed Jan 27, 2020
1 parent 5ff4e14 commit 54652b1
Show file tree
Hide file tree
Showing 3 changed files with 23 additions and 18 deletions.
20 changes: 11 additions & 9 deletions fuzz.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,15 @@ import (
"math"
"sort"
"strings"
"unicode/utf8"
)

// Ratio computes a score of how close two unicode strings are
// based on their Levenshtein edit distance.
// Returns an integer score [0,100], higher score indicates
// that strings are closer.
func Ratio(s1, s2 string) int {
return int(round(100 * floatRatio(s1, s2)))
return int(round(100 * floatRatio([]rune(s1), []rune(s2))))
}

// PartialRatio computes a score of how close a string is with
Expand All @@ -20,11 +21,12 @@ func Ratio(s1, s2 string) int {
// Returns an integer score [0,100], higher score indicates
// that the string and substring are closer.
func PartialRatio(s1, s2 string) int {
shorter, longer := s1, s2
if len(s1) > len(s2) {
longer, shorter = s1, s2
shorter, longer := []rune(s1), []rune(s2)
if len(shorter) > len(longer) {
longer, shorter = shorter, longer
}
matchingBlocks := getMatchingBlocks(shorter, longer)

bestScore := 0.0
for _, block := range matchingBlocks {
longStart := block.dpos - block.spos
Expand All @@ -35,7 +37,7 @@ func PartialRatio(s1, s2 string) int {
if longEnd > len(longer) {
longEnd = len(longer)
}
longSubStr := string([]rune(longer)[longStart:longEnd])
longSubStr := longer[longStart:longEnd]

r := floatRatio(shorter, longSubStr)
if r > .995 {
Expand All @@ -48,12 +50,12 @@ func PartialRatio(s1, s2 string) int {
return int(round(100 * bestScore))
}

func floatRatio(s1, s2 string) float64 {
lenSum := len(s1) + len(s2)
func floatRatio(chrs1, chrs2 []rune) float64 {
lenSum := len(chrs1) + len(chrs2)
if lenSum == 0 {
return 0.0
}
editDistance := LevEditDistance(s1, s2, 1)
editDistance := optimizedEditDistance(chrs1, chrs2, 1)
return float64(lenSum-editDistance) / float64(lenSum)
}

Expand Down Expand Up @@ -109,7 +111,7 @@ func weightedRatioHelper(s1, s2 string, asciiOnly bool) int {
unbaseScale := .95
partialScale := .9
baseScore := float64(Ratio(c1, c2))
lengthRatio := float64(len(c1)) / float64(len(c2))
lengthRatio := float64(utf8.RuneCountInString(c1)) / float64(utf8.RuneCountInString(c2))
if lengthRatio < 1 {
lengthRatio = 1 / lengthRatio
}
Expand Down
8 changes: 6 additions & 2 deletions fuzz_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,8 @@ var games = []string{
}

var alphanumeric = []string{
"JOHNSMITH6211986", //0
"JOHNSMITH6201986", //1
"JOHNSMITH6211986", //0
"JOHNSMITH6201986", //1
}

var nonascii = []string{
Expand Down Expand Up @@ -65,6 +65,10 @@ func TestPartialRatio(t *testing.T) {
if r5 <= 75 {
t.Errorf("Expected Ratio of '%v' and '%v' to be greater than 75. Got %v", s1, s4, r5)
}

s5, s6 := "栶eeƵ画-ʏĜ橭畏p父«P^艎鹥ʭ攆", "eeǸɁ碳簫S晑=2#父«厄].稍咾靐Ë"
r6 := PartialRatio(s5, s6)
assertRatio(t, "Ratio", s5, s6, 21, r6)
}

func TestTokenSortRatio(t *testing.T) {
Expand Down
13 changes: 6 additions & 7 deletions levenshtein.go
Original file line number Diff line number Diff line change
Expand Up @@ -296,8 +296,7 @@ func shouldContinue(i int, editOps []levEditOp, opIdx int, editType levEditType,
editOps[opIdx].dpos == dpos && editOps[opIdx].spos == spos
}

func getMatchingBlocks(s1, s2 string) []levMatchingBlock {
chrs1, chrs2 := []rune(s1), []rune(s2)
func getMatchingBlocks(chrs1, chrs2 []rune) []levMatchingBlock {
len1, len2 := len(chrs1), len(chrs2)

return getMatchingBlocksHelper(len1, len2, findEditOpsHelper(chrs1, len1, chrs2, len2))
Expand Down Expand Up @@ -504,10 +503,10 @@ func editDistance(chrs1, chrs2 []rune, xcost int) int {
if len2 == 0 {
return len1
}
editMatrix := make([][]int, len1 + 1)

editMatrix := make([][]int, len1+1)
for i := range editMatrix {
editMatrix[i] = make([]int, len2 + 1)
editMatrix[i] = make([]int, len2+1)
}

for i := range editMatrix {
Expand All @@ -529,7 +528,7 @@ func editDistance(chrs1, chrs2 []rune, xcost int) int {
if c1 == c2 {
editMatrix[i+1][j+1] = min(editMatrix[i][j], editMatrix[i+1][j+1])
} else {
editMatrix[i+1][j+1] = min(editMatrix[i][j] + replaceCost, editMatrix[i+1][j+1])
editMatrix[i+1][j+1] = min(editMatrix[i][j]+replaceCost, editMatrix[i+1][j+1])
}
}
}
Expand All @@ -541,4 +540,4 @@ func min(a, b int) int {
return a
}
return b
}
}

0 comments on commit 54652b1

Please sign in to comment.