-
Notifications
You must be signed in to change notification settings - Fork 0
/
ratcliffobershelp.go
76 lines (67 loc) · 1.87 KB
/
ratcliffobershelp.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
package stringmetric
// longestCommonSequence finds the longest contiguous run of bytes that occurs
// in both a and b.
//
// The result is a 3-element array:
//
//	[0] length of the run (0 when a and b share no byte),
//	[1] exclusive end offset of the run in a, i.e. the run is a[[1]-[0]:[1]],
//	[2] exclusive end offset of the run in b, i.e. the run is b[[2]-[0]:[2]].
//
// When several runs share the maximum length, the one met first while scanning
// a in the outer loop and b in the inner loop is reported.
func longestCommonSequence(a []byte, b []byte) [3]int {
	var lrc [3]int
	if len(a) == 0 || len(b) == 0 {
		return lrc
	}

	// Each cell of the DP table only depends on the diagonal neighbour in the
	// previous row, so two rolling rows replace the full len(a) x len(b) matrix.
	// Index 0 of both rows is never written and acts as the zero border.
	prev := make([]int, len(b)+1)
	cur := make([]int, len(b)+1)

	for i, vA := range a {
		for j, vB := range b {
			if vA != vB {
				// The row buffer is reused, so stale values must be cleared.
				cur[j+1] = 0
				continue
			}
			l := prev[j] + 1
			cur[j+1] = l
			// Strict comparison keeps the first run of maximal length.
			if l > lrc[0] {
				lrc = [3]int{l, i + 1, j + 1}
			}
		}
		prev, cur = cur, prev
	}
	return lrc
}
// commonSequences collects the common runs of a and b recursively: the longest
// common run comes first, followed by the runs found in the parts to the left
// of it, then the runs found in the parts to the right of it.
//
// For instance, "test1okay" and "test2okax" yield "test" and "oka".
// An empty, non-nil slice is returned when a and b share no byte.
// The returned runs are sub-slices of a, not copies.
func commonSequences(a []byte, b []byte) [][]byte {
	match := longestCommonSequence(a, b)
	length, endA, endB := match[0], match[1], match[2]
	if length == 0 {
		return [][]byte{}
	}

	startA := endA - length
	startB := endB - length

	// NOTE(review): takeRight is defined elsewhere; it is assumed to return the
	// trailing n bytes of its argument - confirm against its definition.
	leftA, rightA := a[:startA], takeRight(a, len(a)-endA)
	leftB, rightB := b[:startB], takeRight(b, len(b)-endB)

	found := [][]byte{a[startA:endA]}
	found = append(found, commonSequences(leftA, leftB)...)
	found = append(found, commonSequences(rightA, rightB)...)
	return found
}
// RatcliffObershelpMetric returns the similarity score of the strings a and b.
// Both strings are converted to byte slices and scored by
// RatcliffObershelpMetric2, so the comparison is byte-wise.
func RatcliffObershelpMetric(a string, b string) float64 {
	left := []byte(a)
	right := []byte(b)
	return RatcliffObershelpMetric2(left, right)
}
// RatcliffObershelpMetric2 returns the similarity score of the byte slices a
// and b.
//
// The score is 0 when either slice is nil or empty, 1 when sameBytes reports
// the slices as equal, and otherwise twice the total length of the runs
// returned by commonSequences divided by len(a)+len(b).
func RatcliffObershelpMetric2(a []byte, b []byte) float64 {
	switch {
	case len(a) == 0 || len(b) == 0:
		// len of a nil slice is 0, so this also covers nil input.
		return 0
	case sameBytes(a, b):
		return 1
	}

	matched := 0
	for _, seq := range commonSequences(a, b) {
		matched += len(seq)
	}
	return float64(2*matched) / float64(len(a)+len(b))
}