Skip to content

fix: use indivisible line hashes #141

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
85 changes: 54 additions & 31 deletions diffmatchpatch/diff.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,9 @@ import (
"unicode/utf8"
)

// LineMap is a mapping from a line hash to its text.
//
// Each key is the rune that stands in for one unique line, and the value is
// that line's text, including its line terminator when the line has one.
// DiffLinesToChars and DiffLinesToRunes return a LineMap, and DiffCharsToLines
// uses it to turn hashed diffs back into real text.
type LineMap map[rune]string

// Operation defines the operation of a diff item.
// It is the type of the Diff* constants such as DiffInsert and DiffEqual.
type Operation int8

Expand All @@ -34,8 +37,6 @@ const (
DiffInsert Operation = 1
// DiffEqual item represents an equal diff.
DiffEqual Operation = 0
//IndexSeparator is used to separate the array indexes in an index string
IndexSeparator = ","
)

// Diff represents one diff operation
Expand Down Expand Up @@ -83,12 +84,16 @@ func splice(slice []Diff, index int, amount int, elements ...Diff) []Diff {

// DiffMain computes the differences between two texts given as strings.
// Both inputs are converted to rune slices and handed to DiffMainRunes; any
// invalid UTF-8 sequence is replaced by the Unicode replacement character
// during that conversion.
//
// Note: if checklines is true, the limitation noted in DiffLinesToChars applies
func (dmp *DiffMatchPatch) DiffMain(text1, text2 string, checklines bool) []Diff {
	runes1 := []rune(text1)
	runes2 := []rune(text2)
	return dmp.DiffMainRunes(runes1, runes2, checklines)
}

// DiffMainRunes finds the differences between two rune sequences.
// If an invalid UTF-8 sequence is encountered, it will be replaced by the Unicode replacement character.
//
// Note: if checklines is true, the limitation noted in DiffLinesToRunes applies
func (dmp *DiffMatchPatch) DiffMainRunes(text1, text2 []rune, checklines bool) []Diff {
var deadline time.Time
if dmp.DiffTimeout > 0 {
Expand Down Expand Up @@ -391,29 +396,34 @@ func (dmp *DiffMatchPatch) diffBisectSplit(runes1, runes2 []rune, x, y int,

// DiffLinesToChars splits two texts into a list of strings, and reduces the texts to a string of hashes where each Unicode character represents one line.
// It's slightly faster to call DiffLinesToRunes first, followed by DiffMainRunes.
func (dmp *DiffMatchPatch) DiffLinesToChars(text1, text2 string) (string, string, []string) {
chars1, chars2, lineArray := dmp.diffLinesToStrings(text1, text2)
return chars1, chars2, lineArray
//
// Note: since we hash lines to runes, there is an upper limit to the number of
// unique lines this algorithm can handle. That limit is 1,112,063 unique
// lines.
func (dmp *DiffMatchPatch) DiffLinesToChars(text1, text2 string) (string, string, LineMap) {
chars1, chars2, lineMap := dmp.diffLinesToStrings(text1, text2)
return chars1, chars2, lineMap
}

// DiffLinesToRunes splits two texts into a list of runes.
func (dmp *DiffMatchPatch) DiffLinesToRunes(text1, text2 string) ([]rune, []rune, []string) {
chars1, chars2, lineArray := dmp.diffLinesToStrings(text1, text2)
return []rune(chars1), []rune(chars2), lineArray
//
// Note: since we hash lines to runes, there is an upper limit to the number of
// unique lines this algorithm can handle. That limit is 1,112,063 unique
// lines.
func (dmp *DiffMatchPatch) DiffLinesToRunes(text1, text2 string) ([]rune, []rune, LineMap) {
chars1, chars2, lineMap := dmp.diffLinesToStrings(text1, text2)
return []rune(chars1), []rune(chars2), lineMap
}

// DiffCharsToLines rehydrates the text in a diff from a string of line hashes to real lines of text.
func (dmp *DiffMatchPatch) DiffCharsToLines(diffs []Diff, lineArray []string) []Diff {
func (dmp *DiffMatchPatch) DiffCharsToLines(diffs []Diff, lineMap LineMap) []Diff {
hydrated := make([]Diff, 0, len(diffs))
for _, aDiff := range diffs {
chars := strings.Split(aDiff.Text, IndexSeparator)
text := make([]string, len(chars))
runes := []rune(aDiff.Text)
text := make([]string, len(runes))

for i, r := range chars {
i1, err := strconv.Atoi(r)
if err == nil {
text[i] = lineArray[i1]
}
for i, r := range runes {
text[i] = lineMap[r]
}

aDiff.Text = strings.Join(text, "")
Expand Down Expand Up @@ -1309,24 +1319,29 @@ func (dmp *DiffMatchPatch) DiffFromDelta(text1 string, delta string) (diffs []Di
}

// diffLinesToStrings splits two texts into a list of strings. Each string represents one line.
func (dmp *DiffMatchPatch) diffLinesToStrings(text1, text2 string) (string, string, []string) {
// '\x00' is a valid character, but various debuggers don't like it. So we'll insert a junk entry to avoid generating a null character.
lineArray := []string{""} // e.g. lineArray[4] == 'Hello\n'
func (dmp *DiffMatchPatch) diffLinesToStrings(text1, text2 string) (string, string, LineMap) {
lineMap := LineMap{} // e.g. lineMap[4] == 'Hello\n'

lineHash := make(map[string]int)
//Each string has the index of lineArray which it points to
strIndexArray1 := dmp.diffLinesToStringsMunge(text1, &lineArray, lineHash)
strIndexArray2 := dmp.diffLinesToStringsMunge(text2, &lineArray, lineHash)
lineHash := make(map[string]rune)
//Each string has the index of lineMap which it points to
runes1 := dmp.diffLinesToRunesMunge(text1, lineMap, lineHash)
runes2 := dmp.diffLinesToRunesMunge(text2, lineMap, lineHash)

return intArrayToString(strIndexArray1), intArrayToString(strIndexArray2), lineArray
return string(runes1), string(runes2), lineMap
}

// diffLinesToStringsMunge splits a text into an array of strings, and reduces the texts to a []string.
func (dmp *DiffMatchPatch) diffLinesToStringsMunge(text string, lineArray *[]string, lineHash map[string]int) []uint32 {
// Code points in the surrogate range are not valid for UTF-8.
// diffLinesToRunesMunge skips this range when assigning a rune to each line.
const (
	surrogateMin = 0xD800 // first code point of the surrogate range
	surrogateMax = 0xDFFF // last code point of the surrogate range (inclusive)
)

// diffLinesToRunesMunge splits a text into lines and reduces it to a slice of runes, one rune per line, recording each previously unseen line in lineMap and lineHash.
func (dmp *DiffMatchPatch) diffLinesToRunesMunge(text string, lineMap LineMap, lineHash map[string]rune) []rune {
// Walk the text, pulling out a substring for each line. text.split('\n') would temporarily double our memory footprint. Modifying text would create many large strings to garbage collect.
lineStart := 0
lineEnd := -1
strs := []uint32{}
var strs []rune

for lineEnd < len(text)-1 {
lineEnd = indexOf(text, "\n", lineStart)
Expand All @@ -1340,11 +1355,19 @@ func (dmp *DiffMatchPatch) diffLinesToStringsMunge(text string, lineArray *[]str
lineValue, ok := lineHash[line]

if ok {
strs = append(strs, uint32(lineValue))
strs = append(strs, lineValue)
} else {
*lineArray = append(*lineArray, line)
lineHash[line] = len(*lineArray) - 1
strs = append(strs, uint32(len(*lineArray)-1))
nextRune := rune(len(lineMap) + 1)
if nextRune >= surrogateMin {
// Skip invalid utf8 runes, if needed.
nextRune += surrogateMax - surrogateMin + 1
}
if nextRune > utf8.MaxRune {
panic("too many unique lines to use rune hashing")
}
lineMap[nextRune] = line
lineHash[line] = nextRune
strs = append(strs, nextRune)
}
}

Expand Down
141 changes: 111 additions & 30 deletions diffmatchpatch/diff_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -308,18 +308,16 @@ func TestDiffLinesToChars(t *testing.T) {

ExpectedChars1 string
ExpectedChars2 string
ExpectedLines []string
ExpectedLines LineMap
}

dmp := New()

for i, tc := range []TestCase{
{"", "alpha\r\nbeta\r\n\r\n\r\n", "", "1,2,3,3", []string{"", "alpha\r\n", "beta\r\n", "\r\n"}},
{"a", "b", "1", "2", []string{"", "a", "b"}},
{"", "alpha\r\nbeta\r\n\r\n\r\n", "", "\u0001\u0002\u0003\u0003", map[rune]string{1: "alpha\r\n", 2: "beta\r\n", 3: "\r\n"}},
{"a", "b", "\u0001", "\u0002", map[rune]string{1: "a", 2: "b"}},
// Omit final newline.
{"alpha\nbeta\nalpha", "", "1,2,3", "", []string{"", "alpha\n", "beta\n", "alpha"}},
// Same lines in Text1 and Text2
{"abc\ndefg\n12345\n", "abc\ndef\n12345\n678", "1,2,3", "1,4,3,5", []string{"", "abc\n", "defg\n", "12345\n", "def\n", "678"}},
{"alpha\nbeta\nalpha", "", "\u0001\u0002\u0003", "", map[rune]string{1: "alpha\n", 2: "beta\n", 3: "alpha"}},
} {
actualChars1, actualChars2, actualLines := dmp.DiffLinesToChars(tc.Text1, tc.Text2)
assert.Equal(t, tc.ExpectedChars1, actualChars1, fmt.Sprintf("Test case #%d, %#v", i, tc))
Expand All @@ -329,28 +327,28 @@ func TestDiffLinesToChars(t *testing.T) {

// More than 256 to reveal any 8-bit limitations.
n := 300
lineList := []string{
"", // Account for the initial empty element of the lines array.
}
var charList []string
var lines []string
lineMap := LineMap{}
var charList []rune
for x := 1; x < n+1; x++ {
lineList = append(lineList, strconv.Itoa(x)+"\n")
charList = append(charList, strconv.Itoa(x))
line := strconv.Itoa(x) + "\n"
lines = append(lines, line)
lineMap[rune(x)] = line
charList = append(charList, rune(x))
}
lines := strings.Join(lineList, "")
chars := strings.Join(charList[:], ",")
assert.Equal(t, n, len(strings.Split(chars, ",")))
chars := string(charList)
assert.Equal(t, n, utf8.RuneCountInString(chars))

actualChars1, actualChars2, actualLines := dmp.DiffLinesToChars(lines, "")
actualChars1, actualChars2, actualLines := dmp.DiffLinesToChars(strings.Join(lines, ""), "")
assert.Equal(t, chars, actualChars1)
assert.Equal(t, "", actualChars2)
assert.Equal(t, lineList, actualLines)
assert.Equal(t, lineMap, actualLines)
}

func TestDiffCharsToLines(t *testing.T) {
type TestCase struct {
Diffs []Diff
Lines []string
Lines map[rune]string

Expected []Diff
}
Expand All @@ -360,10 +358,10 @@ func TestDiffCharsToLines(t *testing.T) {
for i, tc := range []TestCase{
{
Diffs: []Diff{
{DiffEqual, "1,2,1"},
{DiffInsert, "2,1,2"},
{DiffEqual, "\u0001\u0002\u0001"},
{DiffInsert, "\u0002\u0001\u0002"},
},
Lines: []string{"", "alpha\n", "beta\n"},
Lines: map[rune]string{1: "alpha\n", 2: "beta\n"},

Expected: []Diff{
{DiffEqual, "alpha\nbeta\nalpha\n"},
Expand All @@ -377,19 +375,19 @@ func TestDiffCharsToLines(t *testing.T) {

// More than 256 to reveal any 8-bit limitations.
n := 300
lineList := []string{
"", // Account for the initial empty element of the lines array.
}
charList := []string{}
var lines []string
lineMap := LineMap{}
charList := []rune{}
for x := 1; x <= n; x++ {
lineList = append(lineList, strconv.Itoa(x)+"\n")
charList = append(charList, strconv.Itoa(x))
line := strconv.Itoa(x) + "\n"
lines = append(lines, line)
lineMap[rune(x)] = line
charList = append(charList, rune(x))
}
assert.Equal(t, n, len(charList))
chars := strings.Join(charList[:], ",")

actual := dmp.DiffCharsToLines([]Diff{Diff{DiffDelete, chars}}, lineList)
assert.Equal(t, []Diff{Diff{DiffDelete, strings.Join(lineList, "")}}, actual)
actual := dmp.DiffCharsToLines([]Diff{Diff{DiffDelete, string(charList)}}, lineMap)
assert.Equal(t, []Diff{Diff{DiffDelete, strings.Join(lines, "")}}, actual)
}

func TestDiffCleanupMerge(t *testing.T) {
Expand Down Expand Up @@ -1531,3 +1529,86 @@ func BenchmarkDiffMainRunesLargeDiffLines(b *testing.B) {
diffs = dmp.DiffCharsToLines(diffs, linearray)
}
}

// TestLineDiff exercises line-mode diffing end to end. Every subtest passes
// its resulting diff to checkDiffText, which confirms that the diff rebuilds
// both input texts exactly.
func TestLineDiff(t *testing.T) {
	// Fixture shared by the "Chars" and "Runes" subtests: nine single-digit
	// lines on one side and a single "10" line on the other.
	const (
		nineLines = `1
2
3
4
5
6
7
8
9
`
		oneLine = `10
`
	)

	t.Run("VeryLarge", func(t *testing.T) {
		// The old text holds surrogateMax+2 distinct lines, so the number of
		// unique lines exceeds the surrogate range boundary.
		var oldBuf, newBuf bytes.Buffer
		for n := 0; n <= surrogateMax+1; n++ {
			fmt.Fprintf(&oldBuf, "%d\n", n)
			fmt.Fprintf(&newBuf, "%d\n", n/2)
		}
		oldText, newText := oldBuf.String(), newBuf.String()

		result := New().DiffMain(oldText, newText, true)
		checkDiffText(t, oldText, newText, result)
	})

	t.Run("Chars", func(t *testing.T) {
		d := New()

		// Hash lines to a string, diff the hashes, then rehydrate.
		hashed1, hashed2, lineMap := d.DiffLinesToChars(nineLines, oneLine)
		result := d.DiffMain(hashed1, hashed2, false)
		result = d.DiffCharsToLines(result, lineMap)

		checkDiffText(t, nineLines, oneLine, result)
	})

	t.Run("Runes", func(t *testing.T) {
		d := New()

		// Same round trip as "Chars", but through the rune-slice API.
		hashed1, hashed2, lineMap := d.DiffLinesToRunes(nineLines, oneLine)
		result := d.DiffMainRunes(hashed1, hashed2, false)
		result = d.DiffCharsToLines(result, lineMap)

		checkDiffText(t, nineLines, oneLine, result)
	})
}

// checkDiffText verifies that diff is a faithful edit script from before to
// after. Concatenating the equal and deleted pieces must reproduce before, and
// concatenating the equal and inserted pieces must reproduce after. Each
// mismatch is reported with t.Errorf; t.Helper attributes the failure to the
// calling test.
func checkDiffText(t *testing.T, before, after string, diff []Diff) {
	t.Helper()

	// Builders keep the reconstruction linear in the text size. Repeated
	// string += would re-copy the accumulated text for every diff entry.
	var foundBefore, foundAfter strings.Builder
	for _, d := range diff {
		switch d.Type {
		case DiffEqual:
			foundBefore.WriteString(d.Text)
			foundAfter.WriteString(d.Text)
		case DiffDelete:
			foundBefore.WriteString(d.Text)
		case DiffInsert:
			foundAfter.WriteString(d.Text)
		}
	}

	if got := foundBefore.String(); got != before {
		t.Errorf("Expected before %q; found %q", before, got)
	}
	if got := foundAfter.String(); got != after {
		t.Errorf("Expected after %q; found %q", after, got)
	}
}
18 changes: 0 additions & 18 deletions diffmatchpatch/stringutil.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
package diffmatchpatch

import (
"strconv"
"strings"
"unicode/utf8"
)
Expand Down Expand Up @@ -87,20 +86,3 @@ func runesIndex(r1, r2 []rune) int {
}
return -1
}

// intArrayToString renders ns as base-10 numbers joined by the first byte of
// IndexSeparator, e.g. [1 2 3] becomes "1,2,3". A nil or empty slice yields "".
func intArrayToString(ns []uint32) string {
	if len(ns) == 0 {
		return ""
	}

	indexSeparator := IndexSeparator[0]

	// Reserve approximately 3 digits per number plus the separator up front so
	// typical inputs are built without regrowing the buffer.
	b := make([]byte, 0, len(ns)*4)
	for i, n := range ns {
		// Write the separator between entries only, so nothing needs trimming.
		if i > 0 {
			b = append(b, indexSeparator)
		}
		b = strconv.AppendUint(b, uint64(n), 10)
	}
	return string(b)
}