sergi · schroederc · Mar 31, 2023
diff --git a/diffmatchpatch/diff.go b/diffmatchpatch/diff.go
@@ -22,6 +22,9 @@ import (
 	"unicode/utf8"
 )
 
+// LineMap maps the rune that stands in for a line (its "hash") to that line's text.
+type LineMap map[rune]string
+
 // Operation defines the operation of a diff item.
 type Operation int8
 
@@ -34,8 +37,6 @@ const (
 	DiffInsert Operation = 1
 	// DiffEqual item represents an equal diff.
 	DiffEqual Operation = 0
-	//IndexSeparator is used to seperate the array indexes in an index string
-	IndexSeparator = ","
 )
 
 // Diff represents one diff operation
@@ -83,12 +84,16 @@ func splice(slice []Diff, index int, amount int, elements ...Diff) []Diff {
 
 // DiffMain finds the differences between two texts.
 // If an invalid UTF-8 sequence is encountered, it will be replaced by the Unicode replacement character.
+//
+// Note: if checklines is true, the unique-line limit documented on DiffLinesToChars applies.
 func (dmp *DiffMatchPatch) DiffMain(text1, text2 string, checklines bool) []Diff {
 	return dmp.DiffMainRunes([]rune(text1), []rune(text2), checklines)
 }
 
 // DiffMainRunes finds the differences between two rune sequences.
 // If an invalid UTF-8 sequence is encountered, it will be replaced by the Unicode replacement character.
+//
+// Note: if checklines is true, the unique-line limit documented on DiffLinesToRunes applies.
 func (dmp *DiffMatchPatch) DiffMainRunes(text1, text2 []rune, checklines bool) []Diff {
 	var deadline time.Time
 	if dmp.DiffTimeout > 0 {
@@ -391,29 +396,34 @@ func (dmp *DiffMatchPatch) diffBisectSplit(runes1, runes2 []rune, x, y int,
 
 // DiffLinesToChars splits two texts into a list of strings, and educes the texts to a string of hashes where each Unicode character represents one line.
 // It's slightly faster to call DiffLinesToRunes first, followed by DiffMainRunes.
-func (dmp *DiffMatchPatch) DiffLinesToChars(text1, text2 string) (string, string, []string) {
-	chars1, chars2, lineArray := dmp.diffLinesToStrings(text1, text2)
-	return chars1, chars2, lineArray
+//
+// Note: each unique line is encoded as one rune, so at most 1,112,063 unique
+// lines (the valid code points, excluding NUL and the surrogate range) can be
+// handled; exceeding that limit panics.
+func (dmp *DiffMatchPatch) DiffLinesToChars(text1, text2 string) (string, string, LineMap) {
+	chars1, chars2, lineMap := dmp.diffLinesToStrings(text1, text2)
+	return chars1, chars2, lineMap
 }
 
 // DiffLinesToRunes splits two texts into a list of runes.
-func (dmp *DiffMatchPatch) DiffLinesToRunes(text1, text2 string) ([]rune, []rune, []string) {
-	chars1, chars2, lineArray := dmp.diffLinesToStrings(text1, text2)
-	return []rune(chars1), []rune(chars2), lineArray
+//
+// Note: each unique line is encoded as one rune, so at most 1,112,063 unique
+// lines (the valid code points, excluding NUL and the surrogate range) can be
+// handled; exceeding that limit panics.
+func (dmp *DiffMatchPatch) DiffLinesToRunes(text1, text2 string) ([]rune, []rune, LineMap) {
+	chars1, chars2, lineMap := dmp.diffLinesToStrings(text1, text2)
+	return []rune(chars1), []rune(chars2), lineMap
 }
 
 // DiffCharsToLines rehydrates the text in a diff from a string of line hashes to real lines of text.
-func (dmp *DiffMatchPatch) DiffCharsToLines(diffs []Diff, lineArray []string) []Diff {
+func (dmp *DiffMatchPatch) DiffCharsToLines(diffs []Diff, lineMap LineMap) []Diff {
 	hydrated := make([]Diff, 0, len(diffs))
 	for _, aDiff := range diffs {
-		chars := strings.Split(aDiff.Text, IndexSeparator)
-		text := make([]string, len(chars))
+		runes := []rune(aDiff.Text)
+		text := make([]string, len(runes))
 
-		for i, r := range chars {
-			i1, err := strconv.Atoi(r)
-			if err == nil {
-				text[i] = lineArray[i1]
-			}
+		for i, r := range runes {
+			text[i] = lineMap[r]
 		}
 
 		aDiff.Text = strings.Join(text, "")
@@ -1309,24 +1319,29 @@ func (dmp *DiffMatchPatch) DiffFromDelta(text1 string, delta string) (diffs []Di
 }
 
 // diffLinesToStrings splits two texts into a list of strings. Each string represents one line.
-func (dmp *DiffMatchPatch) diffLinesToStrings(text1, text2 string) (string, string, []string) {
-	// '\x00' is a valid character, but various debuggers don't like it. So we'll insert a junk entry to avoid generating a null character.
-	lineArray := []string{""} // e.g. lineArray[4] == 'Hello\n'
+func (dmp *DiffMatchPatch) diffLinesToStrings(text1, text2 string) (string, string, LineMap) {
+	lineMap := LineMap{} // e.g. lineMap[4] == 'Hello\n'
 
-	lineHash := make(map[string]int)
-	//Each string has the index of lineArray which it points to
-	strIndexArray1 := dmp.diffLinesToStringsMunge(text1, &lineArray, lineHash)
-	strIndexArray2 := dmp.diffLinesToStringsMunge(text2, &lineArray, lineHash)
+	lineHash := make(map[string]rune)
+	// Each rune in runes1 and runes2 is the lineMap key of the line it stands for.
+	runes1 := dmp.diffLinesToRunesMunge(text1, lineMap, lineHash)
+	runes2 := dmp.diffLinesToRunesMunge(text2, lineMap, lineHash)
 
-	return intArrayToString(strIndexArray1), intArrayToString(strIndexArray2), lineArray
+	return string(runes1), string(runes2), lineMap
 }
 
-// diffLinesToStringsMunge splits a text into an array of strings, and reduces the texts to a []string.
-func (dmp *DiffMatchPatch) diffLinesToStringsMunge(text string, lineArray *[]string, lineHash map[string]int) []uint32 {
+// Code points in the surrogate range (U+D800 to U+DFFF) cannot be encoded as UTF-8, so they are never used as line hashes.
+const (
+	surrogateMin = 0xD800
+	surrogateMax = 0xDFFF
+)
+
+// diffLinesToRunesMunge splits text into lines and returns one rune per line, recording each previously unseen line in lineMap and lineHash.
+func (dmp *DiffMatchPatch) diffLinesToRunesMunge(text string, lineMap LineMap, lineHash map[string]rune) []rune {
 	// Walk the text, pulling out a substring for each line. text.split('\n') would would temporarily double our memory footprint. Modifying text would create many large strings to garbage collect.
 	lineStart := 0
 	lineEnd := -1
-	strs := []uint32{}
+	var strs []rune
 
 	for lineEnd < len(text)-1 {
 		lineEnd = indexOf(text, "\n", lineStart)
@@ -1340,11 +1355,19 @@ func (dmp *DiffMatchPatch) diffLinesToStringsMunge(text string, lineArray *[]str
 		lineValue, ok := lineHash[line]
 
 		if ok {
-			strs = append(strs, uint32(lineValue))
+			strs = append(strs, lineValue)
 		} else {
-			*lineArray = append(*lineArray, line)
-			lineHash[line] = len(*lineArray) - 1
-			strs = append(strs, uint32(len(*lineArray)-1))
+			nextRune := rune(len(lineMap) + 1)
+			if nextRune >= surrogateMin {
+				// Jump past the surrogate range, which cannot be encoded as UTF-8.
+				nextRune += surrogateMax - surrogateMin + 1
+			}
+			if nextRune > utf8.MaxRune {
+				panic("too many unique lines to use rune hashing")
+			}
+			lineMap[nextRune] = line
+			lineHash[line] = nextRune
+			strs = append(strs, nextRune)
 		}
 	}
 

diff --git a/diffmatchpatch/diff_test.go b/diffmatchpatch/diff_test.go
@@ -308,18 +308,16 @@ func TestDiffLinesToChars(t *testing.T) {
 
 		ExpectedChars1 string
 		ExpectedChars2 string
-		ExpectedLines  []string
+		ExpectedLines  LineMap
 	}
 
 	dmp := New()
 
 	for i, tc := range []TestCase{
-		{"", "alpha\r\nbeta\r\n\r\n\r\n", "", "1,2,3,3", []string{"", "alpha\r\n", "beta\r\n", "\r\n"}},
-		{"a", "b", "1", "2", []string{"", "a", "b"}},
+		{"", "alpha\r\nbeta\r\n\r\n\r\n", "", "\u0001\u0002\u0003\u0003", map[rune]string{1: "alpha\r\n", 2: "beta\r\n", 3: "\r\n"}},
+		{"a", "b", "\u0001", "\u0002", map[rune]string{1: "a", 2: "b"}},
 		// Omit final newline.
-		{"alpha\nbeta\nalpha", "", "1,2,3", "", []string{"", "alpha\n", "beta\n", "alpha"}},
-		// Same lines in Text1 and Text2
-		{"abc\ndefg\n12345\n", "abc\ndef\n12345\n678", "1,2,3", "1,4,3,5", []string{"", "abc\n", "defg\n", "12345\n", "def\n", "678"}},
+		{"alpha\nbeta\nalpha", "", "\u0001\u0002\u0003", "", map[rune]string{1: "alpha\n", 2: "beta\n", 3: "alpha"}},
 	} {
 		actualChars1, actualChars2, actualLines := dmp.DiffLinesToChars(tc.Text1, tc.Text2)
 		assert.Equal(t, tc.ExpectedChars1, actualChars1, fmt.Sprintf("Test case #%d, %#v", i, tc))
@@ -329,28 +327,28 @@ func TestDiffLinesToChars(t *testing.T) {
 
 	// More than 256 to reveal any 8-bit limitations.
 	n := 300
-	lineList := []string{
-		"", // Account for the initial empty element of the lines array.
-	}
-	var charList []string
+	var lines []string
+	lineMap := LineMap{}
+	var charList []rune
 	for x := 1; x < n+1; x++ {
-		lineList = append(lineList, strconv.Itoa(x)+"\n")
-		charList = append(charList, strconv.Itoa(x))
+		line := strconv.Itoa(x) + "\n"
+		lines = append(lines, line)
+		lineMap[rune(x)] = line
+		charList = append(charList, rune(x))
 	}
-	lines := strings.Join(lineList, "")
-	chars := strings.Join(charList[:], ",")
-	assert.Equal(t, n, len(strings.Split(chars, ",")))
+	chars := string(charList)
+	assert.Equal(t, n, utf8.RuneCountInString(chars))
 
-	actualChars1, actualChars2, actualLines := dmp.DiffLinesToChars(lines, "")
+	actualChars1, actualChars2, actualLines := dmp.DiffLinesToChars(strings.Join(lines, ""), "")
 	assert.Equal(t, chars, actualChars1)
 	assert.Equal(t, "", actualChars2)
-	assert.Equal(t, lineList, actualLines)
+	assert.Equal(t, lineMap, actualLines)
 }
 
 func TestDiffCharsToLines(t *testing.T) {
 	type TestCase struct {
 		Diffs []Diff
-		Lines []string
+		Lines map[rune]string
 
 		Expected []Diff
 	}
@@ -360,10 +358,10 @@ func TestDiffCharsToLines(t *testing.T) {
 	for i, tc := range []TestCase{
 		{
 			Diffs: []Diff{
-				{DiffEqual, "1,2,1"},
-				{DiffInsert, "2,1,2"},
+				{DiffEqual, "\u0001\u0002\u0001"},
+				{DiffInsert, "\u0002\u0001\u0002"},
 			},
-			Lines: []string{"", "alpha\n", "beta\n"},
+			Lines: map[rune]string{1: "alpha\n", 2: "beta\n"},
 
 			Expected: []Diff{
 				{DiffEqual, "alpha\nbeta\nalpha\n"},
@@ -377,19 +375,19 @@ func TestDiffCharsToLines(t *testing.T) {
 
 	// More than 256 to reveal any 8-bit limitations.
 	n := 300
-	lineList := []string{
-		"", // Account for the initial empty element of the lines array.
-	}
-	charList := []string{}
+	var lines []string
+	lineMap := LineMap{}
+	charList := []rune{}
 	for x := 1; x <= n; x++ {
-		lineList = append(lineList, strconv.Itoa(x)+"\n")
-		charList = append(charList, strconv.Itoa(x))
+		line := strconv.Itoa(x) + "\n"
+		lines = append(lines, line)
+		lineMap[rune(x)] = line
+		charList = append(charList, rune(x))
 	}
 	assert.Equal(t, n, len(charList))
-	chars := strings.Join(charList[:], ",")
 
-	actual := dmp.DiffCharsToLines([]Diff{Diff{DiffDelete, chars}}, lineList)
-	assert.Equal(t, []Diff{Diff{DiffDelete, strings.Join(lineList, "")}}, actual)
+	actual := dmp.DiffCharsToLines([]Diff{Diff{DiffDelete, string(charList)}}, lineMap)
+	assert.Equal(t, []Diff{Diff{DiffDelete, strings.Join(lines, "")}}, actual)
 }
 
 func TestDiffCleanupMerge(t *testing.T) {
@@ -1531,3 +1529,86 @@ func BenchmarkDiffMainRunesLargeDiffLines(b *testing.B) {
 		diffs = dmp.DiffCharsToLines(diffs, linearray)
 	}
 }
+
+func TestLineDiff(t *testing.T) {
+	t.Run("VeryLarge", func(t *testing.T) {
+		var beforeBuf, afterBuf bytes.Buffer
+
+		for i := 0; i <= surrogateMax+1; i++ {
+			beforeBuf.WriteString(fmt.Sprintf("%d\n", i))
+			afterBuf.WriteString(fmt.Sprintf("%d\n", i/2))
+		}
+
+		before, after := beforeBuf.String(), afterBuf.String()
+
+		diff := New().DiffMain(before, after, true)
+		checkDiffText(t, before, after, diff)
+	})
+
+	t.Run("Chars", func(t *testing.T) {
+		before := `1
+2
+3
+4
+5
+6
+7
+8
+9
+`
+		after := `10
+`
+
+		dmp := New()
+		txt1, txt2, lines := dmp.DiffLinesToChars(string(before), string(after))
+		diff := dmp.DiffMain(txt1, txt2, false)
+		diff = dmp.DiffCharsToLines(diff, lines)
+
+		checkDiffText(t, before, after, diff)
+	})
+
+	t.Run("Runes", func(t *testing.T) {
+		before := `1
+2
+3
+4
+5
+6
+7
+8
+9
+`
+		after := `10
+`
+
+		dmp := New()
+		txt1, txt2, lines := dmp.DiffLinesToRunes(string(before), string(after))
+		diff := dmp.DiffMainRunes(txt1, txt2, false)
+		diff = dmp.DiffCharsToLines(diff, lines)
+
+		checkDiffText(t, before, after, diff)
+	})
+}
+
+func checkDiffText(t *testing.T, before, after string, diff []Diff) {
+	t.Helper()
+	var foundBefore, foundAfter string
+	for _, d := range diff {
+		switch d.Type {
+		case DiffEqual:
+			foundBefore += d.Text
+			foundAfter += d.Text
+		case DiffDelete:
+			foundBefore += d.Text
+		case DiffInsert:
+			foundAfter += d.Text
+		}
+	}
+
+	if foundBefore != before {
+		t.Errorf("Expected before %q; found %q", before, foundBefore)
+	}
+	if foundAfter != after {
+		t.Errorf("Expected after %q; found %q", after, foundAfter)
+	}
+}
diff --git a/diffmatchpatch/stringutil.go b/diffmatchpatch/stringutil.go
@@ -9,7 +9,6 @@
 package diffmatchpatch
 
 import (
-	"strconv"
 	"strings"
 	"unicode/utf8"
 )
@@ -87,20 +86,3 @@ func runesIndex(r1, r2 []rune) int {
 	}
 	return -1
 }
-
-func intArrayToString(ns []uint32) string {
-	if len(ns) == 0 {
-		return ""
-	}
-
-	indexSeparator := IndexSeparator[0]
-
-	// Appr. 3 chars per num plus the comma.
-	b := []byte{}
-	for _, n := range ns {
-		b = strconv.AppendInt(b, int64(n), 10)
-		b = append(b, indexSeparator)
-	}
-	b = b[:len(b)-1]
-	return string(b)
-}