diff --git a/diffmatchpatch/diff.go b/diffmatchpatch/diff.go index 2a9f2dc..b975bb8 100644 --- a/diffmatchpatch/diff.go +++ b/diffmatchpatch/diff.go @@ -34,8 +34,6 @@ const ( DiffInsert Operation = 1 // DiffEqual item represents an equal diff. DiffEqual Operation = 0 - //IndexSeparator is used to seperate the array indexes in an index string - IndexSeparator = "," ) // Diff represents one diff operation @@ -195,7 +193,7 @@ func (dmp *DiffMatchPatch) diffCompute(text1, text2 []rune, checklines bool, dea // diffLineMode does a quick line-level diff on both []runes, then rediff the parts for greater accuracy. This speedup can produce non-minimal diffs. func (dmp *DiffMatchPatch) diffLineMode(text1, text2 []rune, deadline time.Time) []Diff { // Scan the text on a line-by-line basis first. - text1, text2, linearray := dmp.DiffLinesToRunes(string(text1), string(text2)) + text1, text2, linearray := dmp.diffLinesToRunes(text1, text2) diffs := dmp.diffMainRunes(text1, text2, false, deadline) @@ -392,28 +390,88 @@ func (dmp *DiffMatchPatch) diffBisectSplit(runes1, runes2 []rune, x, y int, // DiffLinesToChars splits two texts into a list of strings, and educes the texts to a string of hashes where each Unicode character represents one line. // It's slightly faster to call DiffLinesToRunes first, followed by DiffMainRunes. func (dmp *DiffMatchPatch) DiffLinesToChars(text1, text2 string) (string, string, []string) { - chars1, chars2, lineArray := dmp.diffLinesToStrings(text1, text2) - return chars1, chars2, lineArray + chars1, chars2, lineArray := dmp.DiffLinesToRunes(text1, text2) + return string(chars1), string(chars2), lineArray } -// DiffLinesToRunes splits two texts into a list of runes. +// DiffLinesToRunes splits two texts into a list of runes. Each rune represents one line. func (dmp *DiffMatchPatch) DiffLinesToRunes(text1, text2 string) ([]rune, []rune, []string) { - chars1, chars2, lineArray := dmp.diffLinesToStrings(text1, text2) - return []rune(chars1), []rune(chars2), lineArray + // '\x00' is a valid character, but various debuggers don't like it. So we'll insert a junk entry to avoid generating a null character. + lineArray := []string{""} // e.g. lineArray[4] == 'Hello\n' + lineHash := map[string]int{} // e.g. lineHash['Hello\n'] == 4 + + chars1 := dmp.diffLinesToRunesMunge(text1, &lineArray, lineHash) + chars2 := dmp.diffLinesToRunesMunge(text2, &lineArray, lineHash) + + return chars1, chars2, lineArray +} + +func (dmp *DiffMatchPatch) diffLinesToRunes(text1, text2 []rune) ([]rune, []rune, []string) { + return dmp.DiffLinesToRunes(string(text1), string(text2)) +} + +// diffLinesToRunesMunge splits a text into an array of strings, and reduces the texts to a []rune where each Unicode character represents one line. +// We use strings instead of []runes as input mainly because you can't use []rune as a map key. +func (dmp *DiffMatchPatch) diffLinesToRunesMunge(text string, lineArray *[]string, lineHash map[string]int) []rune { + // Walk the text, pulling out a substring for each line. text.split('\n') would would temporarily double our memory footprint. Modifying text would create many large strings to garbage collect. + lineStart := 0 + lineEnd := -1 + runes := []rune{} + + for lineEnd < len(text)-1 { + lineEnd = indexOf(text, "\n", lineStart) + + if lineEnd == -1 { + lineEnd = len(text) - 1 + } + + line := text[lineStart : lineEnd+1] + lineStart = lineEnd + 1 + lineValue, ok := lineHash[line] + if !ok { + checkLineArray(lineArray) + + *lineArray = append(*lineArray, line) + lineValue = len(*lineArray) - 1 + lineHash[line] = lineValue + } + runes = append(runes, rune(lineValue)) + } + + return runes +} + +// checkLineArray checks the size of the slice and ensures that the index of the next element +// will be the valid rune. +func checkLineArray(a *[]string) { + // Runes in this range are invalid, utf8.ValidRune() returns false. + const ( + surrogateMin = 0xD800 + surrogateMax = 0xDFFF + ) + + // Check the index of the next element + switch len(*a) { + case surrogateMin: + // Skip invalid runes. + padding := [surrogateMax - surrogateMin + 1]string{} + *a = append(*a, padding[:]...) + + case utf8.MaxRune + 1: + // We can't do anything about it. + panic(fmt.Sprintf("rune can't be more than %d", utf8.MaxRune)) + } } // DiffCharsToLines rehydrates the text in a diff from a string of line hashes to real lines of text. func (dmp *DiffMatchPatch) DiffCharsToLines(diffs []Diff, lineArray []string) []Diff { hydrated := make([]Diff, 0, len(diffs)) for _, aDiff := range diffs { - chars := strings.Split(aDiff.Text, IndexSeparator) + chars := aDiff.Text text := make([]string, len(chars)) for i, r := range chars { - i1, err := strconv.Atoi(r) - if err == nil { - text[i] = lineArray[i1] - } + text[i] = lineArray[r] } aDiff.Text = strings.Join(text, "") @@ -1307,46 +1365,3 @@ func (dmp *DiffMatchPatch) DiffFromDelta(text1 string, delta string) (diffs []Di return diffs, nil } - -// diffLinesToStrings splits two texts into a list of strings. Each string represents one line. -func (dmp *DiffMatchPatch) diffLinesToStrings(text1, text2 string) (string, string, []string) { - // '\x00' is a valid character, but various debuggers don't like it. So we'll insert a junk entry to avoid generating a null character. - lineArray := []string{""} // e.g. lineArray[4] == 'Hello\n' - - //Each string has the index of lineArray which it points to - strIndexArray1 := dmp.diffLinesToStringsMunge(text1, &lineArray) - strIndexArray2 := dmp.diffLinesToStringsMunge(text2, &lineArray) - - return intArrayToString(strIndexArray1), intArrayToString(strIndexArray2), lineArray -} - -// diffLinesToStringsMunge splits a text into an array of strings, and reduces the texts to a []string. -func (dmp *DiffMatchPatch) diffLinesToStringsMunge(text string, lineArray *[]string) []uint32 { - // Walk the text, pulling out a substring for each line. text.split('\n') would would temporarily double our memory footprint. Modifying text would create many large strings to garbage collect. - lineHash := map[string]int{} // e.g. lineHash['Hello\n'] == 4 - lineStart := 0 - lineEnd := -1 - strs := []uint32{} - - for lineEnd < len(text)-1 { - lineEnd = indexOf(text, "\n", lineStart) - - if lineEnd == -1 { - lineEnd = len(text) - 1 - } - - line := text[lineStart : lineEnd+1] - lineStart = lineEnd + 1 - lineValue, ok := lineHash[line] - - if ok { - strs = append(strs, uint32(lineValue)) - } else { - *lineArray = append(*lineArray, line) - lineHash[line] = len(*lineArray) - 1 - strs = append(strs, uint32(len(*lineArray)-1)) - } - } - - return strs -} diff --git a/diffmatchpatch/diff_test.go b/diffmatchpatch/diff_test.go index acb97e3..be84a14 100644 --- a/diffmatchpatch/diff_test.go +++ b/diffmatchpatch/diff_test.go @@ -314,10 +314,10 @@ func TestDiffLinesToChars(t *testing.T) { dmp := New() for i, tc := range []TestCase{ - {"", "alpha\r\nbeta\r\n\r\n\r\n", "", "1,2,3,3", []string{"", "alpha\r\n", "beta\r\n", "\r\n"}}, - {"a", "b", "1", "2", []string{"", "a", "b"}}, + {"", "alpha\r\nbeta\r\n\r\n\r\n", "", "\u0001\u0002\u0003\u0003", []string{"", "alpha\r\n", "beta\r\n", "\r\n"}}, + {"a", "b", "\u0001", "\u0002", []string{"", "a", "b"}}, // Omit final newline. - {"alpha\nbeta\nalpha", "", "1,2,3", "", []string{"", "alpha\n", "beta\n", "alpha"}}, + {"alpha\nbeta\nalpha", "", "\u0001\u0002\u0003", "", []string{"", "alpha\n", "beta\n", "alpha"}}, } { actualChars1, actualChars2, actualLines := dmp.DiffLinesToChars(tc.Text1, tc.Text2) assert.Equal(t, tc.ExpectedChars1, actualChars1, fmt.Sprintf("Test case #%d, %#v", i, tc)) @@ -330,14 +330,14 @@ func TestDiffLinesToChars(t *testing.T) { lineList := []string{ "", // Account for the initial empty element of the lines array. } - var charList []string + var charList []rune for x := 1; x < n+1; x++ { lineList = append(lineList, strconv.Itoa(x)+"\n") - charList = append(charList, strconv.Itoa(x)) + charList = append(charList, rune(x)) } lines := strings.Join(lineList, "") - chars := strings.Join(charList[:], ",") - assert.Equal(t, n, len(strings.Split(chars, ","))) + chars := string(charList) + assert.Equal(t, n, utf8.RuneCountInString(chars)) actualChars1, actualChars2, actualLines := dmp.DiffLinesToChars(lines, "") assert.Equal(t, chars, actualChars1) @@ -358,8 +358,8 @@ func TestDiffCharsToLines(t *testing.T) { for i, tc := range []TestCase{ { Diffs: []Diff{ - {DiffEqual, "1,2,1"}, - {DiffInsert, "2,1,2"}, + {DiffEqual, "\u0001\u0002\u0001"}, + {DiffInsert, "\u0002\u0001\u0002"}, }, Lines: []string{"", "alpha\n", "beta\n"}, @@ -378,15 +378,14 @@ func TestDiffCharsToLines(t *testing.T) { lineList := []string{ "", // Account for the initial empty element of the lines array. } - charList := []string{} + charList := []rune{} for x := 1; x <= n; x++ { lineList = append(lineList, strconv.Itoa(x)+"\n") - charList = append(charList, strconv.Itoa(x)) + charList = append(charList, rune(x)) } assert.Equal(t, n, len(charList)) - chars := strings.Join(charList[:], ",") - actual := dmp.DiffCharsToLines([]Diff{Diff{DiffDelete, chars}}, lineList) + actual := dmp.DiffCharsToLines([]Diff{Diff{DiffDelete, string(charList)}}, lineList) assert.Equal(t, []Diff{Diff{DiffDelete, strings.Join(lineList, "")}}, actual) } @@ -1419,16 +1418,12 @@ func TestDiffMainWithTimeout(t *testing.T) { } func TestDiffMainWithCheckLines(t *testing.T) { + // Test cases must be at least 100 chars long to pass the cutoff. type TestCase struct { Text1 string Text2 string } - - dmp := New() - dmp.DiffTimeout = 0 - - // Test cases must be at least 100 chars long to pass the cutoff. - for i, tc := range []TestCase{ + tests := []TestCase{ { "1234567890\n1234567890\n1234567890\n1234567890\n1234567890\n1234567890\n1234567890\n1234567890\n1234567890\n1234567890\n1234567890\n1234567890\n1234567890\n", "abcdefghij\nabcdefghij\nabcdefghij\nabcdefghij\nabcdefghij\nabcdefghij\nabcdefghij\nabcdefghij\nabcdefghij\nabcdefghij\nabcdefghij\nabcdefghij\nabcdefghij\n", @@ -1441,7 +1436,39 @@ func TestDiffMainWithCheckLines(t *testing.T) { "1234567890\n1234567890\n1234567890\n1234567890\n1234567890\n1234567890\n1234567890\n1234567890\n1234567890\n1234567890\n1234567890\n1234567890\n1234567890\n", "abcdefghij\n1234567890\n1234567890\n1234567890\nabcdefghij\n1234567890\n1234567890\n1234567890\nabcdefghij\n1234567890\n1234567890\n1234567890\nabcdefghij\n", }, - } { + } + // Add test case for the issue #115 + func() { + const before = `package main + +import ( + "fmt" +) + +/* +func upload(c echo.Context) error { + if err := r.ParseForm(); err != nil { + fmt.Fprintf(w, "ParseForm() err: %v", err) + return + } + fmt.Fprintf(w, "POST request successful") + path_ver := r.FormValue("path_ver") + ukclin_ver := r.FormValue("ukclin_ver") + + fmt.Fprintf(w, "Name = %s\n", path_ver) + fmt.Fprintf(w, "Address = %s\n", ukclin_ver) +} +*/ +` + after := strings.ReplaceAll(before, "\n", "\r\n") + + tests = append(tests, TestCase{Text1: before, Text2: after}) + }() + + dmp := New() + dmp.DiffTimeout = 0 + + for i, tc := range tests { resultWithoutCheckLines := dmp.DiffMain(tc.Text1, tc.Text2, false) resultWithCheckLines := dmp.DiffMain(tc.Text1, tc.Text2, true) diff --git a/diffmatchpatch/stringutil.go b/diffmatchpatch/stringutil.go index 44c4359..265f29c 100644 --- a/diffmatchpatch/stringutil.go +++ b/diffmatchpatch/stringutil.go @@ -9,7 +9,6 @@ package diffmatchpatch import ( - "strconv" "strings" "unicode/utf8" ) @@ -87,20 +86,3 @@ func runesIndex(r1, r2 []rune) int { } return -1 } - -func intArrayToString(ns []uint32) string { - if len(ns) == 0 { - return "" - } - - indexSeparator := IndexSeparator[0] - - // Appr. 3 chars per num plus the comma. - b := []byte{} - for _, n := range ns { - b = strconv.AppendInt(b, int64(n), 10) - b = append(b, indexSeparator) - } - b = b[:len(b)-1] - return string(b) -}