From c79548ee7c3ea188e6fb5eaf851181bfa2f19053 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=98=9F=E6=98=9F?= Date: Thu, 2 Mar 2023 15:15:23 +0800 Subject: [PATCH 1/4] [fix] str.split can not handle surrogate pair, replaced with Array.from --- src/diff/base.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diff/base.js b/src/diff/base.js index b11e7e52..b2514f10 100644 --- a/src/diff/base.js +++ b/src/diff/base.js @@ -167,7 +167,7 @@ Diff.prototype = { return value; }, tokenize(value) { - return value.split(''); + return Array.from(value); }, join(chars) { return chars.join(''); From d856723a2efb95177183230f674d397f98c399a1 Mon Sep 17 00:00:00 2001 From: Mark Amery Date: Fri, 8 Mar 2024 16:44:17 +0000 Subject: [PATCH 2/4] Add some docs --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 142727ac..72fe9caf 100644 --- a/README.md +++ b/README.md @@ -29,6 +29,8 @@ Broadly, jsdiff's diff functions all take an old text and a new text and perform * `Diff.diffChars(oldStr, newStr[, options])` - diffs two blocks of text, treating each character as a token. + ("Characters" here means Unicode code points - the elements you get when you loop over a string with a `for ... of ...` loop.) + Returns a list of [change objects](#change-objects). Options From 3b96b832f02097444c38eb226cdbbf82f7f85a17 Mon Sep 17 00:00:00 2001 From: Mark Amery Date: Fri, 8 Mar 2024 16:48:53 +0000 Subject: [PATCH 3/4] Add release notes --- release-notes.md | 1 + 1 file changed, 1 insertion(+) diff --git a/release-notes.md b/release-notes.md index 3a27d6e4..08b41a06 100644 --- a/release-notes.md +++ b/release-notes.md @@ -5,6 +5,7 @@ [Commits](https://github.com/kpdecker/jsdiff/compare/master...v6.0.0-staging) - [#435](https://github.com/kpdecker/jsdiff/pull/435) Fix `parsePatch` handling of control characters. 
`parsePatch` used to interpret various unusual control characters - namely vertical tabs, form feeds, lone carriage returns without a line feed, and EBCDIC NELs - as line breaks when parsing a patch file. This was inconsistent with the behavior of both JsDiff's own `diffLines` method and also the Unix `diff` and `patch` utils, which all simply treat those control characters as ordinary characters. The result of this discrepancy was that some well-formed patches - produced either by `diff` or by JsDiff itself and handled properly by the `patch` util - would be wrongly parsed by `parsePatch`, with the effect that it would disregard the remainder of a hunk after encountering one of these control characters. +- [#500](https://github.com/kpdecker/jsdiff/pull/500) **`diffChars` now diffs Unicode code points** instead of UTF-16 code units. - [#439](https://github.com/kpdecker/jsdiff/pull/439) Prefer diffs that order deletions before insertions. When faced with a choice between two diffs with an equal total edit distance, the Myers diff algorithm generally prefers one that does deletions before insertions rather than insertions before deletions. For instance, when diffing `abcd` against `acbd`, it will prefer a diff that says to delete the `b` and then insert a new `b` after the `c`, over a diff that says to insert a `c` before the `b` and then delete the existing `c`. JsDiff deviated from the published Myers algorithm in a way that led to it having the opposite preference in many cases, including that example. This is now fixed, meaning diffs output by JsDiff will more accurately reflect what the published Myers diff algorithm would output. - [#455](https://github.com/kpdecker/jsdiff/pull/455) The `added` and `removed` properties of change objects are now guaranteed to be set to a boolean value. (Previously, they would be set to `undefined` or omitted entirely instead of setting them to false.) 
- [#464](https://github.com/kpdecker/jsdiff/pull/464) Specifying `{maxEditLength: 0}` now sets a max edit length of 0 instead of no maximum. From 4fea333e3fa7f6fb29d2c1c490f397559f531263 Mon Sep 17 00:00:00 2001 From: Mark Amery Date: Fri, 8 Mar 2024 17:04:41 +0000 Subject: [PATCH 4/4] Add unit test of new behaviour --- test/diff/character.js | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/test/diff/character.js b/test/diff/character.js index e5e58564..4a731825 100644 --- a/test/diff/character.js +++ b/test/diff/character.js @@ -27,6 +27,13 @@ describe('diff/character', function() { }); }); + it('should treat a code point that consists of two UTF-16 code units as a single character, not two', function() { + const diffResult = diffChars('𝟘𝟙𝟚𝟛', '𝟘𝟙𝟚𝟜𝟝𝟞'); + expect(diffResult.length).to.equal(3); + expect(diffResult[2].count).to.equal(3); + expect(convertChangesToXML(diffResult)).to.equal('𝟘𝟙𝟚<del>𝟛</del><ins>𝟜𝟝𝟞</ins>'); + }); + describe('case insensitivity', function() { it("is considered when there's no difference", function() { const diffResult = diffChars('New Value.', 'New value.', {ignoreCase: true});