From b09babd9f4d22dc0c150280cb5a13a62c6b842c5 Mon Sep 17 00:00:00 2001 From: Mark Amery Date: Thu, 1 Aug 2024 12:17:37 +0100 Subject: [PATCH 1/5] Add Intl.Segmenter support and some initial tests. (Missing docs, coverage, release notes.) --- src/diff/word.js | 12 ++++++++++-- test/diff/word.js | 42 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 52 insertions(+), 2 deletions(-) diff --git a/src/diff/word.js b/src/diff/word.js index ba8c682e..580c189f 100644 --- a/src/diff/word.js +++ b/src/diff/word.js @@ -58,8 +58,16 @@ wordDiff.equals = function(left, right, options) { return left.trim() === right.trim(); }; -wordDiff.tokenize = function(value) { - let parts = value.match(tokenizeIncludingWhitespace) || []; +wordDiff.tokenize = function(value, options = {}) { + let parts; + if (options.intlSegmenter) { + if (options.intlSegmenter.resolvedOptions().granularity != 'word') { + throw new Error('The segmenter passed must have a granularity of "word"'); + } + parts = Array.from(options.intlSegmenter.segment(value), segment => segment.segment); + } else { + parts = value.match(tokenizeIncludingWhitespace) || []; + } const tokens = []; let prevPart = null; parts.forEach(part => { diff --git a/test/diff/word.js b/test/diff/word.js index 708ea555..7f4b6dbc 100644 --- a/test/diff/word.js +++ b/test/diff/word.js @@ -209,6 +209,48 @@ describe('WordDiff', function() { ); expect(convertChangesToXML(diffResult)).to.equal('foo \tbar'); }); + + it('supports tokenizing with an Intl.Segmenter', () => { + // Example 1: Diffing Chinese text with no spaces. + // I am not a Chinese speaker but I believe these sentences to mean: + // 1. "I have (我有) many (很多) tables (桌子)" + // 2. "Mei (梅) has (有) many (很多) sons (儿子)" + // We want to see that diffWords will get the word counts right and won't try to treat the + // trailing 子 as common to both texts (since it's part of a different word each time). + // TODO: Check with a Chinese speaker that this example is correct Chinese. 
+ const chineseSegmenter = new Intl.Segmenter('zh', {granularity: 'word'}); + const diffResult = diffWords('我有很多桌子。', '梅有很多儿子。', {intlSegmenter: chineseSegmenter}); + expect(diffResult).to.deep.equal([ + { count: 1, added: false, removed: true, value: '我有' }, + { count: 2, added: true, removed: false, value: '梅有' }, + { count: 1, added: false, removed: false, value: '很多' }, + { count: 1, added: false, removed: true, value: '桌子' }, + { count: 1, added: true, removed: false, value: '儿子' }, + { count: 1, added: false, removed: false, value: '。' } + ]); + + // Example 2: Should understand that a colon in the middle of a word is not a word break in + // Finnish (see https://stackoverflow.com/a/76402021/1709587) + const finnishSegmenter = new Intl.Segmenter('fi', {granularity: 'word'}); + expect(convertChangesToXML(diffWords( + 'USA:n nykyinen presidentti', + 'USA ja sen presidentti', + {intlSegmenter: finnishSegmenter} + ))).to.equal('USA:n nykyinenUSA ja sen presidentti'); + + // Example 3: Some English text, including contractions, long runs of arbitrary space, + // and punctuation, and using case insensitive mode, just to show all normal behaviour of + // diffWords still works with a segmenter + const englishSegmenter = new Intl.Segmenter('en', {granularity: 'word'}); + expect(convertChangesToXML(diffWords( + "There wasn't time \n \t for all that. He thought...", + "There isn't time \n \t left for all that, he thinks.", + {intlSegmenter: englishSegmenter, ignoreCase: true} + ))).to.equal( + "There wasn'tisn't time \n \t left " + + 'for all that., he thoughtthinks...' 
+ ); + }); }); describe('#diffWordsWithSpace', function() { From 714de4cb432619642ca07a0f26aae458519bf07f Mon Sep 17 00:00:00 2001 From: Mark Amery Date: Thu, 1 Aug 2024 13:03:40 +0100 Subject: [PATCH 2/5] Get to 100% coverage --- test/diff/word.js | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/test/diff/word.js b/test/diff/word.js index 7f4b6dbc..f81a4e45 100644 --- a/test/diff/word.js +++ b/test/diff/word.js @@ -251,6 +251,13 @@ describe('WordDiff', function() { + 'for all that., he thoughtthinks...' ); }); + + it('rejects attempts to use a non-word Intl.Segmenter', () => { + const segmenter = new Intl.Segmenter('en', {granularity: 'grapheme'}); + expect(() => { + diffWords('foo', 'bar', {intlSegmenter: segmenter}); + }).to['throw']('The segmenter passed must have a granularity of "word"'); + }); }); describe('#diffWordsWithSpace', function() { From b998ddda2102f5f57aa1b379d2fada4d92ee06a7 Mon Sep 17 00:00:00 2001 From: Mark Amery Date: Thu, 1 Aug 2024 13:10:30 +0100 Subject: [PATCH 3/5] Document intlSegmenter --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 81a96fe6..78f1a46b 100644 --- a/README.md +++ b/README.md @@ -39,6 +39,7 @@ Broadly, jsdiff's diff functions all take an old text and a new text and perform Options * `ignoreCase`: Same as in `diffChars`. Defaults to false. + * `intlSegmenter`: An optional [`Intl.Segmenter`](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Intl/Segmenter) object (which must have a `granularity` of `'word'`) for `diffWords` to use to split the text into words. The language of the segmenter can be whatever you want, so you can use this to support language-specific nuances like colons in the middle of words in Finnish and Swedish - at least to the extent that the `Intl.Segmenter` implementation you're using supports them. 
(By default, `diffWords` uses some crude regex logic for splitting text into words instead of an `Intl.Segmenter`, which will tend to give worse results than `Intl.Segmenter` would, but ensures the results are consistent across environments; `Intl.Segmenter` behaviour may one day differ significantly between browsers, since the spec is not at all prescriptive about what the actual segmenting rules in an implementation should be. If you want to use an `Intl.Segmenter` but ensure consistency across browsers, use a polyfill.) * `Diff.diffWordsWithSpace(oldStr, newStr[, options])` - diffs two blocks of text, treating each word, punctuation mark, newline, or run of (non-newline) whitespace as a token. From bd052a31cc11c2532d10c12861b7b904c53b0a02 Mon Sep 17 00:00:00 2001 From: Mark Amery Date: Thu, 1 Aug 2024 13:21:34 +0100 Subject: [PATCH 4/5] Improve docs --- README.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 78f1a46b..33d0629f 100644 --- a/README.md +++ b/README.md @@ -39,7 +39,11 @@ Broadly, jsdiff's diff functions all take an old text and a new text and perform Options * `ignoreCase`: Same as in `diffChars`. Defaults to false. - * `intlSegmenter`: An optional [`Intl.Segmenter`](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Intl/Segmenter) object (which must have a `granularity` of `'word'`) for `diffWords` to use to split the text into words. The language of the segmenter can be whatever you want, so you can use this to support language-specific nuances like colons in the middle of words in Finnish and Swedish - at least to the extent that the `Intl.Segmenter` implementation you're using supports them. 
(By default, `diffWords` uses some crude regex logic for splitting text into words instead of an `Intl.Segmenter`, which will tend to give worse results than `Intl.Segmenter` would, but ensures the results are consistent across environments; `Intl.Segmenter` behaviour may one day differ significantly between browsers, since the spec is not at all prescriptive about what the actual segmenting rules in an implementation should be. If you want to use an `Intl.Segmenter` but ensure consistency across browsers, use a polyfill.) + * `intlSegmenter`: An optional [`Intl.Segmenter`](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Intl/Segmenter) object (which must have a `granularity` of `'word'`) for `diffWords` to use to split the text into words. + + By default, `diffWords` does not use an `Intl.Segmenter`, just some regexes for splitting text into words. This will tend to give worse results than `Intl.Segmenter` would, but ensures the results are consistent across environments; `Intl.Segmenter` behaviour is only loosely specced and the implementations in browsers could in principle change dramatically in future. If you want to use `diffWords` with an `Intl.Segmenter` but ensure it behaves the same whatever environment you run it in, use an `Intl.Segmenter` polyfill instead of the JavaScript engine's native `Intl.Segmenter` implementation. + + Using an `Intl.Segmenter` should allow better word-level diffing of non-English text than the default behaviour. For instance, `Intl.Segmenter`s can generally identify via built-in dictionaries which sequences of adjacent Chinese characters form words, allowing word-level diffing of Chinese. By specifying a language when instantiating the segmenter (e.g. 
`new Intl.Segmenter('sv', {granularity: 'word'})`) you can also support language-specific rules, like treating Swedish's colon-separated contractions (like *k:a* for *kyrka*) as single words; by default this would be seen as two words separated by a colon. * `Diff.diffWordsWithSpace(oldStr, newStr[, options])` - diffs two blocks of text, treating each word, punctuation mark, newline, or run of (non-newline) whitespace as a token. From e7033079d2dfabf8f4a673f28525fec1838c8dad Mon Sep 17 00:00:00 2001 From: Mark Amery Date: Thu, 1 Aug 2024 13:26:50 +0100 Subject: [PATCH 5/5] Add release notes --- release-notes.md | 1 + 1 file changed, 1 insertion(+) diff --git a/release-notes.md b/release-notes.md index 7e237cb6..9966de83 100644 --- a/release-notes.md +++ b/release-notes.md @@ -34,6 +34,7 @@ * The context line immediately before and immediately after an insertion must match exactly between the hunk and the file for a hunk to apply. (Previously this was not required.) - [#535](https://github.com/kpdecker/jsdiff/pull/535) **A bug in patch generation functions is now fixed** that would sometimes previously cause `\ No newline at end of file` to appear in the wrong place in the generated patch, resulting in the patch being invalid. - [#535](https://github.com/kpdecker/jsdiff/pull/535) **Passing `newlineIsToken: true` to *patch*-generation functions is no longer allowed.** (Passing it to `diffLines` is still supported - it's only functions like `createPatch` where passing `newlineIsToken` is now an error.) Allowing it to be passed never really made sense, since in cases where the option had any effect on the output at all, the effect tended to be causing a garbled patch to be created that couldn't actually be applied to the source file. +- [#539](https://github.com/kpdecker/jsdiff/pull/539) **`diffWords` now takes an optional `intlSegmenter` option** which should be an `Intl.Segmenter` with word-level granularity. 
This provides better tokenization of text into words than the default behaviour, even for English but especially for some other languages for which the default behaviour is poor. ## v5.2.0