Skip to content
This repository was archived by the owner on Aug 5, 2024. It is now read-only.
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

Commit a5f35ab

Browse files
committed Dec 13, 2019
Handle cases where we delete characters
In the previous iteration of this patch, we only properly handled cases where a new surrogate pair was inserted between two existing pairs whose high surrogates all matched. Unfortunately, when swapping characters or performing any edit that deletes a surrogate pair, the patch failed because it only carried the trailing high surrogate over to the next group instead of distributing it to any insert _and_ delete groups following an equality group. In this patch, I've updated the JavaScript library to properly distribute the trailing high surrogate.
1 parent ea303f2 commit a5f35ab

File tree

3 files changed

+109
-59
lines changed

3 files changed

+109
-59
lines changed
 

‎javascript/diff_match_patch.js

Lines changed: 53 additions & 52 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

‎javascript/diff_match_patch_uncompressed.js

Lines changed: 18 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -1366,28 +1366,40 @@ diff_match_patch.prototype.diff_toDelta = function(diffs) {
13661366
var thisTop = thisDiff[1][0];
13671367
var thisEnd = thisDiff[1][thisDiff[1].length - 1];
13681368

1369+
if (0 === thisDiff[1].length) {
1370+
continue;
1371+
}
1372+
1373+
// trap a trailing high-surrogate so we can
1374+
// distribute it to the successive edits
13691375
if (thisEnd && this.isHighSurrogate(thisEnd)) {
1376+
lastEnd = thisEnd;
13701377
thisDiff[1] = thisDiff[1].slice(0, -1);
13711378
}
13721379

13731380
if (lastEnd && thisTop && this.isHighSurrogate(lastEnd) && this.isLowSurrogate(thisTop)) {
13741381
thisDiff[1] = lastEnd + thisDiff[1];
13751382
}
13761383

1377-
lastEnd = thisEnd;
1378-
if ( 0 === thisDiff[1].length ) {
1384+
// we have to carry the surrogate half through
1385+
// any successive insert/delete edits
1386+
if (DIFF_EQUAL === thisDiff[0]) {
1387+
lastEnd = thisEnd;
1388+
}
1389+
1390+
if (0 === thisDiff[1].length) {
13791391
continue;
13801392
}
13811393

1382-
switch (diffs[x][0]) {
1394+
switch (thisDiff[0]) {
13831395
case DIFF_INSERT:
1384-
text[x] = '+' + encodeURI(diffs[x][1]);
1396+
text.push('+' + encodeURI(thisDiff[1]));
13851397
break;
13861398
case DIFF_DELETE:
1387-
text[x] = '-' + diffs[x][1].length;
1399+
text.push('-' + thisDiff[1].length);
13881400
break;
13891401
case DIFF_EQUAL:
1390-
text[x] = '=' + diffs[x][1].length;
1402+
text.push('=' + thisDiff[1].length);
13911403
break;
13921404
}
13931405
}

‎javascript/tests/diff_match_patch_test.js

Lines changed: 38 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -548,7 +548,44 @@ function testDiffDelta() {
548548
newText = applyRandomTextEdit(originalText);
549549
dmp.patch_toText(dmp.patch_make(originalText, newText));
550550
}
551-
});
551+
})();
552+
553+
// Unicode - splitting surrogates
554+
try {
555+
assertEquivalent(
556+
dmp.diff_toDelta([[DIFF_EQUAL,'\ud83c\udd70'], [DIFF_INSERT, '\ud83c\udd70'], [DIFF_EQUAL, '\ud83c\udd71']]),
557+
dmp.diff_toDelta(dmp.diff_main('\ud83c\udd70\ud83c\udd71', '\ud83c\udd70\ud83c\udd70\ud83c\udd71'))
558+
);
559+
} catch ( e ) {
560+
assertEquals('Inserting similar surrogate pair', 'crashed');
561+
}
562+
563+
try {
564+
assertEquivalent(
565+
dmp.diff_toDelta([[DIFF_DELETE, '\ud83c\udd70'], [DIFF_INSERT, '\ud83c\udd71']]),
566+
dmp.diff_toDelta([[DIFF_EQUAL, '\ud83c'], [DIFF_DELETE, '\udd70'], [DIFF_INSERT, '\udd71']]),
567+
);
568+
} catch ( e ) {
569+
assertEquals('Swap surrogate pair', 'crashed');
570+
}
571+
572+
try {
573+
assertEquivalent(
574+
dmp.diff_toDelta([[DIFF_INSERT, '\ud83c\udd70'], [DIFF_DELETE, '\ud83c\udd71']]),
575+
dmp.diff_toDelta([[DIFF_EQUAL, '\ud83c'], [DIFF_INSERT, '\udd70'], [DIFF_DELETE, '\udd71']]),
576+
);
577+
} catch ( e ) {
578+
assertEquals('Swap surrogate pair', 'crashed');
579+
}
580+
581+
// Empty diff groups
582+
assertEquivalent(
583+
JSON.stringify(dmp.diff_toDelta([[DIFF_EQUAL, 'abcdef'], [DIFF_DELETE, ''], [DIFF_INSERT, 'ghijk']])),
584+
JSON.stringify(dmp.diff_toDelta([[DIFF_EQUAL, 'abcdef'], [DIFF_INSERT, 'ghijk']])),
585+
);
586+
587+
// Invalid UTF8 but valid surrogate pairs
588+
552589

553590
// Verify pool of unchanged characters.
554591
diffs = [[DIFF_INSERT, 'A-Z a-z 0-9 - _ . ! ~ * \' ( ) ; / ? : @ & = + $ , # ']];

0 commit comments

Comments
 (0)
This repository has been archived.