Skip to content

Commit

Permalink
[api-minor] Don't normalize the text used in the text layer.
Browse files Browse the repository at this point in the history
Some Arabic chars like \ufe94 could be searched in a pdf, hence they must be normalized
when creating the search query. So to avoid duplicating the normalization code,
everything is moved into the find controller.
The previous code to normalize text was using NFKC but with a hardcoded map, hence it
has been replaced by the use of normalize("NFKC") (it helps to reduce the bundle size
by 30kb).
In playing with this \ufe94 char, I noticed that the bidi algorithm wasn't taking into
account some RTL unicode ranges, the generated font wasn't embedding the mapping for this
char, and the unicode ranges in the OS/2 table weren't up-to-date.

When normalized, some chars can be replaced by several ones, which led to
some extra chars in the text layer. To avoid any regression, when copying some text
from the text layer, the copied string is normalized (NFKC) before being put in the
clipboard (it works like this in either Acrobat or Chrome).
  • Loading branch information
calixteman committed Apr 17, 2023
1 parent 3e08eee commit 117bbf7
Show file tree
Hide file tree
Showing 22 changed files with 447 additions and 1,672 deletions.
6 changes: 5 additions & 1 deletion src/core/bidi.js
Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,11 @@ function bidi(str, startLevel = -1, vertical = false) {
if (!charType) {
warn("Bidi: invalid Unicode character " + charCode.toString(16));
}
} else if (0x0700 <= charCode && charCode <= 0x08ac) {
} else if (
(0x0700 <= charCode && charCode <= 0x08ac) ||
(0xfb50 <= charCode && charCode <= 0xfdff) ||
(0xfe70 <= charCode && charCode <= 0xfeff)
) {
charType = "AL";
}
if (charType === "R" || charType === "AL" || charType === "AN") {
Expand Down
9 changes: 8 additions & 1 deletion src/core/document.js
Original file line number Diff line number Diff line change
Expand Up @@ -511,7 +511,13 @@ class Page {
});
}

extractTextContent({ handler, task, includeMarkedContent, sink }) {
extractTextContent({
handler,
task,
includeMarkedContent,
disableNormalization,
sink,
}) {
const contentStreamPromise = this.getContentStream();
const resourcesPromise = this.loadResources([
"ExtGState",
Expand Down Expand Up @@ -539,6 +545,7 @@ class Page {
task,
resources: this.resources,
includeMarkedContent,
disableNormalization,
sink,
viewBox: this.view,
});
Expand Down
10 changes: 8 additions & 2 deletions src/core/evaluator.js
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ import {
IDENTITY_MATRIX,
info,
isArrayEqual,
normalizeUnicode,
OPS,
shadow,
stringToPDFString,
Expand Down Expand Up @@ -2271,6 +2272,7 @@ class PartialEvaluator {
seenStyles = new Set(),
viewBox,
markedContentData = null,
disableNormalization = false,
}) {
// Ensure that `resources`/`stateManager` is correctly initialized,
// even if the provided parameter is e.g. `null`.
Expand Down Expand Up @@ -2524,7 +2526,10 @@ class PartialEvaluator {
}

function runBidiTransform(textChunk) {
const text = textChunk.str.join("");
let text = textChunk.str.join("");
if (!disableNormalization) {
text = normalizeUnicode(text);
}
const bidiResult = bidi(text, -1, textChunk.vertical);
return {
str: bidiResult.str,
Expand Down Expand Up @@ -2859,7 +2864,7 @@ class PartialEvaluator {
textChunk.prevTransform = getCurrentTextTransform();
}

const glyphUnicode = glyph.normalizedUnicode;
const glyphUnicode = glyph.unicode;
if (saveLastChar(glyphUnicode)) {
// The two last chars are a non-whitespace followed by a whitespace
// and then this non-whitespace, so we insert a whitespace here.
Expand Down Expand Up @@ -3242,6 +3247,7 @@ class PartialEvaluator {
seenStyles,
viewBox,
markedContentData,
disableNormalization,
})
.then(function () {
if (!sinkWrapper.enqueueInvoked) {
Expand Down
32 changes: 6 additions & 26 deletions src/core/fonts.js
Original file line number Diff line number Diff line change
Expand Up @@ -33,11 +33,9 @@ import {
} from "./fonts_utils.js";
import {
getCharUnicodeCategory,
getNormalizedUnicodes,
getUnicodeForGlyph,
getUnicodeRangeFor,
mapSpecialUnicodeValues,
reverseIfRtl,
} from "./unicode.js";
import { getDingbatsGlyphsUnicode, getGlyphsUnicode } from "./glyphlist.js";
import {
Expand Down Expand Up @@ -277,24 +275,6 @@ class Glyph {
/* nonSerializable = */ true
);
}

/**
* This property, which is only used by `PartialEvaluator.getTextContent`,
* is purposely made non-serializable.
* @type {string}
*/
get normalizedUnicode() {
return shadow(
this,
"normalizedUnicode",
reverseIfRtl(Glyph._NormalizedUnicodes[this.unicode] || this.unicode),
/* nonSerializable = */ true
);
}

static get _NormalizedUnicodes() {
return shadow(this, "_NormalizedUnicodes", getNormalizedUnicodes());
}
}

function int16(b0, b1) {
Expand Down Expand Up @@ -507,6 +487,9 @@ function adjustMapping(charCodeToGlyphId, hasGlyph, newGlyphZeroId, toUnicode) {
const privateUseOffetStart = PRIVATE_USE_AREAS[privateUseAreaIndex][0];
let nextAvailableFontCharCode = privateUseOffetStart;
let privateUseOffetEnd = PRIVATE_USE_AREAS[privateUseAreaIndex][1];
const isInPrivateArea = code =>
(PRIVATE_USE_AREAS[0][0] <= code && code <= PRIVATE_USE_AREAS[0][1]) ||
(PRIVATE_USE_AREAS[1][0] <= code && code <= PRIVATE_USE_AREAS[1][1]);
for (let originalCharCode in charCodeToGlyphId) {
originalCharCode |= 0;
let glyphId = charCodeToGlyphId[originalCharCode];
Expand Down Expand Up @@ -539,11 +522,7 @@ function adjustMapping(charCodeToGlyphId, hasGlyph, newGlyphZeroId, toUnicode) {
if (typeof unicode === "string") {
unicode = unicode.codePointAt(0);
}
if (
unicode &&
unicode < privateUseOffetStart &&
!usedGlyphIds.has(glyphId)
) {
if (unicode && !isInPrivateArea(unicode) && !usedGlyphIds.has(glyphId)) {
toUnicodeExtraMap.set(unicode, glyphId);
usedGlyphIds.add(glyphId);
}
Expand Down Expand Up @@ -785,6 +764,7 @@ function createOS2Table(properties, charstrings, override) {

let firstCharIndex = null;
let lastCharIndex = 0;
let position = -1;

if (charstrings) {
for (let code in charstrings) {
Expand All @@ -796,7 +776,7 @@ function createOS2Table(properties, charstrings, override) {
lastCharIndex = code;
}

const position = getUnicodeRangeFor(code);
position = getUnicodeRangeFor(code, position);
if (position < 32) {
ulUnicodeRange1 |= 1 << position;
} else if (position < 64) {
Expand Down
Loading

0 comments on commit 117bbf7

Please sign in to comment.