Skip to content

Commit

Permalink
Remove language codes from text strings.
Browse files Browse the repository at this point in the history
And take care to have an even number of bytes with utf16 strings.
  • Loading branch information
calixteman committed Nov 25, 2023
1 parent 5831636 commit eb5f610
Show file tree
Hide file tree
Showing 2 changed files with 47 additions and 2 deletions.
23 changes: 21 additions & 2 deletions src/shared/util.js
Original file line number Diff line number Diff line change
Expand Up @@ -905,12 +905,21 @@ const PDFStringTranslateTable = [
];

function stringToPDFString(str) {
// See section 7.9.2.2 Text String Type.
// The string can contain some language codes bracketed with 0x0b,
// so we must remove them.
if (str[0] >= "\xEF") {
let encoding;
if (str[0] === "\xFE" && str[1] === "\xFF") {
encoding = "utf-16be";
if (str.length % 2 === 1) {
str = str.slice(0, -1);
}
} else if (str[0] === "\xFF" && str[1] === "\xFE") {
encoding = "utf-16le";
if (str.length % 2 === 1) {
str = str.slice(0, -1);
}
} else if (str[0] === "\xEF" && str[1] === "\xBB" && str[2] === "\xBF") {
encoding = "utf-8";
}
Expand All @@ -919,7 +928,11 @@ function stringToPDFString(str) {
try {
const decoder = new TextDecoder(encoding, { fatal: true });
const buffer = stringToBytes(str);
return decoder.decode(buffer);
const decoded = decoder.decode(buffer);
if (!decoded.includes("\x1b")) {
return decoded;
}
return decoded.replaceAll(/\x1b[^\x1b]*(?:\x1b|$)/g, "");
} catch (ex) {
warn(`stringToPDFString: "${ex}".`);
}
Expand All @@ -928,7 +941,13 @@ function stringToPDFString(str) {
// ISO Latin 1
const strBuf = [];
for (let i = 0, ii = str.length; i < ii; i++) {
const code = PDFStringTranslateTable[str.charCodeAt(i)];
const charCode = str.charCodeAt(i);
if (charCode === 0x1b) {
// eslint-disable-next-line no-empty
while (++i < ii && str.charCodeAt(i) !== 0x1b) {}
continue;
}
const code = PDFStringTranslateTable[charCode];
strBuf.push(code ? String.fromCharCode(code) : str.charAt(i));
}
return strBuf.join("");
Expand Down
26 changes: 26 additions & 0 deletions test/unit/util_spec.js
Original file line number Diff line number Diff line change
Expand Up @@ -99,11 +99,21 @@ describe("util", function () {
expect(stringToPDFString(str)).toEqual("string");
});

it("handles incomplete UTF-16 big-endian strings", function () {
const str = "\xFE\xFF\x00\x73\x00\x74\x00\x72\x00\x69\x00\x6E\x00";
expect(stringToPDFString(str)).toEqual("strin");
});

it("handles UTF-16 little-endian strings", function () {
const str = "\xFF\xFE\x73\x00\x74\x00\x72\x00\x69\x00\x6E\x00\x67\x00";
expect(stringToPDFString(str)).toEqual("string");
});

it("handles incomplete UTF-16 little-endian strings", function () {
const str = "\xFF\xFE\x73\x00\x74\x00\x72\x00\x69\x00\x6E\x00\x67";
expect(stringToPDFString(str)).toEqual("strin");
});

it("handles UTF-8 strings", function () {
const simpleStr = "\xEF\xBB\xBF\x73\x74\x72\x69\x6E\x67";
expect(stringToPDFString(simpleStr)).toEqual("string");
Expand Down Expand Up @@ -134,6 +144,22 @@ describe("util", function () {
const str4 = "\xEF\xBB\xBF";
expect(stringToPDFString(str4)).toEqual("");
});

it("handles strings with language code", function () {
// ISO Latin 1
const str1 = "hello \x1benUS\x1bworld";
expect(stringToPDFString(str1)).toEqual("hello world");

// UTF-16BE
const str2 =
"\xFE\xFF\x00h\x00e\x00l\x00l\x00o\x00 \x00\x1b\x00e\x00n\x00U\x00S\x00\x1b\x00w\x00o\x00r\x00l\x00d";
expect(stringToPDFString(str2)).toEqual("hello world");

// UTF-16LE
const str3 =
"\xFF\xFEh\x00e\x00l\x00l\x00o\x00 \x00\x1b\x00e\x00n\x00U\x00S\x00\x1b\x00w\x00o\x00r\x00l\x00d\x00";
expect(stringToPDFString(str3)).toEqual("hello world");
});
});

describe("ReadableStream", function () {
Expand Down

0 comments on commit eb5f610

Please sign in to comment.