From eb5f610d18a0613fce115452929ee3b4349eb38e Mon Sep 17 00:00:00 2001 From: Calixte Denizet Date: Fri, 24 Nov 2023 20:17:15 +0100 Subject: [PATCH] Remove language codes from text strings. And take care to have an even number of bytes with utf16 strings. --- src/shared/util.js | 23 +++++++++++++++++++++-- test/unit/util_spec.js | 26 ++++++++++++++++++++++++++ 2 files changed, 47 insertions(+), 2 deletions(-) diff --git a/src/shared/util.js b/src/shared/util.js index 986931a051938..f2cd2446f5d0a 100644 --- a/src/shared/util.js +++ b/src/shared/util.js @@ -905,12 +905,21 @@ const PDFStringTranslateTable = [ ]; function stringToPDFString(str) { + // See section 7.9.2.2 Text String Type. + // The string can contain some language codes bracketed with 0x1b, + // so we must remove them. if (str[0] >= "\xEF") { let encoding; if (str[0] === "\xFE" && str[1] === "\xFF") { encoding = "utf-16be"; + if (str.length % 2 === 1) { + str = str.slice(0, -1); + } } else if (str[0] === "\xFF" && str[1] === "\xFE") { encoding = "utf-16le"; + if (str.length % 2 === 1) { + str = str.slice(0, -1); + } } else if (str[0] === "\xEF" && str[1] === "\xBB" && str[2] === "\xBF") { encoding = "utf-8"; } @@ -919,7 +928,11 @@ function stringToPDFString(str) { try { const decoder = new TextDecoder(encoding, { fatal: true }); const buffer = stringToBytes(str); - return decoder.decode(buffer); + const decoded = decoder.decode(buffer); + if (!decoded.includes("\x1b")) { + return decoded; + } + return decoded.replaceAll(/\x1b[^\x1b]*(?:\x1b|$)/g, ""); } catch (ex) { warn(`stringToPDFString: "${ex}".`); } @@ -928,7 +941,13 @@ function stringToPDFString(str) { // ISO Latin 1 const strBuf = []; for (let i = 0, ii = str.length; i < ii; i++) { - const code = PDFStringTranslateTable[str.charCodeAt(i)]; + const charCode = str.charCodeAt(i); + if (charCode === 0x1b) { + // eslint-disable-next-line no-empty + while (++i < ii && str.charCodeAt(i) !== 0x1b) {} + continue; + } + const code = 
PDFStringTranslateTable[charCode]; strBuf.push(code ? String.fromCharCode(code) : str.charAt(i)); } return strBuf.join(""); diff --git a/test/unit/util_spec.js b/test/unit/util_spec.js index ed6936e1a6b57..bae97af16b9f4 100644 --- a/test/unit/util_spec.js +++ b/test/unit/util_spec.js @@ -99,11 +99,21 @@ describe("util", function () { expect(stringToPDFString(str)).toEqual("string"); }); + it("handles incomplete UTF-16 big-endian strings", function () { + const str = "\xFE\xFF\x00\x73\x00\x74\x00\x72\x00\x69\x00\x6E\x00"; + expect(stringToPDFString(str)).toEqual("strin"); + }); + it("handles UTF-16 little-endian strings", function () { const str = "\xFF\xFE\x73\x00\x74\x00\x72\x00\x69\x00\x6E\x00\x67\x00"; expect(stringToPDFString(str)).toEqual("string"); }); + it("handles incomplete UTF-16 little-endian strings", function () { + const str = "\xFF\xFE\x73\x00\x74\x00\x72\x00\x69\x00\x6E\x00\x67"; + expect(stringToPDFString(str)).toEqual("strin"); + }); + it("handles UTF-8 strings", function () { const simpleStr = "\xEF\xBB\xBF\x73\x74\x72\x69\x6E\x67"; expect(stringToPDFString(simpleStr)).toEqual("string"); @@ -134,6 +144,22 @@ describe("util", function () { const str4 = "\xEF\xBB\xBF"; expect(stringToPDFString(str4)).toEqual(""); }); + + it("handles strings with language code", function () { + // ISO Latin 1 + const str1 = "hello \x1benUS\x1bworld"; + expect(stringToPDFString(str1)).toEqual("hello world"); + + // UTF-16BE + const str2 = + "\xFE\xFF\x00h\x00e\x00l\x00l\x00o\x00 \x00\x1b\x00e\x00n\x00U\x00S\x00\x1b\x00w\x00o\x00r\x00l\x00d"; + expect(stringToPDFString(str2)).toEqual("hello world"); + + // UTF-16LE + const str3 = + "\xFF\xFEh\x00e\x00l\x00l\x00o\x00 \x00\x1b\x00e\x00n\x00U\x00S\x00\x1b\x00w\x00o\x00r\x00l\x00d\x00"; + expect(stringToPDFString(str3)).toEqual("hello world"); + }); }); describe("ReadableStream", function () {