From 7e06c40c0f19a0008baa885a7740f8968866e2f0 Mon Sep 17 00:00:00 2001 From: Andrew Dillon Date: Tue, 1 Oct 2019 14:46:52 -0500 Subject: [PATCH] Add metadata methods to PDFDocument (#204) * UTF-8 encode strings * Add unit tests for utf8Encode * Add another utf8Encode test * Supporting UTF-16 encoding of PDFHexStrings * Add tests for PDFHexString and PDFString * Add PDFDocument metadata methods * Add doc comments to PDFDocument metadata methods * Update scratchpad * Update test apps --- apps/node/tests/test1.ts | 9 + apps/rn/src/tests/test1.js | 9 + apps/web/test1.html | 9 + scratchpad/index.ts | 35 +++- src/api/PDFDocument.ts | 144 ++++++++++++- src/api/PDFPage.ts | 1 + src/core/embedders/CMap.ts | 32 +-- src/core/objects/PDFHexString.ts | 17 +- src/core/objects/PDFString.ts | 12 +- src/utils/index.ts | 1 + src/utils/unicode.ts | 262 ++++++++++++++++++++++++ tests/core/objects/PDFHexString.spec.ts | 10 + tests/core/objects/PDFString.spec.ts | 8 + tests/utils/unicode.spec.ts | 162 +++++++++++++++ 14 files changed, 681 insertions(+), 30 deletions(-) create mode 100644 src/utils/unicode.ts create mode 100644 tests/utils/unicode.spec.ts diff --git a/apps/node/tests/test1.ts b/apps/node/tests/test1.ts index bf46284c8..40b494689 100644 --- a/apps/node/tests/test1.ts +++ b/apps/node/tests/test1.ts @@ -32,6 +32,15 @@ const ipsumLines = [ export default async (assets: Assets) => { const pdfDoc = await PDFDocument.create(); + pdfDoc.setTitle('🥚 The Life of an Egg 🍳'); + pdfDoc.setAuthor('Humpty Dumpty'); + pdfDoc.setSubject('📘 An Epic Tale of Woe 📖'); + pdfDoc.setKeywords(['eggs', 'wall', 'fall', 'king', 'horses', 'men']); + pdfDoc.setProducer('PDF App 9000 🤖'); + pdfDoc.setCreator('PDF App 9000 🤖'); + pdfDoc.setCreationDate(new Date('2018-06-24T01:58:37.228Z')); + pdfDoc.setModificationDate(new Date('2018-12-21T07:00:11.000Z')); + pdfDoc.registerFontkit(fontkit); const timesRomanFont = await pdfDoc.embedFont(StandardFonts.TimesRoman); diff --git a/apps/rn/src/tests/test1.js 
b/apps/rn/src/tests/test1.js index fc1645747..fa807d595 100644 --- a/apps/rn/src/tests/test1.js +++ b/apps/rn/src/tests/test1.js @@ -33,6 +33,15 @@ const ipsumLines = [ export default async () => { const pdfDoc = await PDFDocument.create(); + pdfDoc.setTitle('🥚 The Life of an Egg 🍳'); + pdfDoc.setAuthor('Humpty Dumpty'); + pdfDoc.setSubject('📘 An Epic Tale of Woe 📖'); + pdfDoc.setKeywords(['eggs', 'wall', 'fall', 'king', 'horses', 'men']); + pdfDoc.setProducer('PDF App 9000 🤖'); + pdfDoc.setCreator('PDF App 9000 🤖'); + pdfDoc.setCreationDate(new Date('2018-06-24T01:58:37.228Z')); + pdfDoc.setModificationDate(new Date('2018-12-21T07:00:11.000Z')); + pdfDoc.registerFontkit(fontkit); const timesRomanFont = await pdfDoc.embedFont(StandardFonts.TimesRoman); diff --git a/apps/web/test1.html b/apps/web/test1.html index bbeedd24f..db022e1bc 100644 --- a/apps/web/test1.html +++ b/apps/web/test1.html @@ -81,6 +81,15 @@ const pdfDoc = await PDFDocument.create(); + pdfDoc.setTitle('🥚 The Life of an Egg 🍳'); + pdfDoc.setAuthor('Humpty Dumpty'); + pdfDoc.setSubject('📘 An Epic Tale of Woe 📖'); + pdfDoc.setKeywords(['eggs', 'wall', 'fall', 'king', 'horses', 'men']); + pdfDoc.setProducer('PDF App 9000 🤖'); + pdfDoc.setCreator('PDF App 9000 🤖'); + pdfDoc.setCreationDate(new Date('2018-06-24T01:58:37.228Z')); + pdfDoc.setModificationDate(new Date('2018-12-21T07:00:11.000Z')); + pdfDoc.registerFontkit(fontkit); const timesRomanFont = await pdfDoc.embedFont(StandardFonts.TimesRoman); diff --git a/scratchpad/index.ts b/scratchpad/index.ts index 71f8957fb..08d8c97b0 100644 --- a/scratchpad/index.ts +++ b/scratchpad/index.ts @@ -1,15 +1,40 @@ import fs from 'fs'; import { openPdf, Reader } from './open'; -import { PDFDocument } from 'src/index'; +import { PDFDocument, StandardFonts } from 'src/index'; (async () => { - const pdfDoc = await PDFDocument.load( - fs.readFileSync('assets/pdfs/normal.pdf'), + const pdfDoc = await PDFDocument.create(); + + const timesRomanFont = await 
pdfDoc.embedFont(StandardFonts.TimesRoman); + const helveticaFont = await pdfDoc.embedFont(StandardFonts.Helvetica); + + const page = pdfDoc.addPage([500, 600]); + + page.setFont(timesRomanFont); + page.drawText('The Life of an Egg', { x: 60, y: 500, size: 50 }); + page.drawText('An Epic Tale of Woe', { x: 125, y: 460, size: 25 }); + + page.setFont(helveticaFont); + page.drawText( + [ + 'Humpty Dumpty sat on a wall', + 'Humpty Dumpty had a great fall;', + `All the king's horses and all the king's men`, + `Couldn't put Humpty together again.`, + ].join('\n'), + { x: 75, y: 275, size: 20, lineHeight: 25 }, ); + page.drawText('- Humpty Dumpty', { x: 250, y: 150, size: 20 }); - console.log('Count:', pdfDoc.getPageCount()); - pdfDoc.removePage(1); + pdfDoc.setTitle('🥚 The Life of an Egg 🍳'); + pdfDoc.setAuthor('Humpty Dumpty'); + pdfDoc.setSubject('📘 An Epic Tale of Woe 📖'); + pdfDoc.setKeywords(['eggs', 'wall', 'fall', 'king', 'horses', 'men']); + pdfDoc.setProducer('PDF App 9000 🤖'); + pdfDoc.setCreator('pdf-lib (https://github.com/Hopding/pdf-lib)'); + pdfDoc.setCreationDate(new Date('2018-06-24T01:58:37.228Z')); + pdfDoc.setModificationDate(new Date('2019-12-21T07:00:11.000Z')); const pdfBytes = await pdfDoc.save(); diff --git a/src/api/PDFDocument.ts b/src/api/PDFDocument.ts index 9eaf22f7c..a379f12e1 100644 --- a/src/api/PDFDocument.ts +++ b/src/api/PDFDocument.ts @@ -15,11 +15,15 @@ import { JpegEmbedder, PDFCatalog, PDFContext, + PDFDict, + PDFHexString, + PDFName, PDFObjectCopier, PDFPageLeaf, PDFPageTree, PDFParser, PDFStreamWriter, + PDFString, PDFWriter, PngEmbedder, StandardFontEmbedder, @@ -182,6 +186,8 @@ export default class PDFDocument { this.images = []; if (!ignoreEncryption && this.isEncrypted) throw new EncryptedPDFError(); + + this.updateInfoDict(); } /** @@ -197,10 +203,123 @@ export default class PDFDocument { this.fontkit = fontkit; } + /** + * Set this document's title metadata. 
The title will appear in the + * "Document Properties" section of most PDF readers. For example: + * ```js + * pdfDoc.setTitle('🥚 The Life of an Egg 🍳') + * ``` + * @param title The title of this document. + */ + setTitle(title: string): void { + assertIs(title, 'title', ['string']); + const key = PDFName.of('Title'); + this.getInfoDict().set(key, PDFHexString.fromText(title)); + } + + /** + * Set this document's author metadata. The author will appear in the + * "Document Properties" section of most PDF readers. For example: + * ```js + * pdfDoc.setAuthor('Humpty Dumpty') + * ``` + * @param author The author of this document. + */ + setAuthor(author: string): void { + assertIs(author, 'author', ['string']); + const key = PDFName.of('Author'); + this.getInfoDict().set(key, PDFHexString.fromText(author)); + } + + /** + * Set this document's subject metadata. The subject will appear in the + * "Document Properties" section of most PDF readers. For example: + * ```js + * pdfDoc.setSubject('📘 An Epic Tale of Woe 📖') + * ``` + * @param subject The subject of this document. + */ + setSubject(subject: string): void { + assertIs(subject, 'subject', ['string']); + const key = PDFName.of('Subject'); + this.getInfoDict().set(key, PDFHexString.fromText(subject)); + } + + /** + * Set this document's keyword metadata. These keywords will appear in the + * "Document Properties" section of most PDF readers. For example: + * ```js + * pdfDoc.setKeywords(['eggs', 'wall', 'fall', 'king', 'horses', 'men']) + * ``` + * @param keywords An array of keywords associated with this document. + */ + setKeywords(keywords: string[]): void { + assertIs(keywords, 'keywords', [Array]); + const key = PDFName.of('Keywords'); + this.getInfoDict().set(key, PDFHexString.fromText(keywords.join(' '))); + } + + /** + * Set this document's creator metadata. The creator will appear in the + * "Document Properties" section of most PDF readers.
For example: + * ```js + * pdfDoc.setCreator('PDF App 9000 🤖') + * ``` + * @param creator The creator of this document. + */ + setCreator(creator: string): void { + assertIs(creator, 'creator', ['string']); + const key = PDFName.of('Creator'); + this.getInfoDict().set(key, PDFHexString.fromText(creator)); + } + + /** + * Set this document's producer metadata. The producer will appear in the + * "Document Properties" section of most PDF readers. For example: + * ```js + * pdfDoc.setProducer('PDF App 9000 🤖') + * ``` + * @param producer The producer of this document. + */ + setProducer(producer: string): void { + assertIs(producer, 'producer', ['string']); + const key = PDFName.of('Producer'); + this.getInfoDict().set(key, PDFHexString.fromText(producer)); + } + + /** + * Set this document's creation date metadata. The creation date will appear + * in the "Document Properties" section of most PDF readers. For example: + * ```js + * pdfDoc.setCreationDate(new Date()) + * ``` + * @param creationDate The date this document was created. + */ + setCreationDate(creationDate: Date): void { + assertIs(creationDate, 'creationDate', [[Date, 'Date']]); + const key = PDFName.of('CreationDate'); + this.getInfoDict().set(key, PDFString.fromDate(creationDate)); + } + + /** + * Set this document's modification date metadata. The modification date will + * appear in the "Document Properties" section of most PDF readers. For + * example: + * ```js + * pdfDoc.setModificationDate(new Date()) + * ``` + * @param modificationDate The date this document was last modified. + */ + setModificationDate(modificationDate: Date): void { + assertIs(modificationDate, 'modificationDate', [[Date, 'Date']]); + const key = PDFName.of('ModDate'); + this.getInfoDict().set(key, PDFString.fromDate(modificationDate)); + } + /** * Get the number of pages contained in this document.
For example: * ```js - * const totalPages = pdfDoc.getPageCount(); + * const totalPages = pdfDoc.getPageCount() * ``` * @returns The number of pages in this document. */ @@ -639,6 +758,29 @@ export default class PDFDocument { return dataUri ? `data:application/pdf;base64,${base64}` : base64; } + private updateInfoDict(): void { + const pdfLib = `pdf-lib (https://github.com/Hopding/pdf-lib)`; + const now = new Date(); + + const info = this.getInfoDict(); + + this.setProducer(pdfLib); + this.setModificationDate(now); + + if (!info.get(PDFName.of('Creator'))) this.setCreator(pdfLib); + if (!info.get(PDFName.of('CreationDate'))) this.setCreationDate(now); + } + + private getInfoDict(): PDFDict { + const existingInfo = this.context.lookup(this.context.trailerInfo.Info); + if (existingInfo instanceof PDFDict) return existingInfo; + + const newInfo = this.context.obj({}); + this.context.trailerInfo.Info = this.context.register(newInfo); + + return newInfo; + } + private assertFontkit(): Fontkit { if (!this.fontkit) throw new FontkitNotRegisteredError(); return this.fontkit; diff --git a/src/api/PDFPage.ts b/src/api/PDFPage.ts index a4dcde48f..b1282e26d 100644 --- a/src/api/PDFPage.ts +++ b/src/api/PDFPage.ts @@ -487,6 +487,7 @@ export default class PDFPage { /** * Change the default position of this page to be further right on the y-axis. 
+ * For example: * ```js * page.moveTo(50, 50) * page.drawText('I will be drawn at (50, 50)') diff --git a/src/core/embedders/CMap.ts b/src/core/embedders/CMap.ts index d53118c7e..d63f67c32 100644 --- a/src/core/embedders/CMap.ts +++ b/src/core/embedders/CMap.ts @@ -1,6 +1,12 @@ import { Glyph } from 'src/types/fontkit'; -import { toHexStringOfMinLength } from 'src/utils'; +import { toHexString, toHexStringOfMinLength } from 'src/utils'; +import { + hasSurrogates, + highSurrogate, + isWithinBMP, + lowSurrogate, +} from 'src/utils/unicode'; /** [[start, end], mappings] */ type BfRange = [[string, string], string[]]; @@ -76,33 +82,15 @@ const cmapHexFormat = (...values: string[]) => `<${values.join('')}>`; const cmapHexString = (value: number) => toHexStringOfMinLength(value, 4); const cmapCodePointFormat = (codePoint: number) => { - if (isUtf8CodePoint(codePoint)) return cmapHexString(codePoint); + if (isWithinBMP(codePoint)) return cmapHexString(codePoint); - if (isUtf16CodePoint(codePoint)) { + if (hasSurrogates(codePoint)) { const hs = highSurrogate(codePoint); const ls = lowSurrogate(codePoint); return `${cmapHexString(hs)}${cmapHexString(ls)}`; } - const hex = codePoint.toString(16); + const hex = toHexString(codePoint); const msg = `0x${hex} is not a valid UTF-8 or UTF-16 codepoint.`; throw new Error(msg); }; - -// From: https://en.wikipedia.org/wiki/UTF-16#Description -const isUtf8CodePoint = (codePoint: number) => - codePoint >= 0 && codePoint <= 0xffff; - -// From: https://en.wikipedia.org/wiki/UTF-16#Description -const isUtf16CodePoint = (codePoint: number) => - codePoint >= 0x010000 && codePoint <= 0x10ffff; - -// From Unicode 3.0 spec, section 3.7: -// http://unicode.org/versions/Unicode3.0.0/ch03.pdf -const highSurrogate = (codePoint: number) => - Math.floor((codePoint - 0x10000) / 0x400) + 0xd800; - -// From Unicode 3.0 spec, section 3.7: -// http://unicode.org/versions/Unicode3.0.0/ch03.pdf -const lowSurrogate = (codePoint: number) => - ((codePoint - 
0x10000) % 0x400) + 0xdc00; diff --git a/src/core/objects/PDFHexString.ts b/src/core/objects/PDFHexString.ts index df3a3fba8..9b604612f 100644 --- a/src/core/objects/PDFHexString.ts +++ b/src/core/objects/PDFHexString.ts @@ -1,10 +1,25 @@ import PDFObject from 'src/core/objects/PDFObject'; import CharCodes from 'src/core/syntax/CharCodes'; -import { copyStringIntoBuffer } from 'src/utils'; +import { + copyStringIntoBuffer, + toHexStringOfMinLength, + utf16Encode, +} from 'src/utils'; class PDFHexString extends PDFObject { static of = (value: string) => new PDFHexString(value); + static fromText = (value: string) => { + const encoded = utf16Encode(value); + + let hex = ''; + for (let idx = 0, len = encoded.length; idx < len; idx++) { + hex += toHexStringOfMinLength(encoded[idx], 4); + } + + return new PDFHexString(hex); + }; + private readonly value: string; constructor(value: string) { diff --git a/src/core/objects/PDFString.ts b/src/core/objects/PDFString.ts index 79bd0a811..ffc51916e 100644 --- a/src/core/objects/PDFString.ts +++ b/src/core/objects/PDFString.ts @@ -1,6 +1,6 @@ import PDFObject from 'src/core/objects/PDFObject'; import CharCodes from 'src/core/syntax/CharCodes'; -import { copyStringIntoBuffer } from 'src/utils'; +import { copyStringIntoBuffer, padStart } from 'src/utils'; class PDFString extends PDFObject { // The PDF spec allows newlines and parens to appear directly within a literal @@ -8,6 +8,16 @@ class PDFString extends PDFObject { // for simplicity, we will not bother escaping them. 
static of = (value: string) => new PDFString(value); + static fromDate = (date: Date) => { + const year = padStart(String(date.getUTCFullYear()), 4, '0'); + const month = padStart(String(date.getUTCMonth() + 1), 2, '0'); + const day = padStart(String(date.getUTCDate()), 2, '0'); + const hours = padStart(String(date.getUTCHours()), 2, '0'); + const mins = padStart(String(date.getUTCMinutes()), 2, '0'); + const secs = padStart(String(date.getUTCSeconds()), 2, '0'); + return new PDFString(`D:${year}${month}${day}${hours}${mins}${secs}Z`); + }; + private readonly value: string; private constructor(value: string) { diff --git a/src/utils/index.ts b/src/utils/index.ts index b2b25bd5d..9f6f8c4c1 100644 --- a/src/utils/index.ts +++ b/src/utils/index.ts @@ -1,6 +1,7 @@ export * from 'src/utils/arrays'; export * from 'src/utils/async'; export * from 'src/utils/strings'; +export * from 'src/utils/unicode'; export * from 'src/utils/numbers'; export * from 'src/utils/errors'; export * from 'src/utils/base64'; diff --git a/src/utils/unicode.ts b/src/utils/unicode.ts new file mode 100644 index 000000000..5e9689870 --- /dev/null +++ b/src/utils/unicode.ts @@ -0,0 +1,262 @@ +import { toHexString } from 'src/utils/strings'; + +/** + * Encodes a string to UTF-8. + * + * @param input The string to be encoded. + * @param byteOrderMark Whether or not a byte order marker (BOM) should be added + * to the start of the encoding. (default `true`) + * @returns A Uint8Array containing the UTF-8 encoding of the input string. + * + * ----------------------------------------------------------------------------- + * + * JavaScript strings are composed of Unicode code points. Code points are + * integers in the range 0 to 1,114,111 (0x10FFFF). When serializing a string, + * it must be encoded as a sequence of words. A word is typically 8, 16, or 32 + * bits in size. As such, Unicode defines three encoding forms: UTF-8, UTF-16, + * and UTF-32.
These encoding forms are described in the Unicode standard [1]. + * This function implements the UTF-8 encoding form. + * + * ----------------------------------------------------------------------------- + * + * In UTF-8, each code point is mapped to a sequence of 1, 2, 3, or 4 bytes. + * Note that the logic which defines this mapping is slightly convoluted, and + * not as straightforward as the mapping logic for UTF-16 or UTF-32. The UTF-8 + * mapping logic is as follows [2]: + * + * • If a code point is in the range U+0000..U+007F, then view it as a 7-bit + * integer: 0bxxxxxxx. Map the code point to 1 byte with the first high order + * bit set to 0: + * + * b1=0b0xxxxxxx + * + * • If a code point is in the range U+0080..U+07FF, then view it as an 11-bit + * integer: 0byyyyyxxxxxx. Map the code point to 2 bytes with the first 5 bits + * of the code point stored in the first byte, and the last 6 bits stored in + * the second byte: + * + * b1=0b110yyyyy b2=0b10xxxxxx + * + * • If a code point is in the range U+0800..U+FFFF, then view it as a 16-bit + * integer, 0bzzzzyyyyyyxxxxxx. Map the code point to 3 bytes with the first + * 4 bits stored in the first byte, the next 6 bits stored in the second byte, + * and the last 6 bits in the third byte: + * + * b1=0b1110zzzz b2=0b10yyyyyy b3=0b10xxxxxx + * + * • If a code point is in the range U+10000..U+10FFFF, then view it as a + * 21-bit integer, 0bvvvzzzzzzyyyyyyxxxxxx.
Map the code point to 4 bytes with + * the first 3 bits stored in the first byte, the next 6 bits stored in the + * second byte, the next 6 bits stored in the third byte, and the last 6 bits + * stored in the fourth byte: + * + * b1=0b11110vvv b2=0b10zzzzzz b3=0b10yyyyyy b4=0b10xxxxxx + * + * ----------------------------------------------------------------------------- + * + * It is important to note, when iterating through the code points of a string + * in JavaScript, that if a character is encoded as a surrogate pair it will + * increase the string's length by 2 instead of 1 [4]. For example: + * + * ``` + * > 'a'.length + * 1 + * > '💩'.length + * 2 + * > '語'.length + * 1 + * > 'a💩語'.length + * 4 + * ``` + * + * The results of the above example are explained by the fact that the + * characters 'a' and '語' are not represented by surrogate pairs, but '💩' is. + * + * Because of this idiosyncrasy in JavaScript's string implementation and APIs, + * we must "jump" an extra index after encoding a character as a surrogate + * pair. In practice, this means we must increment the index of our for loop by + * 2 if we encode a surrogate pair, and 1 in all other cases.
+ * + * ----------------------------------------------------------------------------- + * + * References: + * - [1] https://www.unicode.org/versions/Unicode12.0.0/UnicodeStandard-12.0.pdf + * 3.9 Unicode Encoding Forms - UTF-8 + * - [2] http://www.herongyang.com/Unicode/UTF-8-UTF-8-Encoding.html + * - [3] http://www.herongyang.com/Unicode/UTF-8-UTF-8-Encoding-Algorithm.html + * - [4] https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/String/length#Description + * + */ +export const utf8Encode = (input: string, byteOrderMark = true): Uint8Array => { + const encoded = []; + + if (byteOrderMark) encoded.push(0xef, 0xbb, 0xbf); + + for (let idx = 0, len = input.length; idx < len; ) { + const codePoint = input.codePointAt(idx)!; + + // One byte encoding + if (codePoint < 0x80) { + const byte1 = codePoint & 0x7f; + encoded.push(byte1); + idx += 1; + } + + // Two byte encoding + else if (codePoint < 0x0800) { + const byte1 = ((codePoint >> 6) & 0x1f) | 0xc0; + const byte2 = (codePoint & 0x3f) | 0x80; + encoded.push(byte1, byte2); + idx += 1; + } + + // Three byte encoding + else if (codePoint < 0x010000) { + const byte1 = ((codePoint >> 12) & 0x0f) | 0xe0; + const byte2 = ((codePoint >> 6) & 0x3f) | 0x80; + const byte3 = (codePoint & 0x3f) | 0x80; + encoded.push(byte1, byte2, byte3); + idx += 1; + } + + // Four byte encoding (surrogate pair) + else if (codePoint < 0x110000) { + const byte1 = ((codePoint >> 18) & 0x07) | 0xf0; + const byte2 = ((codePoint >> 12) & 0x3f) | 0x80; + const byte3 = ((codePoint >> 6) & 0x3f) | 0x80; + const byte4 = ((codePoint >> 0) & 0x3f) | 0x80; + encoded.push(byte1, byte2, byte3, byte4); + idx += 2; + } + + // Should never reach this case + else throw new Error(`Invalid code point: 0x${toHexString(codePoint)}`); + } + + return new Uint8Array(encoded); +}; + +/** + * Encodes a string to UTF-16. + * + * @param input The string to be encoded. 
+ * @param byteOrderMark Whether or not a byte order marker (BOM) should be added + * to the start of the encoding. (default `true`) + * @returns A Uint16Array containing the UTF-16 encoding of the input string. + * + * ----------------------------------------------------------------------------- + * + * JavaScript strings are composed of Unicode code points. Code points are + * integers in the range 0 to 1,114,111 (0x10FFFF). When serializing a string, + * it must be encoded as a sequence of words. A word is typically 8, 16, or 32 + * bits in size. As such, Unicode defines three encoding forms: UTF-8, UTF-16, + * and UTF-32. These encoding forms are described in the Unicode standard [1]. + * This function implements the UTF-16 encoding form. + * + * ----------------------------------------------------------------------------- + * + * In UTF-16, each code point is mapped to one or two 16-bit integers. The + * UTF-16 mapping logic is as follows [2]: + * + * • If a code point is in the range U+0000..U+FFFF, then map the code point to + * a 16-bit integer with the most significant byte first. + * + * • If a code point is in the range U+10000..U+10FFFF, then map the code point + * to two 16-bit integers. The first integer should contain the high surrogate + * and the second integer should contain the low surrogate. Both surrogates + * should be written with the most significant byte first. + * + * ----------------------------------------------------------------------------- + * + * It is important to note, when iterating through the code points of a string + * in JavaScript, that if a character is encoded as a surrogate pair it will + * increase the string's length by 2 instead of 1 [3]. For example: + * + * ``` + * > 'a'.length + * 1 + * > '💩'.length + * 2 + * > '語'.length + * 1 + * > 'a💩語'.length + * 4 + * ``` + * + * The results of the above example are explained by the fact that the + * characters 'a' and '語' are not represented by surrogate pairs, but '💩' is.
+ * + * Because of this idiosyncrasy in JavaScript's string implementation and APIs, + * we must "jump" an extra index after encoding a character as a surrogate + * pair. In practice, this means we must increment the index of our for loop by + * 2 if we encode a surrogate pair, and 1 in all other cases. + * + * ----------------------------------------------------------------------------- + * + * References: + * - [1] https://www.unicode.org/versions/Unicode12.0.0/UnicodeStandard-12.0.pdf + * 3.9 Unicode Encoding Forms - UTF-16 + * - [2] http://www.herongyang.com/Unicode/UTF-16-UTF-16-Encoding.html + * - [3] https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/String/length#Description + * + */ +export const utf16Encode = ( + input: string, + byteOrderMark = true, +): Uint16Array => { + const encoded = []; + + if (byteOrderMark) encoded.push(0xfeff); + + for (let idx = 0, len = input.length; idx < len; ) { + const codePoint = input.codePointAt(idx)!; + + // Two byte encoding + if (codePoint < 0x010000) { + encoded.push(codePoint); + idx += 1; + } + + // Four byte encoding (surrogate pair) + else if (codePoint < 0x110000) { + encoded.push(highSurrogate(codePoint), lowSurrogate(codePoint)); + idx += 2; + } + + // Should never reach this case + else throw new Error(`Invalid code point: 0x${toHexString(codePoint)}`); + } + + return new Uint16Array(encoded); +}; + +/** + * Returns `true` if the `codePoint` is within the + * Basic Multilingual Plane (BMP). Code points inside the BMP are not encoded + * with surrogate pairs. + * @param codePoint The code point to be evaluated. + * + * Reference: https://en.wikipedia.org/wiki/UTF-16#Description + */ +export const isWithinBMP = (codePoint: number) => + codePoint >= 0 && codePoint <= 0xffff; + +/** + * Returns `true` if the given `codePoint` is valid and must be represented + * with a surrogate pair when encoded. + * @param codePoint The code point to be evaluated.
+ * + * Reference: https://en.wikipedia.org/wiki/UTF-16#Description + */ +export const hasSurrogates = (codePoint: number) => + codePoint >= 0x010000 && codePoint <= 0x10ffff; + +// From Unicode 3.0 spec, section 3.7: +// http://unicode.org/versions/Unicode3.0.0/ch03.pdf +export const highSurrogate = (codePoint: number) => + Math.floor((codePoint - 0x10000) / 0x400) + 0xd800; + +// From Unicode 3.0 spec, section 3.7: +// http://unicode.org/versions/Unicode3.0.0/ch03.pdf +export const lowSurrogate = (codePoint: number) => + ((codePoint - 0x10000) % 0x400) + 0xdc00; diff --git a/tests/core/objects/PDFHexString.spec.ts b/tests/core/objects/PDFHexString.spec.ts index 7edd2b910..26609b493 100644 --- a/tests/core/objects/PDFHexString.spec.ts +++ b/tests/core/objects/PDFHexString.spec.ts @@ -8,6 +8,16 @@ describe(`PDFHexString`, () => { expect(PDFHexString.of('901FA')).toBeInstanceOf(PDFHexString); }); + it(`can be constructed from a string of text (using UTF-16BE encoding)`, () => { + expect(String(PDFHexString.fromText(''))).toBe(''); + expect(String(PDFHexString.fromText('ä☺𠜎️☁️💩'))).toBe( + '', + ); + expect(String(PDFHexString.fromText('stuff 💩 and 🎂things'))).toBe( + '', + ); + }); + it(`can be cloned`, () => { const original = PDFHexString.of('901FA'); const clone = original.clone(); diff --git a/tests/core/objects/PDFString.spec.ts b/tests/core/objects/PDFString.spec.ts index 8661d1959..4a62d7892 100644 --- a/tests/core/objects/PDFString.spec.ts +++ b/tests/core/objects/PDFString.spec.ts @@ -8,6 +8,14 @@ describe(`PDFString`, () => { expect(PDFString.of(')b\\a/z(')).toBeInstanceOf(PDFString); }); + it(`can be constructed from a Date object`, () => { + const date1 = new Date('2018-06-24T01:58:37.228Z'); + expect(String(PDFString.fromDate(date1))).toBe('(D:20180624015837Z)'); + + const date2 = new Date('2019-12-21T07:00:11.000Z'); + expect(String(PDFString.fromDate(date2))).toBe('(D:20191221070011Z)'); + }); + it(`can be cloned`, () => { const original = 
PDFString.of(')b\\a/z('); const clone = original.clone(); diff --git a/tests/utils/unicode.spec.ts b/tests/utils/unicode.spec.ts new file mode 100644 index 000000000..ef2c829ea --- /dev/null +++ b/tests/utils/unicode.spec.ts @@ -0,0 +1,162 @@ +import { mergeIntoTypedArray, utf16Encode, utf8Encode } from 'src/utils'; + +const utf8BOM = new Uint8Array([0xef, 0xbb, 0xbf]); +const utf16BOM = new Uint16Array([0xfeff]); + +const withUtf8Bom = (encoding: Uint8Array) => + mergeIntoTypedArray(utf8BOM, encoding); + +const withUtf16Bom = (encoding: Uint16Array) => + new Uint16Array([...Array.from(utf16BOM), ...Array.from(encoding)]); + +describe(`utf8Encode`, () => { + it(`encodes to UTF-8`, () => { + const input = '\u{004D}\u{0430}\u{4E8C}\u{10302}'; + + // prettier-ignore + const expected = new Uint8Array([ + /* U+004D */ 0x4d, + /* U+0430 */ 0xd0, 0xb0, + /* U+4E8C */ 0xe4, 0xba, 0x8c, + /* U+10302 */ 0xf0, 0x90, 0x8c, 0x82, + ]); + + const actual = utf8Encode(input); + + expect(actual).toEqual(withUtf8Bom(expected)); + }); + + it(`encodes to UTF-8`, () => { + const input = '\u{004D}\u{0061}\u{10000}'; + + // prettier-ignore + const expected = new Uint8Array([ + /* U+004D */ 0x4d, + /* U+0061 */ 0x61, + /* U+10000 */ 0xf0, 0x90, 0x80, 0x80, + ]); + + const actual = utf8Encode(input); + + expect(actual).toEqual(withUtf8Bom(expected)); + }); + + it(`encodes to UTF-8 (without a BOM)`, () => { + const input = '💩🎂'; + + // prettier-ignore + const expected = new Uint8Array([ + /* U+1F4A9 */ 0xf0, 0x9f, 0x92, 0xa9, + /* U+1F382 */ 0xf0, 0x9f, 0x8e, 0x82, + ]); + + const actual = utf8Encode(input, false); + + expect(actual).toEqual(expected); + }); + + it(`encodes "Дмитрий Козлюк (Dmitry Kozlyuk)" to UTF-8`, () => { + const input = 'Дмитрий Козлюк (Dmitry Kozlyuk)'; + + // prettier-ignore + const expected = new Uint8Array([ + 0xd0, 0x94, 0xd0, 0xbc, 0xd0, 0xb8, 0xd1, 0x82, 0xd1, 0x80, 0xd0, 0xb8, + 0xd0, 0xb9, 0x20, 0xd0, 0x9a, 0xd0, 0xbe, 0xd0, 0xb7, 0xd0, 0xbb, 0xd1, + 0x8e, 
0xd0, 0xba, 0x20, 0x28, 0x44, 0x6d, 0x69, 0x74, 0x72, 0x79, 0x20, + 0x4b, 0x6f, 0x7a, 0x6c, 0x79, 0x75, 0x6b, 0x29, + ]); + + const actual = utf8Encode(input); + + expect(actual).toEqual(withUtf8Bom(expected)); + }); + + it(`encodes "ä☺𠜎️☁️" to UTF-8 (without a BOM)`, () => { + const input = 'ä☺𠜎️☁️'; + + // prettier-ignore + const expected = new Uint8Array([ + 0xc3, 0xa4, 0xe2, 0x98, 0xba, 0xf0, 0xa0, 0x9c, 0x8e, 0xef, 0xb8, 0x8f, + 0xe2, 0x98, 0x81, 0xef, 0xb8, 0x8f, + ]); + + const actual = utf8Encode(input, false); + + expect(actual).toEqual(expected); + }); +}); + +describe(`utf16Encode`, () => { + it(`encodes to UTF-16`, () => { + const input = '\u{004D}\u{0430}\u{4E8C}\u{10302}'; + + // prettier-ignore + const expected = new Uint16Array(new Uint8Array([ + /* U+004D */ 0x4d, 0x00, + /* U+0430 */ 0x30, 0x04, + /* U+4E8C */ 0x8c, 0x4e, + /* U+10302 */ 0x00, 0xd8, 0x02, 0xdf, + ]).buffer); + + const actual = utf16Encode(input); + + expect(actual).toEqual(withUtf16Bom(expected)); + }); + + it(`encodes to UTF-16`, () => { + const input = '\u{004D}\u{0061}\u{10000}'; + + // prettier-ignore + const expected = new Uint16Array(new Uint8Array([ + /* U+004D */ 0x4d, 0x00, + /* U+0061 */ 0x61, 0x00, + /* U+10000 */ 0x00, 0xd8, 0x00, 0xdc, + ]).buffer); + + const actual = utf16Encode(input); + + expect(actual).toEqual(withUtf16Bom(expected)); + }); + + it(`encodes to UTF-16 (without a BOM)`, () => { + const input = '💩🎂'; + + // prettier-ignore + const expected = new Uint16Array(new Uint8Array([ + /* U+1F4A9 */ 0x3d, 0xd8, 0xa9, 0xdc, + /* U+1F382 */ 0x3c, 0xd8, 0x82, 0xdf, + ]).buffer); + + const actual = utf16Encode(input, false); + + expect(actual).toEqual(expected); + }); + + it(`encodes "Дмитрий Козлюк (Dmitry Kozlyuk)" to UTF-16`, () => { + const input = 'Дмитрий Козлюк (Dmitry Kozlyuk)'; + + // prettier-ignore + const expected = new Uint16Array([ + 0x414, 0x43c, 0x438, 0x442, 0x440, 0x438, 0x439, 0x20, 0x41a, 0x43e, + 0x437, 0x43b, 0x44e, 0x43a, 0x20, 0x28, 0x44, 
0x6d, 0x69, 0x74, 0x72, + 0x79, 0x20, 0x4b, 0x6f, 0x7a, 0x6c, 0x79, 0x75, 0x6b, 0x29, + ]); + + const actual = utf16Encode(input); + + expect(actual).toEqual(withUtf16Bom(expected)); + }); + + it(`encodes "ä☺𠜎️☁️" to UTF-16 (without a BOM)`, () => { + const input = 'ä☺𠜎️☁️'; + + // prettier-ignore + const expected = new Uint16Array([ + 0xe4, 0x263a, 55361, 57102, 0xfe0f, 0x2601, 0xfe0f, + ]); + + const actual = utf16Encode(input, false); + + expect(actual).toEqual(expected); + }); +});