diff --git a/src/core/chunked_stream.js b/src/core/chunked_stream.js index efd6732167f16..c824ce74a04f0 100644 --- a/src/core/chunked_stream.js +++ b/src/core/chunked_stream.js @@ -12,6 +12,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +/* eslint no-var: error */ import { arrayByteLength, arraysToBytes, createPromiseCapability, isEmptyObj, diff --git a/src/core/document.js b/src/core/document.js index bac2e281b6c8e..8d60d1a7e3c44 100644 --- a/src/core/document.js +++ b/src/core/document.js @@ -12,6 +12,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +/* eslint no-var: error */ import { assert, FormatError, getInheritableProperty, info, isArrayBuffer, isBool, @@ -28,18 +29,17 @@ import { OperatorList } from './operator_list'; import { PartialEvaluator } from './evaluator'; import { PDFFunctionFactory } from './function'; -var Page = (function PageClosure() { +const DEFAULT_USER_UNIT = 1.0; +const LETTER_SIZE_MEDIABOX = [0, 0, 612, 792]; - var DEFAULT_USER_UNIT = 1.0; - var LETTER_SIZE_MEDIABOX = [0, 0, 612, 792]; +function isAnnotationRenderable(annotation, intent) { + return (intent === 'display' && annotation.viewable) || + (intent === 'print' && annotation.printable); +} - function isAnnotationRenderable(annotation, intent) { - return (intent === 'display' && annotation.viewable) || - (intent === 'print' && annotation.printable); - } - - function Page({ pdfManager, xref, pageIndex, pageDict, ref, fontCache, - builtInCMapCache, pdfFunctionFactory, }) { +class Page { + constructor({ pdfManager, xref, pageIndex, pageDict, ref, fontCache, + builtInCMapCache, pdfFunctionFactory, }) { this.pdfManager = pdfManager; this.pageIndex = pageIndex; this.pageDict = pageDict; @@ -51,8 +51,8 @@ var Page = (function PageClosure() { this.evaluatorOptions = pdfManager.evaluatorOptions; this.resourcesPromise = null; - var uniquePrefix = 'p' + this.pageIndex + '_'; - var idCounters = { + const uniquePrefix = `p${this.pageIndex}_`; + const idCounters = { obj: 0, }; this.idFactory = { @@ -62,138 +62,211 @@ var Page = (function PageClosure() { }; } - Page.prototype = { - /** - * @private - */ - _getInheritableProperty(key, getArray = false) { - let value = getInheritableProperty({ dict: this.pageDict, key, getArray, + /** + * @private + */ + _getInheritableProperty(key, getArray = false) { + const value = getInheritableProperty({ dict: this.pageDict, key, getArray, stopWhenFound: false, }); - if (!Array.isArray(value)) { - return value; - } - if (value.length === 1 || !isDict(value[0])) { - return value[0]; - } - return Dict.merge(this.xref, value); - }, - - get content() { - return this.pageDict.get('Contents'); - }, - - get resources() { - // For robustness: The spec states that a \Resources entry has to be - // present, but can be empty. Some document omit it still, in this case - // we return an empty dictionary. - return shadow(this, 'resources', - this._getInheritableProperty('Resources') || Dict.empty); - }, - - get mediaBox() { - var mediaBox = this._getInheritableProperty('MediaBox', + if (!Array.isArray(value)) { + return value; + } + if (value.length === 1 || !isDict(value[0])) { + return value[0]; + } + return Dict.merge(this.xref, value); + } + + get content() { + return this.pageDict.get('Contents'); + } + + get resources() { + // For robustness: The spec states that a \Resources entry has to be + // present, but can be empty. Some documents still omit it; in this case + // we return an empty dictionary. + return shadow(this, 'resources', + this._getInheritableProperty('Resources') || Dict.empty); + } + + get mediaBox() { + const mediaBox = this._getInheritableProperty('MediaBox', /* getArray = */ true); - // Reset invalid media box to letter size. - if (!Array.isArray(mediaBox) || mediaBox.length !== 4) { - return shadow(this, 'mediaBox', LETTER_SIZE_MEDIABOX); - } - return shadow(this, 'mediaBox', mediaBox); - }, + // Reset invalid media box to letter size. + if (!Array.isArray(mediaBox) || mediaBox.length !== 4) { + return shadow(this, 'mediaBox', LETTER_SIZE_MEDIABOX); + } + return shadow(this, 'mediaBox', mediaBox); + } - get cropBox() { - var cropBox = this._getInheritableProperty('CropBox', + get cropBox() { + const cropBox = this._getInheritableProperty('CropBox', /* getArray = */ true); - // Reset invalid crop box to media box. - if (!Array.isArray(cropBox) || cropBox.length !== 4) { - return shadow(this, 'cropBox', this.mediaBox); - } - return shadow(this, 'cropBox', cropBox); - }, + // Reset invalid crop box to media box. + if (!Array.isArray(cropBox) || cropBox.length !== 4) { + return shadow(this, 'cropBox', this.mediaBox); + } + return shadow(this, 'cropBox', cropBox); + } - get userUnit() { - var obj = this.pageDict.get('UserUnit'); - if (!isNum(obj) || obj <= 0) { - obj = DEFAULT_USER_UNIT; - } - return shadow(this, 'userUnit', obj); - }, - - get view() { - // From the spec, 6th ed., p.963: - // "The crop, bleed, trim, and art boxes should not ordinarily - // extend beyond the boundaries of the media box. If they do, they are - // effectively reduced to their intersection with the media box." - var mediaBox = this.mediaBox, cropBox = this.cropBox; - if (mediaBox === cropBox) { - return shadow(this, 'view', mediaBox); - } - var intersection = Util.intersect(cropBox, mediaBox); - return shadow(this, 'view', intersection || mediaBox); - }, - - get rotate() { - var rotate = this._getInheritableProperty('Rotate') || 0; - // Normalize rotation so it's a multiple of 90 and between 0 and 270 - if (rotate % 90 !== 0) { - rotate = 0; - } else if (rotate >= 360) { - rotate = rotate % 360; - } else if (rotate < 0) { - // The spec doesn't cover negatives, assume its counterclockwise - // rotation. The following is the other implementation of modulo. - rotate = ((rotate % 360) + 360) % 360; + get userUnit() { + let obj = this.pageDict.get('UserUnit'); + if (!isNum(obj) || obj <= 0) { + obj = DEFAULT_USER_UNIT; + } + return shadow(this, 'userUnit', obj); + } + + get view() { + // From the spec, 6th ed., p.963: + // "The crop, bleed, trim, and art boxes should not ordinarily + // extend beyond the boundaries of the media box. If they do, they are + // effectively reduced to their intersection with the media box." + const mediaBox = this.mediaBox, cropBox = this.cropBox; + if (mediaBox === cropBox) { + return shadow(this, 'view', mediaBox); + } + + const intersection = Util.intersect(cropBox, mediaBox); + return shadow(this, 'view', intersection || mediaBox); + } + + get rotate() { + let rotate = this._getInheritableProperty('Rotate') || 0; + + // Normalize rotation so it's a multiple of 90 and between 0 and 270. + if (rotate % 90 !== 0) { + rotate = 0; + } else if (rotate >= 360) { + rotate = rotate % 360; + } else if (rotate < 0) { + // The spec doesn't cover negatives. Assume it's counterclockwise + // rotation. The following is the other implementation of modulo. + rotate = ((rotate % 360) + 360) % 360; + } + return shadow(this, 'rotate', rotate); + } + + getContentStream() { + const content = this.content; + let stream; + + if (Array.isArray(content)) { + // Fetching the individual streams from the array. + const xref = this.xref; + const streams = []; + for (const stream of content) { + streams.push(xref.fetchIfRef(stream)); } - return shadow(this, 'rotate', rotate); - }, - - getContentStream: function Page_getContentStream() { - var content = this.content; - var stream; - if (Array.isArray(content)) { - // fetching items - var xref = this.xref; - var i, n = content.length; - var streams = []; - for (i = 0; i < n; ++i) { - streams.push(xref.fetchIfRef(content[i])); - } - stream = new StreamsSequenceStream(streams); - } else if (isStream(content)) { - stream = content; - } else { - // replacing non-existent page content with empty one - stream = new NullStream(); + stream = new StreamsSequenceStream(streams); + } else if (isStream(content)) { + stream = content; + } else { + // Replace non-existent page content with empty content. + stream = new NullStream(); + } + return stream; + } + + loadResources(keys) { + if (!this.resourcesPromise) { + // TODO: add async `_getInheritableProperty` and remove this. + this.resourcesPromise = this.pdfManager.ensure(this, 'resources'); + } + return this.resourcesPromise.then(() => { + const objectLoader = new ObjectLoader(this.resources, keys, this.xref); + return objectLoader.load(); + }); + } + + getOperatorList({ handler, task, intent, renderInteractiveForms, }) { + const contentStreamPromise = this.pdfManager.ensure(this, + 'getContentStream'); + const resourcesPromise = this.loadResources([ + 'ExtGState', + 'ColorSpace', + 'Pattern', + 'Shading', + 'XObject', + 'Font', + ]); + + const partialEvaluator = new PartialEvaluator({ + pdfManager: this.pdfManager, + xref: this.xref, + handler, + pageIndex: this.pageIndex, + idFactory: this.idFactory, + fontCache: this.fontCache, + builtInCMapCache: this.builtInCMapCache, + options: this.evaluatorOptions, + pdfFunctionFactory: this.pdfFunctionFactory, + }); + + const dataPromises = Promise.all([contentStreamPromise, resourcesPromise]); + const pageListPromise = dataPromises.then(([contentStream]) => { + const opList = new OperatorList(intent, handler, this.pageIndex); + + handler.send('StartRenderPage', { + transparency: partialEvaluator.hasBlendModes(this.resources), + pageIndex: this.pageIndex, + intent, + }); + + return partialEvaluator.getOperatorList({ + stream: contentStream, + task, + resources: this.resources, + operatorList: opList, + }).then(function() { + return opList; + }); + }); + + // Fetch the page's annotations and add their operator lists to the + // page's operator list to render them. + return Promise.all([pageListPromise, this._parsedAnnotations]).then( + function([pageOpList, annotations]) { + if (annotations.length === 0) { + pageOpList.flush(true); + return pageOpList; } - return stream; - }, - loadResources: function Page_loadResources(keys) { - if (!this.resourcesPromise) { - // TODO: add async `_getInheritableProperty` and remove this. - this.resourcesPromise = this.pdfManager.ensure(this, 'resources'); + // Collect the operator list promises for the annotations. Each promise + // is resolved with the complete operator list for a single annotation. + const opListPromises = []; + for (const annotation of annotations) { + if (isAnnotationRenderable(annotation, intent)) { + opListPromises.push(annotation.getOperatorList( + partialEvaluator, task, renderInteractiveForms)); + } } - return this.resourcesPromise.then(() => { - let objectLoader = new ObjectLoader(this.resources, keys, this.xref); - return objectLoader.load(); + return Promise.all(opListPromises).then(function(opLists) { + pageOpList.addOp(OPS.beginAnnotations, []); + for (const opList of opLists) { + pageOpList.addOpList(opList); + } + pageOpList.addOp(OPS.endAnnotations, []); + pageOpList.flush(true); + return pageOpList; }); - }, + }); + } - getOperatorList({ handler, task, intent, renderInteractiveForms, }) { - var contentStreamPromise = this.pdfManager.ensure(this, + extractTextContent({ handler, task, normalizeWhitespace, sink, + combineTextItems, }) { + const contentStreamPromise = this.pdfManager.ensure(this, 'getContentStream'); - var resourcesPromise = this.loadResources([ - 'ExtGState', - 'ColorSpace', - 'Pattern', - 'Shading', - 'XObject', - 'Font' - // ProcSet - // Properties - ]); - - var partialEvaluator = new PartialEvaluator({ + const resourcesPromise = this.loadResources([ + 'ExtGState', + 'XObject', + 'Font', + ]); + + const dataPromises = Promise.all([contentStreamPromise, resourcesPromise]); + return dataPromises.then(([contentStream]) => { + const partialEvaluator = new PartialEvaluator({ pdfManager: this.pdfManager, xref: this.xref, handler, @@ -205,150 +278,93 @@ var Page = (function PageClosure() { pdfFunctionFactory: this.pdfFunctionFactory, }); - var dataPromises = Promise.all([contentStreamPromise, resourcesPromise]); - var pageListPromise = dataPromises.then(([contentStream]) => { - var opList = new OperatorList(intent, handler, this.pageIndex); - - handler.send('StartRenderPage', { - transparency: partialEvaluator.hasBlendModes(this.resources), - pageIndex: this.pageIndex, - intent, - }); - return partialEvaluator.getOperatorList({ - stream: contentStream, - task, - resources: this.resources, - operatorList: opList, - }).then(function () { - return opList; - }); + return partialEvaluator.getTextContent({ + stream: contentStream, + task, + resources: this.resources, + normalizeWhitespace, + combineTextItems, + sink, }); + }); + } - // Fetch the page's annotations and add their operator lists to the - // page's operator list to render them. - return Promise.all([pageListPromise, this._parsedAnnotations]).then( - function ([pageOpList, annotations]) { - if (annotations.length === 0) { - pageOpList.flush(true); - return pageOpList; + getAnnotationsData(intent) { + return this._parsedAnnotations.then(function(annotations) { + const annotationsData = []; + for (let i = 0, ii = annotations.length; i < ii; i++) { + if (!intent || isAnnotationRenderable(annotations[i], intent)) { + annotationsData.push(annotations[i].data); } + } + return annotationsData; + }); + } - // Collect the operator list promises for the annotations. Each promise - // is resolved with the complete operator list for a single annotation. - var i, ii, opListPromises = []; - for (i = 0, ii = annotations.length; i < ii; i++) { - if (isAnnotationRenderable(annotations[i], intent)) { - opListPromises.push(annotations[i].getOperatorList( - partialEvaluator, task, renderInteractiveForms)); - } - } + get annotations() { + return shadow(this, 'annotations', + this._getInheritableProperty('Annots') || []); + } - return Promise.all(opListPromises).then(function(opLists) { - pageOpList.addOp(OPS.beginAnnotations, []); - for (i = 0, ii = opLists.length; i < ii; i++) { - pageOpList.addOpList(opLists[i]); - } - pageOpList.addOp(OPS.endAnnotations, []); + get _parsedAnnotations() { + const parsedAnnotations = + this.pdfManager.ensure(this, 'annotations').then(() => { + const annotationRefs = this.annotations; + const annotationPromises = []; + for (let i = 0, ii = annotationRefs.length; i < ii; i++) { + annotationPromises.push(AnnotationFactory.create( + this.xref, annotationRefs[i], this.pdfManager, this.idFactory)); + } - pageOpList.flush(true); - return pageOpList; + return Promise.all(annotationPromises).then(function(annotations) { + return annotations.filter(function isDefined(annotation) { + return !!annotation; + }); + }, function(reason) { + warn(`_parsedAnnotations: "${reason}".`); + return []; }); }); - }, - extractTextContent({ handler, task, normalizeWhitespace, - sink, combineTextItems, }) { - var contentStreamPromise = this.pdfManager.ensure(this, - 'getContentStream'); - var resourcesPromise = this.loadResources([ - 'ExtGState', - 'XObject', - 'Font' - ]); - - var dataPromises = Promise.all([contentStreamPromise, resourcesPromise]); - return dataPromises.then(([contentStream]) => { - var partialEvaluator = new PartialEvaluator({ - pdfManager: this.pdfManager, - xref: this.xref, - handler, - pageIndex: this.pageIndex, - idFactory: this.idFactory, - fontCache: this.fontCache, - builtInCMapCache: this.builtInCMapCache, - options: this.evaluatorOptions, - pdfFunctionFactory: this.pdfFunctionFactory, - }); + return shadow(this, '_parsedAnnotations', parsedAnnotations); + } +} - return partialEvaluator.getTextContent({ - stream: contentStream, - task, - resources: this.resources, - normalizeWhitespace, - combineTextItems, - sink, - }); - }); - }, - - getAnnotationsData(intent) { - return this._parsedAnnotations.then(function(annotations) { - let annotationsData = []; - for (let i = 0, ii = annotations.length; i < ii; i++) { - if (!intent || isAnnotationRenderable(annotations[i], intent)) { - annotationsData.push(annotations[i].data); - } - } - return annotationsData; - }); - }, - - get annotations() { - return shadow(this, 'annotations', - this._getInheritableProperty('Annots') || []); - }, - - get _parsedAnnotations() { - const parsedAnnotations = - this.pdfManager.ensure(this, 'annotations').then(() => { - const annotationRefs = this.annotations; - const annotationPromises = []; - for (let i = 0, ii = annotationRefs.length; i < ii; i++) { - annotationPromises.push(AnnotationFactory.create( - this.xref, annotationRefs[i], this.pdfManager, this.idFactory)); - } +const FINGERPRINT_FIRST_BYTES = 1024; +const EMPTY_FINGERPRINT = '\x00\x00\x00\x00\x00\x00\x00' + + '\x00\x00\x00\x00\x00\x00\x00\x00\x00'; - return Promise.all(annotationPromises).then(function(annotations) { - return annotations.filter(function isDefined(annotation) { - return !!annotation; - }); - }, function(reason) { - warn(`_parsedAnnotations: "${reason}".`); - return []; - }); - }); +function find(stream, needle, limit, backwards) { + const pos = stream.pos; + const end = stream.end; + if (pos + limit > end) { + limit = end - pos; + } - return shadow(this, '_parsedAnnotations', parsedAnnotations); - }, - }; + const strBuf = []; + for (let i = 0; i < limit; ++i) { + strBuf.push(String.fromCharCode(stream.getByte())); + } + const str = strBuf.join(''); - return Page; -})(); + stream.pos = pos; + const index = backwards ? str.lastIndexOf(needle) : str.indexOf(needle); + if (index === -1) { + return false; + } + stream.pos += index; + return true; +} /** - * The `PDFDocument` holds all the data of the PDF file. Compared to the - * `PDFDoc`, this one doesn't have any job management code. - * Right now there exists one PDFDocument on the main thread + one object - * for each worker. If there is no worker support enabled, there are two - * `PDFDocument` objects on the main thread created. + * The `PDFDocument` class holds all the data of the PDF file. There exists + * one `PDFDocument` object on the main thread and one object for each worker. + * If no worker support is enabled, two `PDFDocument` objects are created on + * the main thread. */ -var PDFDocument = (function PDFDocumentClosure() { - var FINGERPRINT_FIRST_BYTES = 1024; - var EMPTY_FINGERPRINT = '\x00\x00\x00\x00\x00\x00\x00' + - '\x00\x00\x00\x00\x00\x00\x00\x00\x00'; - - function PDFDocument(pdfManager, arg) { - var stream; +class PDFDocument { + constructor(pdfManager, arg) { + let stream; if (isStream(arg)) { stream = arg; } else if (isArrayBuffer(arg)) { @@ -357,319 +373,303 @@ var PDFDocument = (function PDFDocumentClosure() { throw new Error('PDFDocument: Unknown argument type'); } if (stream.length <= 0) { - throw new Error('PDFDocument: stream must have data'); + throw new Error('PDFDocument: Stream must have data'); } this.pdfManager = pdfManager; this.stream = stream; this.xref = new XRef(stream, pdfManager); - let evaluatorOptions = pdfManager.evaluatorOptions; this.pdfFunctionFactory = new PDFFunctionFactory({ xref: this.xref, - isEvalSupported: evaluatorOptions.isEvalSupported, + isEvalSupported: pdfManager.evaluatorOptions.isEvalSupported, }); this._pagePromises = []; } - function find(stream, needle, limit, backwards) { - var pos = stream.pos; - var end = stream.end; - var strBuf = []; - if (pos + limit > end) { - limit = end - pos; - } - for (var n = 0; n < limit; ++n) { - strBuf.push(String.fromCharCode(stream.getByte())); - } - var str = strBuf.join(''); - stream.pos = pos; - var index = backwards ? str.lastIndexOf(needle) : str.indexOf(needle); - if (index === -1) { - return false; /* not found */ + parse(recoveryMode) { + this.setup(recoveryMode); + + const version = this.catalog.catDict.get('Version'); + if (isName(version)) { + this.pdfFormatVersion = version.name; } - stream.pos += index; - return true; /* found */ - } - - const DocumentInfoValidators = { - Title: isString, - Author: isString, - Subject: isString, - Keywords: isString, - Creator: isString, - Producer: isString, - CreationDate: isString, - ModDate: isString, - Trapped: isName, - }; - - PDFDocument.prototype = { - parse: function PDFDocument_parse(recoveryMode) { - this.setup(recoveryMode); - var version = this.catalog.catDict.get('Version'); - if (isName(version)) { - this.pdfFormatVersion = version.name; - } - try { - // checking if AcroForm is present - this.acroForm = this.catalog.catDict.get('AcroForm'); - if (this.acroForm) { - this.xfa = this.acroForm.get('XFA'); - var fields = this.acroForm.get('Fields'); - if ((!fields || !Array.isArray(fields) || fields.length === 0) && - !this.xfa) { - // no fields and no XFA -- not a form (?) - this.acroForm = null; - } - } - } catch (ex) { - if (ex instanceof MissingDataException) { - throw ex; + + // Check if AcroForms are present in the document. + try { + this.acroForm = this.catalog.catDict.get('AcroForm'); + if (this.acroForm) { + this.xfa = this.acroForm.get('XFA'); + const fields = this.acroForm.get('Fields'); + if ((!fields || !Array.isArray(fields) || fields.length === 0) && + !this.xfa) { + this.acroForm = null; // No fields and no XFA, so it's not a form. } - info('Something wrong with AcroForm entry'); - this.acroForm = null; } - }, - - get linearization() { - let linearization = null; - try { - linearization = Linearization.create(this.stream); - } catch (err) { - if (err instanceof MissingDataException) { - throw err; - } - info(err); + } catch (ex) { + if (ex instanceof MissingDataException) { + throw ex; } - // shadow the prototype getter with a data property - return shadow(this, 'linearization', linearization); - }, - get startXRef() { - var stream = this.stream; - var startXRef = 0; - var linearization = this.linearization; - if (linearization) { - // Find end of first obj. - stream.reset(); - if (find(stream, 'endobj', 1024)) { - startXRef = stream.pos + 6; - } - } else { - // Find startxref by jumping backward from the end of the file. - var step = 1024; - var found = false, pos = stream.end; - while (!found && pos > 0) { - pos -= step - 'startxref'.length; - if (pos < 0) { - pos = 0; - } - stream.pos = pos; - found = find(stream, 'startxref', step, true); - } - if (found) { - stream.skip(9); - var ch; - do { - ch = stream.getByte(); - } while (isSpace(ch)); - var str = ''; - while (ch >= 0x20 && ch <= 0x39) { // < '9' - str += String.fromCharCode(ch); - ch = stream.getByte(); - } - startXRef = parseInt(str, 10); - if (isNaN(startXRef)) { - startXRef = 0; - } - } + info('Cannot fetch AcroForm entry; assuming no AcroForms are present'); + this.acroForm = null; + } + } + + get linearization() { + let linearization = null; + try { + linearization = Linearization.create(this.stream); + } catch (err) { + if (err instanceof MissingDataException) { + throw err; } - // shadow the prototype getter with a data property - return shadow(this, 'startXRef', startXRef); - }, - - // Find the header, remove leading garbage and setup the stream - // starting from the header. - checkHeader: function PDFDocument_checkHeader() { - var stream = this.stream; + info(err); + } + return shadow(this, 'linearization', linearization); + } + + get startXRef() { + const stream = this.stream; + let startXRef = 0; + + if (this.linearization) { + // Find the end of the first object. stream.reset(); - if (find(stream, '%PDF-', 1024)) { - // Found the header, trim off any garbage before it. - stream.moveStart(); - // Reading file format version - var MAX_VERSION_LENGTH = 12; - var version = '', ch; - while ((ch = stream.getByte()) > 0x20) { // SPACE - if (version.length >= MAX_VERSION_LENGTH) { - break; - } - version += String.fromCharCode(ch); - } - if (!this.pdfFormatVersion) { - // removing "%PDF-"-prefix - this.pdfFormatVersion = version.substring(5); - } - return; + if (find(stream, 'endobj', 1024)) { + startXRef = stream.pos + 6; } - // May not be a PDF file, continue anyway. - }, - parseStartXRef: function PDFDocument_parseStartXRef() { - var startXRef = this.startXRef; - this.xref.setStartXRef(startXRef); - }, - setup: function PDFDocument_setup(recoveryMode) { - this.xref.parse(recoveryMode); - this.catalog = new Catalog(this.pdfManager, this.xref); - }, - get numPages() { - var linearization = this.linearization; - var num = linearization ? linearization.numPages : this.catalog.numPages; - // shadow the prototype getter - return shadow(this, 'numPages', num); - }, - get documentInfo() { - const docInfo = { - PDFFormatVersion: this.pdfFormatVersion, - IsLinearized: !!this.linearization, - IsAcroFormPresent: !!this.acroForm, - IsXFAPresent: !!this.xfa, - }; - let infoDict; - try { - infoDict = this.xref.trailer.get('Info'); - } catch (err) { - if (err instanceof MissingDataException) { - throw err; + } else { + // Find `startxref` by checking backwards from the end of the file. + const step = 1024; + const startXRefLength = 'startxref'.length; + let found = false, pos = stream.end; + + while (!found && pos > 0) { + pos -= step - startXRefLength; + if (pos < 0) { + pos = 0; } - info('The document information dictionary is invalid.'); + stream.pos = pos; + found = find(stream, 'startxref', step, true); } - if (isDict(infoDict)) { - // Fill the document info with valid entries from the specification, - // as well as any existing well-formed custom entries. - for (let key of infoDict.getKeys()) { - const value = infoDict.get(key); - - if (DocumentInfoValidators[key]) { - // Make sure the (standard) value conforms to the specification. - if (DocumentInfoValidators[key](value)) { - docInfo[key] = (typeof value !== 'string' ? - value : stringToPDFString(value)); - } else { - info(`Bad value in document info for "${key}".`); - } - } else if (typeof key === 'string') { - // For custom values, only accept white-listed types to prevent - // errors that would occur when trying to send non-serializable - // objects to the main-thread (for example `Dict` or `Stream`). - let customValue; - if (isString(value)) { - customValue = stringToPDFString(value); - } else if (isName(value) || isNum(value) || isBool(value)) { - customValue = value; - } else { - info(`Unsupported value in document info for (custom) "${key}".`); - continue; - } - - if (!docInfo['Custom']) { - docInfo['Custom'] = Object.create(null); - } - docInfo['Custom'][key] = customValue; - } + + if (found) { + stream.skip(9); + let ch; + do { + ch = stream.getByte(); + } while (isSpace(ch)); + let str = ''; + while (ch >= 0x20 && ch <= 0x39) { // < '9' + str += String.fromCharCode(ch); + ch = stream.getByte(); } - } - return shadow(this, 'documentInfo', docInfo); - }, - get fingerprint() { - var xref = this.xref, hash, fileID = ''; - var idArray = xref.trailer.get('ID'); - - if (Array.isArray(idArray) && idArray[0] && isString(idArray[0]) && - idArray[0] !== EMPTY_FINGERPRINT) { - hash = stringToBytes(idArray[0]); - } else { - if (this.stream.ensureRange) { - this.stream.ensureRange(0, - Math.min(FINGERPRINT_FIRST_BYTES, this.stream.end)); + startXRef = parseInt(str, 10); + if (isNaN(startXRef)) { + startXRef = 0; } - hash = calculateMD5(this.stream.bytes.subarray(0, - FINGERPRINT_FIRST_BYTES), 0, FINGERPRINT_FIRST_BYTES); } + } + return shadow(this, 'startXRef', startXRef); + } + + // Find the header, get the PDF format version and setup the + // stream to start from the header. + checkHeader() { + const stream = this.stream; + stream.reset(); - for (var i = 0, n = hash.length; i < n; i++) { - var hex = hash[i].toString(16); - fileID += hex.length === 1 ? '0' + hex : hex; + if (!find(stream, '%PDF-', 1024)) { + // May not be a PDF file, but don't throw an error and let + // parsing continue. + return; + } + stream.moveStart(); + + // Read the PDF format version. + const MAX_PDF_VERSION_LENGTH = 12; + let version = '', ch; + while ((ch = stream.getByte()) > 0x20) { // Space + if (version.length >= MAX_PDF_VERSION_LENGTH) { + break; } + version += String.fromCharCode(ch); + } + if (!this.pdfFormatVersion) { + // Remove the "%PDF-" prefix. + this.pdfFormatVersion = version.substring(5); + } + } - return shadow(this, 'fingerprint', fileID); - }, + parseStartXRef() { + this.xref.setStartXRef(this.startXRef); + } - _getLinearizationPage(pageIndex) { - const { catalog, linearization, } = this; - assert(linearization && linearization.pageFirst === pageIndex); + setup(recoveryMode) { + this.xref.parse(recoveryMode); + this.catalog = new Catalog(this.pdfManager, this.xref); + } - const ref = new Ref(linearization.objectNumberFirst, 0); - return this.xref.fetchAsync(ref).then((obj) => { - // Ensure that the object that was found is actually a Page dictionary. - if (isDict(obj, 'Page') || - (isDict(obj) && !obj.has('Type') && obj.has('Contents'))) { - if (ref && !catalog.pageKidsCountCache.has(ref)) { - catalog.pageKidsCountCache.put(ref, 1); // Cache the Page reference. + get numPages() { + const linearization = this.linearization; + const num = linearization ? linearization.numPages : this.catalog.numPages; + return shadow(this, 'numPages', num); + } + + get documentInfo() { + const DocumentInfoValidators = { + Title: isString, + Author: isString, + Subject: isString, + Keywords: isString, + Creator: isString, + Producer: isString, + CreationDate: isString, + ModDate: isString, + Trapped: isName, + }; + + const docInfo = { + PDFFormatVersion: this.pdfFormatVersion, + IsLinearized: !!this.linearization, + IsAcroFormPresent: !!this.acroForm, + IsXFAPresent: !!this.xfa, + }; + + let infoDict; + try { + infoDict = this.xref.trailer.get('Info'); + } catch (err) { + if (err instanceof MissingDataException) { + throw err; + } + info('The document information dictionary is invalid.'); + } + + if (isDict(infoDict)) { + // Fill the document info with valid entries from the specification, + // as well as any existing well-formed custom entries. + for (const key of infoDict.getKeys()) { + const value = infoDict.get(key); + + if (DocumentInfoValidators[key]) { + // Make sure the (standard) value conforms to the specification. + if (DocumentInfoValidators[key](value)) { + docInfo[key] = (typeof value !== 'string' ? + value : stringToPDFString(value)); + } else { + info(`Bad value in document info for "${key}".`); + } + } else if (typeof key === 'string') { + // For custom values, only accept white-listed types to prevent + // errors that would occur when trying to send non-serializable + // objects to the main-thread (for example `Dict` or `Stream`). + let customValue; + if (isString(value)) { + customValue = stringToPDFString(value); + } else if (isName(value) || isNum(value) || isBool(value)) { + customValue = value; + } else { + info(`Unsupported value in document info for (custom) "${key}".`); + continue; + } + + if (!docInfo['Custom']) { + docInfo['Custom'] = Object.create(null); } - return [obj, ref]; + docInfo['Custom'][key] = customValue; } - throw new FormatError('The Linearization dictionary doesn\'t point ' + - 'to a valid Page dictionary.'); - }).catch((reason) => { - info(reason); - return catalog.getPageDict(pageIndex); - }); - }, + } + } + return shadow(this, 'documentInfo', docInfo); + } - getPage(pageIndex) { - if (this._pagePromises[pageIndex] !== undefined) { - return this._pagePromises[pageIndex]; + get fingerprint() { + let hash; + const idArray = this.xref.trailer.get('ID'); + if (Array.isArray(idArray) && idArray[0] && isString(idArray[0]) && + idArray[0] !== EMPTY_FINGERPRINT) { + hash = stringToBytes(idArray[0]); + } else { + if (this.stream.ensureRange) { + this.stream.ensureRange(0, + Math.min(FINGERPRINT_FIRST_BYTES, this.stream.end)); } - const { catalog, linearization, } = this; - - const promise = (linearization && linearization.pageFirst === pageIndex) ? - this._getLinearizationPage(pageIndex) : catalog.getPageDict(pageIndex); - - return this._pagePromises[pageIndex] = promise.then(([pageDict, ref]) => { - return new Page({ - pdfManager: this.pdfManager, - xref: this.xref, - pageIndex, - pageDict, - ref, - fontCache: catalog.fontCache, - builtInCMapCache: catalog.builtInCMapCache, - pdfFunctionFactory: this.pdfFunctionFactory, - }); - }); - }, - - checkFirstPage() { - return this.getPage(0).catch((reason) => { - if (reason instanceof XRefEntryException) { - // Clear out the various caches to ensure that we haven't stored any - // inconsistent and/or incorrect state, since that could easily break - // subsequent `this.getPage` calls. - this._pagePromises.length = 0; - this.cleanup(); - - throw new XRefParseException(); + hash = calculateMD5(this.stream.bytes.subarray(0, + FINGERPRINT_FIRST_BYTES), 0, FINGERPRINT_FIRST_BYTES); + } + + let fingerprint = ''; + for (const hashPart of hash) { + const hex = hashPart.toString(16); + fingerprint += (hex.length === 1 ? '0' + hex : hex); + } + return shadow(this, 'fingerprint', fingerprint); + } + + _getLinearizationPage(pageIndex) { + const { catalog, linearization, } = this; + assert(linearization && linearization.pageFirst === pageIndex); + + const ref = new Ref(linearization.objectNumberFirst, 0); + return this.xref.fetchAsync(ref).then((obj) => { + // Ensure that the object that was found is actually a Page dictionary. + if (isDict(obj, 'Page') || + (isDict(obj) && !obj.has('Type') && obj.has('Contents'))) { + if (ref && !catalog.pageKidsCountCache.has(ref)) { + catalog.pageKidsCountCache.put(ref, 1); // Cache the Page reference. } + return [obj, ref]; + } + throw new FormatError('The Linearization dictionary doesn\'t point ' + + 'to a valid Page dictionary.'); + }).catch((reason) => { + info(reason); + return catalog.getPageDict(pageIndex); + }); + } + + getPage(pageIndex) { + if (this._pagePromises[pageIndex] !== undefined) { + return this._pagePromises[pageIndex]; + } + const { catalog, linearization, } = this; + + const promise = (linearization && linearization.pageFirst === pageIndex) ? + this._getLinearizationPage(pageIndex) : catalog.getPageDict(pageIndex); + + return this._pagePromises[pageIndex] = promise.then(([pageDict, ref]) => { + return new Page({ + pdfManager: this.pdfManager, + xref: this.xref, + pageIndex, + pageDict, + ref, + fontCache: catalog.fontCache, + builtInCMapCache: catalog.builtInCMapCache, + pdfFunctionFactory: this.pdfFunctionFactory, }); - }, + }); + } - cleanup: function PDFDocument_cleanup() { - return this.catalog.cleanup(); - }, - }; + checkFirstPage() { + return this.getPage(0).catch((reason) => { + if (reason instanceof XRefEntryException) { + // Clear out the various caches to ensure that we haven't stored any + // inconsistent and/or incorrect state, since that could easily break + // subsequent `this.getPage` calls. + this._pagePromises.length = 0; + this.cleanup(); - return PDFDocument; -})(); + throw new XRefParseException(); + } + }); + } + + cleanup() { + return this.catalog.cleanup(); + } +} export { Page,