Skip to content

Commit

Permalink
Replace document.title as a fallback for title in PDFs
Browse files Browse the repository at this point in the history
Replace the usage of `document.title` as a way to get the document title
if the PDF has no embedded title in either its _document info
dictionary_ or _metadata stream_.

In top-level frames, using `document.title` (where `document` is the
global HTML document, not the PDF) works because PDF.js sets the title
based on the first non-empty value from:

 1. The embedded title
 2. The filename from the `Content-Disposition` header
 3. The last segment of the URL's path (eg. "test.pdf" in
    "https://example.com/test.pdf")

When PDF.js is embedded in an iframe, however, it does not set
`document.title` by default. As a result, documents were ending up in
Hypothesis with a generic "PDF.js viewer" title.

This commit implements (roughly) the same logic that PDF.js uses to
determine the value used to set `document.title`, in the case where the
PDF has no embedded title. This means implementing steps (2) and (3)
from the above list. The `Content-Disposition` filename is not exposed
as a public property on `PDFViewerApplication`, so
`PDFMetadata#getMetadata` was refactored to call
`pdfDocument.getMetadata` instead.

Fixes #3372
  • Loading branch information
robertknight committed May 11, 2021
1 parent 5ffeba9 commit 0ca222b
Show file tree
Hide file tree
Showing 3 changed files with 159 additions and 40 deletions.
87 changes: 60 additions & 27 deletions src/annotator/integrations/pdf-metadata.js
Original file line number Diff line number Diff line change
Expand Up @@ -129,43 +129,64 @@ export class PDFMetadata {
* If the PDF is currently loading, the returned promise resolves once loading
* is complete.
*
* This method prefers metadata embedded in the PDF if available, with fallbacks
* otherwise. See the "Metadata" chapter in the PDF specification [1] for details of
* embedded metadata.
*
* [1] https://www.adobe.com/content/dam/acom/en/devnet/pdf/pdfs/PDF32000_2008.pdf
*
* @return {Promise<Metadata>}
*/
getMetadata() {
return this._loaded.then(app => {
let title = document.title;

if (
app.metadata &&
app.metadata.has('dc:title') &&
app.metadata.get('dc:title') !== 'Untitled'
) {
title = /** @type {string} */ (app.metadata.get('dc:title'));
} else if (app.documentInfo && app.documentInfo.Title) {
title = app.documentInfo.Title;
}

const link = [{ href: fingerprintToURN(app.pdfDocument.fingerprint) }];

const url = getPDFURL(app);
if (url) {
link.push({ href: url });
}

return {
title: title,
link: link,
documentFingerprint: app.pdfDocument.fingerprint,
};
});
async getMetadata() {
const app = await this._loaded;
const {
info: documentInfo,
contentDispositionFilename,
metadata,
} = await app.pdfDocument.getMetadata();
const documentFingerprint = app.pdfDocument.fingerprint;

const url = getPDFURL(app);

let title;
if (metadata?.has('dc:title') && metadata.get('dc:title') !== 'Untitled') {
title = /** @type {string} */ (metadata.get('dc:title'));
} else if (documentInfo?.Title) {
title = documentInfo.Title;
} else if (contentDispositionFilename) {
title = contentDispositionFilename;
} else if (url) {
title = filenameFromURL(url);
} else {
title = '';
}

const link = [{ href: fingerprintToURN(documentFingerprint) }];
if (url) {
link.push({ href: url });
}

return {
title,
link,
documentFingerprint,
};
}
}

// Build the `urn:x-pdf:` URN for a PDF fingerprint. The fingerprint is
// converted with `String()` before being appended to the prefix.
function fingerprintToURN(fingerprint) {
  const id = String(fingerprint);
  return `urn:x-pdf:${id}`;
}

/**
* @param {PDFViewerApplication} app
* @return {string|null} - Valid URL string or `null`
*/
function getPDFURL(app) {
if (!app.url) {
return null;
}

const url = normalizeURI(app.url);

// Local file:// URLs should not be saved in document metadata.
Expand All @@ -177,3 +198,15 @@ function getPDFURL(app) {

return null;
}

/**
 * Return the last component of the path part of a URL.
 *
 * Percent-encoded characters in the component are decoded (eg.
 * "my%20file.pdf" becomes "my file.pdf") so that the result reads naturally
 * when used as a document title. If the component contains malformed
 * percent-escapes it is returned undecoded.
 *
 * @param {string} url - A valid URL string
 * @return {string} - The final path segment, or an empty string if the path
 *   is empty or ends with "/"
 */
function filenameFromURL(url) {
  const { pathname } = new URL(url);
  const filename = pathname.slice(pathname.lastIndexOf('/') + 1);
  try {
    return decodeURIComponent(filename);
  } catch {
    // `decodeURIComponent` throws a `URIError` for malformed escapes such as
    // a lone "%". Fall back to the raw segment rather than failing.
    return filename;
  }
}
76 changes: 67 additions & 9 deletions src/annotator/integrations/test/pdf-metadata-test.js
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,25 @@ class FakeMetadata {
* Fake implementation of PDF.js `window.PDFViewerApplication.pdfDocument`.
*/
class FakePDFDocumentProxy {
constructor({ fingerprint }) {
constructor({
contentDispositionFilename = null,
fingerprint,
info,
metadata = null,
}) {
this.fingerprint = fingerprint;

this._contentDispositionFilename = contentDispositionFilename;
this._info = info;
this._metadata = metadata;
}

async getMetadata() {
return {
contentDispositionFilename: this._contentDispositionFilename,
info: this._info,
metadata: this._metadata,
};
}
}

Expand Down Expand Up @@ -84,6 +101,7 @@ class FakePDFViewerApplication {
* Simulate completion of PDF document loading.
*/
finishLoading({
contentDispositionFilename,
url,
fingerprint,
metadata,
Expand All @@ -92,17 +110,18 @@ class FakePDFViewerApplication {
}) {
this.url = url;
this.downloadComplete = true;
this.documentInfo = {};

if (typeof title !== undefined) {
this.documentInfo.Title = title;
}

if (metadata) {
this.metadata = new FakeMetadata(metadata);
const info = {};
if (title) {
info.Title = title;
}

this.pdfDocument = new FakePDFDocumentProxy({ fingerprint });
this.pdfDocument = new FakePDFDocumentProxy({
contentDispositionFilename,
info,
metadata: metadata ? new FakeMetadata(metadata) : null,
fingerprint,
});

if (this.dispatchDOMEvents) {
const event = document.createEvent('Event');
Expand Down Expand Up @@ -320,5 +339,44 @@ describe('PDFMetadata', function () {

assert.equal(metadata.title, 'Some title');
});

it('gets the title from the `Content-Disposition` header', async () => {
  // The URL also contains a filename, which is expected to be ignored in
  // favor of the one from the header.
  const { pdfMetadata } = createPDFMetadata({
    contentDispositionFilename: 'some-file.pdf',
    url: 'http://fake.com/test.pdf',
  });

  const { title } = await pdfMetadata.getMetadata();

  assert.equal(title, 'some-file.pdf');
});

it('gets the title from the URL', async () => {
  // With no other title source supplied, the last path segment is used.
  const { pdfMetadata } = createPDFMetadata({
    url: 'http://fake.com/a-file.pdf',
  });

  const { title } = await pdfMetadata.getMetadata();

  assert.equal(title, 'a-file.pdf');
});

// URLs from which no filename can be derived. One test is generated per
// entry, and the URL is included in the test title so that a failure
// identifies which case broke.
[
  null, // Missing URL
  '', // Invalid URL
  'https://example.com', // Missing path
  'https://example.com/', // Empty string after last `/` in path
].forEach(url => {
  it(`returns an empty string if there is no title metadata or filename in URL (url: ${JSON.stringify(
    url
  )})`, async () => {
    const { pdfMetadata } = createPDFMetadata({ url });

    // Earlier versions of the client used `document.title` as a fallback,
    // but we changed this. See https://github.com/hypothesis/client/issues/3372.
    //
    // `document.title` is global state shared with other tests, so restore
    // it once this test has finished.
    const originalTitle = document.title;
    document.title = 'Ignore me';
    try {
      const metadata = await pdfMetadata.getMetadata();

      assert.equal(metadata.title, '');
    } finally {
      document.title = originalTitle;
    }
  });
});
});
});
36 changes: 32 additions & 4 deletions src/types/pdfjs.js
Original file line number Diff line number Diff line change
Expand Up @@ -13,19 +13,46 @@
*/

/**
* Document metadata parsed from the PDF's _metadata stream_.
*
* See `Metadata` class from `display/metadata.js` in PDF.js.
*
* @typedef Metadata
* @prop {(name: string) => string} get
* @prop {(name: string) => boolean} has
*/

/**
* @typedef PDFDocument
* @prop {string} fingerprint
* Document metadata parsed from the PDF's _document info dictionary_.
*
* See `PDFDocument#documentInfo` in PDF.js.
*
* @typedef PDFDocumentInfo
* @prop {string} [Title]
*/

/**
* @typedef PDFDocumentInfo
* @prop {string} [Title]
* An object containing metadata about the PDF. This includes information from:
*
* - The PDF's document info dictionary
* - The PDF's metadata stream
* - The HTTP headers (eg. `Content-Disposition`) sent when the PDF file was
* served
*
* See the "Metadata" section (14.3) in the PDF 1.7 reference for details of
* the _metadata stream_ and _document info dictionary_.
*
* @typedef PDFDocumentMetadata
* @prop {Metadata|null} metadata
* @prop {PDFDocumentInfo} [info]
* @prop {string|null} contentDispositionFilename - The `filename` directive from
* the `Content-Disposition` header
*/

/**
* @typedef PDFDocument
* @prop {string} fingerprint
* @prop {() => Promise<PDFDocumentMetadata>} getMetadata
*/

/**
Expand Down Expand Up @@ -93,6 +120,7 @@
* @prop {Promise<void>} [initializedPromise] -
* Promise that resolves when PDF.js is initialized. Since v2.4.456.
* See https://github.com/mozilla/pdf.js/wiki/Third-party-viewer-usage#initialization-promise.
* @prop {string} url - The URL of the loaded PDF file
*/

/**
Expand Down

0 comments on commit 0ca222b

Please sign in to comment.