From 477061283ab1111c2ee0d65b40f090fa10db7867 Mon Sep 17 00:00:00 2001 From: Robert Knight Date: Tue, 24 Aug 2021 10:54:35 +0100 Subject: [PATCH] Fail anchoring in PDFs if there is no quote selector Changes in text rendering across PDF.js versions can render position selectors invalid. Therefore any anchoring done with position selectors must be checked against the quote, as we do with HTML annotations. This commit disallows anchoring using only position selectors in PDFs and restructures `anchor` control flow using async/await to make it easier to follow. We have been capturing quote selectors with PDF annotations forever, so there should be no impact on old annotations. --- src/annotator/anchoring/pdf.js | 76 ++++++++++++++---------- src/annotator/anchoring/test/pdf-test.js | 17 +++++- 2 files changed, 61 insertions(+), 32 deletions(-) diff --git a/src/annotator/anchoring/pdf.js b/src/annotator/anchoring/pdf.js index 629e327dbba..b867a427d5a 100644 --- a/src/annotator/anchoring/pdf.js +++ b/src/annotator/anchoring/pdf.js @@ -446,57 +446,71 @@ function prioritizePages(position) { /** * Anchor a set of selectors to a DOM Range. * + * `selectors` must include a `TextQuoteSelector` and may include other selector + * types. + * * @param {HTMLElement} root - * @param {Selector[]} selectors - Selector objects to anchor + * @param {Selector[]} selectors * @return {Promise} */ -export function anchor(root, selectors) { - const position = /** @type {TextPositionSelector|undefined} */ ( - selectors.find(s => s.type === 'TextPositionSelector') - ); +export async function anchor(root, selectors) { const quote = /** @type {TextQuoteSelector|undefined} */ ( selectors.find(s => s.type === 'TextQuoteSelector') ); - /** @type {Promise} */ - let result = Promise.reject('unable to anchor'); + // The quote selector is required in order to check that text position + // selector results are still valid. + if (!quote) { + throw new Error('No quote selector found'); + } + + const position = /** @type {TextPositionSelector|undefined} */ ( + selectors.find(s => s.type === 'TextPositionSelector') + ); if (position) { - result = result.catch(() => { - return findPage(position.start).then(({ index, offset, textContent }) => { - const start = position.start - offset; - const end = position.end - offset; - const length = end - start; - - const matchedText = textContent.substr(start, length); - if (quote && quote.exact !== matchedText) { - throw new Error('quote mismatch'); - } + // If we have a position selector, try using that first as it is the fastest + // anchoring method. + try { + const { index, offset, textContent } = await findPage(position.start); + const start = position.start - offset; + const end = position.end - offset; + const length = end - start; + + const matchedText = textContent.substr(start, length); + if (quote.exact !== matchedText) { + throw new Error('quote mismatch'); + } - return anchorByPosition(index, start, end); - }); - }); - } + const range = await anchorByPosition(index, start, end); + return range; + } catch { + // Fall back to quote selector + } - if (quote) { - result = result.catch(() => { + // If anchoring with the position failed, check for a cached quote-based + // match using the quote + position as a cache key. + try { if ( - position && quotePositionCache[quote.exact] && quotePositionCache[quote.exact][position.start] ) { const { pageIndex, anchor } = quotePositionCache[quote.exact][position.start]; - return anchorByPosition(pageIndex, anchor.start, anchor.end); + const range = await anchorByPosition( + pageIndex, + anchor.start, + anchor.end + ); + return range; } - - return prioritizePages(position?.start ?? 0).then(pageIndices => { - return findInPages(pageIndices, quote, position); - }); - }); + } catch { + // Fall back to uncached quote selector match + } } - return result; + const pageIndices = await prioritizePages(position?.start ?? 0); + return findInPages(pageIndices, quote, position); } /** diff --git a/src/annotator/anchoring/test/pdf-test.js b/src/annotator/anchoring/test/pdf-test.js index 2e52fae3b6d..dd9ff850b19 100644 --- a/src/annotator/anchoring/test/pdf-test.js +++ b/src/annotator/anchoring/test/pdf-test.js @@ -245,7 +245,7 @@ describe('annotator/anchoring/pdf', () => { // Test that all of the selectors anchor and that each selector individually // anchors correctly as well - const subsets = [[position, quote], [position], [quote]]; + const subsets = [[position, quote], [quote]]; const subsetsAnchored = subsets.map(subset => { const types = subset.map(s => { return s.type; @@ -270,6 +270,21 @@ describe('annotator/anchoring/pdf', () => { }); }); + [[], [{ type: 'TextPositionSelector', start: 0, end: 200 }]].forEach( + selectors => { + it('fails to anchor if there is no quote selector', async () => { + let error; + try { + await pdfAnchoring.anchor(container, selectors); + } catch (err) { + error = err; + } + assert.instanceOf(error, Error); + assert.equal(error.message, 'No quote selector found'); + }); + } + ); + it('anchors text in older PDF.js versions', async () => { initViewer(fixtures.pdfPages, { newTextRendering: false });