Skip to content

Commit

Permalink
Fail anchoring in PDFs if there is no quote selector
Browse files Browse the repository at this point in the history
Changes in text rendering across PDF.js versions can render position
selectors invalid. Therefore any anchoring done with position selectors must
be checked against the quote, as we do with HTML annotations.

This commit disallows anchoring using only position selectors in PDFs
and restructures `anchor` control flow using async/await to make it
easier to follow.

We have been capturing quote selectors with PDF annotations forever, so
there should be no impact on old annotations.
  • Loading branch information
robertknight committed Sep 1, 2021
1 parent 1651c1c commit 4770612
Show file tree
Hide file tree
Showing 2 changed files with 61 additions and 32 deletions.
76 changes: 45 additions & 31 deletions src/annotator/anchoring/pdf.js
Original file line number Diff line number Diff line change
Expand Up @@ -446,57 +446,71 @@ function prioritizePages(position) {
/**
* Anchor a set of selectors to a DOM Range.
*
* `selectors` must include a `TextQuoteSelector` and may include other selector
* types.
*
* @param {HTMLElement} root
* @param {Selector[]} selectors - Selector objects to anchor
* @param {Selector[]} selectors
* @return {Promise<Range>}
*/
export function anchor(root, selectors) {
const position = /** @type {TextPositionSelector|undefined} */ (
selectors.find(s => s.type === 'TextPositionSelector')
);
export async function anchor(root, selectors) {
const quote = /** @type {TextQuoteSelector|undefined} */ (
selectors.find(s => s.type === 'TextQuoteSelector')
);

/** @type {Promise<Range>} */
let result = Promise.reject('unable to anchor');
// The quote selector is required in order to check that text position
// selector results are still valid.
if (!quote) {
throw new Error('No quote selector found');
}

const position = /** @type {TextPositionSelector|undefined} */ (
selectors.find(s => s.type === 'TextPositionSelector')
);

if (position) {
result = result.catch(() => {
return findPage(position.start).then(({ index, offset, textContent }) => {
const start = position.start - offset;
const end = position.end - offset;
const length = end - start;

const matchedText = textContent.substr(start, length);
if (quote && quote.exact !== matchedText) {
throw new Error('quote mismatch');
}
// If we have a position selector, try using that first as it is the fastest
// anchoring method.
try {
const { index, offset, textContent } = await findPage(position.start);
const start = position.start - offset;
const end = position.end - offset;
const length = end - start;

const matchedText = textContent.substr(start, length);
if (quote.exact !== matchedText) {
throw new Error('quote mismatch');
}

return anchorByPosition(index, start, end);
});
});
}
const range = await anchorByPosition(index, start, end);
return range;
} catch {
// Fall back to quote selector
}

if (quote) {
result = result.catch(() => {
// If anchoring with the position failed, check for a cached quote-based
// match using the quote + position as a cache key.
try {
if (
position &&
quotePositionCache[quote.exact] &&
quotePositionCache[quote.exact][position.start]
) {
const { pageIndex, anchor } =
quotePositionCache[quote.exact][position.start];
return anchorByPosition(pageIndex, anchor.start, anchor.end);
const range = await anchorByPosition(
pageIndex,
anchor.start,
anchor.end
);
return range;
}

return prioritizePages(position?.start ?? 0).then(pageIndices => {
return findInPages(pageIndices, quote, position);
});
});
} catch {
// Fall back to uncached quote selector match
}
}

return result;
const pageIndices = await prioritizePages(position?.start ?? 0);
return findInPages(pageIndices, quote, position);
}

/**
Expand Down
17 changes: 16 additions & 1 deletion src/annotator/anchoring/test/pdf-test.js
Original file line number Diff line number Diff line change
Expand Up @@ -245,7 +245,7 @@ describe('annotator/anchoring/pdf', () => {

// Test that all of the selectors anchor and that each selector individually
// anchors correctly as well
const subsets = [[position, quote], [position], [quote]];
const subsets = [[position, quote], [quote]];
const subsetsAnchored = subsets.map(subset => {
const types = subset.map(s => {
return s.type;
Expand All @@ -270,6 +270,21 @@ describe('annotator/anchoring/pdf', () => {
});
});

[[], [{ type: 'TextPositionSelector', start: 0, end: 200 }]].forEach(
selectors => {
it('fails to anchor if there is no quote selector', async () => {
let error;
try {
await pdfAnchoring.anchor(container, selectors);
} catch (err) {
error = err;
}
assert.instanceOf(error, Error);
assert.equal(error.message, 'No quote selector found');
});
}
);

it('anchors text in older PDF.js versions', async () => {
initViewer(fixtures.pdfPages, { newTextRendering: false });

Expand Down

0 comments on commit 4770612

Please sign in to comment.