Skip to content

Commit

Permalink
Handle "Square" annotations, but don't attempt to capture text
Browse files Browse the repository at this point in the history
Per issue #46, Apple tools support "highlighting" PDFs where the highlights
are emitted as a Square annotation with a custom appearance that renders the
markup. Figuring out the affected text would be a major undertaking, but
with this change we (1) recognise the existence of the annotation rather than
emitting an "unsupported annotation" warning, and (2) capture the contents
(text note) of the annotation if any.
  • Loading branch information
0xabu committed Nov 17, 2021
1 parent 730c3b3 commit 9c7d742
Show file tree
Hide file tree
Showing 4 changed files with 37 additions and 9 deletions.
8 changes: 6 additions & 2 deletions pdfannots/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,9 +62,13 @@ def _mkannotation(
# decode as string, normalise line endings, replace special characters
contents = cleanup_text(pdfminer.utils.decode_text(contents))

coords = pdftypes.resolve1(pa.get('QuadPoints'))
# Rect defines the location of the annotation on the page
rect = pdftypes.resolve1(pa.get('Rect'))

# QuadPoints are defined only for "markup" annotations (Highlight, Underline, StrikeOut,
# Squiggly), where they specify the quadrilaterals (boxes) covered by the annotation.
quadpoints = pdftypes.resolve1(pa.get('QuadPoints'))

author = pdftypes.resolve1(pa.get('T'))
if author is not None:
author = pdfminer.utils.decode_text(author)
Expand All @@ -80,7 +84,7 @@ def _mkannotation(
createds = pdfminer.utils.decode_text(createds)
created = decode_datetime(createds)

return Annotation(page, annot_type, coords, rect,
return Annotation(page, annot_type, quadpoints, rect,
contents, author=author, created=created)


Expand Down
22 changes: 15 additions & 7 deletions pdfannots/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -234,12 +234,20 @@ def update_pageseq(self, line: LTTextLine, pageseq: int) -> None:

class AnnotationType(enum.Enum):
"""A supported PDF annotation type. Enumerant names match the Subtype names of the PDF spec."""

# NOTE: member values are assigned by enum.auto(), so each value depends on
# the member's position in this list; the names are what match the PDF spec.

# A "sticky note" comment annotation.
Text = enum.auto()

# Markup annotations that apply to one or more regions on the page.
Highlight = enum.auto()
Squiggly = enum.auto()
StrikeOut = enum.auto()
Underline = enum.auto()

# A single rectangle that some Apple tools abuse to render custom
# highlights. We do not attempt to capture the affected text.
Square = enum.auto()


class Annotation(ObjectWithPos):
"""
Expand Down Expand Up @@ -268,19 +276,19 @@ def __init__(
self,
page: Page,
subtype: AnnotationType,
coords: typing.Optional[typing.Sequence[float]] = None,
quadpoints: typing.Optional[typing.Sequence[float]] = None,
rect: typing.Optional[BoxCoords] = None,
contents: typing.Optional[str] = None,
author: typing.Optional[str] = None,
created: typing.Optional[datetime.datetime] = None):

# Construct boxes from coords
# Construct boxes from quadpoints
boxes = []
if coords:
assert len(coords) % 8 == 0
while coords != []:
(x0, y0, x1, y1, x2, y2, x3, y3) = coords[:8]
coords = coords[8:]
if quadpoints is not None:
assert len(quadpoints) % 8 == 0
while quadpoints != []:
(x0, y0, x1, y1, x2, y2, x3, y3) = quadpoints[:8]
quadpoints = quadpoints[8:]
xvals = [x0, x1, x2, x3]
yvals = [y0, y1, y2, y3]
box = Box(min(xvals), min(yvals), max(xvals), max(yvals))
Expand Down
16 changes: 16 additions & 0 deletions tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,22 @@ def test(self) -> None:
self.assertEqual(a.gettext(), 'This is a sample statement.')


class Issue46(ExtractionTestBase):
# Regression test for issue #46: a Square annotation that sits between two
# Highlight annotations must be recognised, with no text captured for it.
# NOTE(review): self.annots is presumably populated by ExtractionTestBase
# from `filename`; that base class is not visible here -- confirm.
filename = 'issue46.pdf'

def test(self) -> None:
# The sample file is expected to yield exactly three annotations.
self.assertEqual(len(self.annots), 3)

# First annotation: a highlight whose text is captured.
self.assertEqual(self.annots[0].subtype, AnnotationType.Highlight)
self.assertEqual(self.annots[0].gettext(), 'C – Curate')

# Second annotation: the Square; gettext() is expected to return None
# because no text is captured for this subtype.
self.assertEqual(self.annots[1].subtype, AnnotationType.Square)
self.assertEqual(self.annots[1].gettext(), None)

# Third annotation: another highlight whose text is captured.
self.assertEqual(self.annots[2].subtype, AnnotationType.Highlight)
self.assertEqual(self.annots[2].gettext(), 'This was a novel idea at the time')


class Pr24(ExtractionTestBase):
filename = 'pr24.pdf'

Expand Down
Binary file added tests/issue46.pdf
Binary file not shown.

0 comments on commit 9c7d742

Please sign in to comment.