Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Handle "Square" annotations, but don't attempt to capture text #47

Merged
merged 1 commit into from
Nov 17, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions pdfannots/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,9 +62,13 @@ def _mkannotation(
# decode as string, normalise line endings, replace special characters
contents = cleanup_text(pdfminer.utils.decode_text(contents))

coords = pdftypes.resolve1(pa.get('QuadPoints'))
# Rect defines the location of the annotation on the page
rect = pdftypes.resolve1(pa.get('Rect'))

# QuadPoints are defined only for "markup" annotations (Highlight, Underline, StrikeOut,
# Squiggly), where they specify the quadrilaterals (boxes) covered by the annotation.
quadpoints = pdftypes.resolve1(pa.get('QuadPoints'))

author = pdftypes.resolve1(pa.get('T'))
if author is not None:
author = pdfminer.utils.decode_text(author)
Expand All @@ -80,7 +84,7 @@ def _mkannotation(
createds = pdfminer.utils.decode_text(createds)
created = decode_datetime(createds)

return Annotation(page, annot_type, coords, rect,
return Annotation(page, annot_type, quadpoints, rect,
contents, author=author, created=created)


Expand Down
22 changes: 15 additions & 7 deletions pdfannots/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -234,12 +234,20 @@ def update_pageseq(self, line: LTTextLine, pageseq: int) -> None:

class AnnotationType(enum.Enum):
    """Supported PDF annotation types.

    Each enumerant is named after the corresponding Subtype name in the PDF spec.
    """

    # Comment annotation shown as a "sticky note".
    Text = enum.auto()

    # Markup annotations; each one applies to one or more regions on the page.
    Highlight = enum.auto()
    Squiggly = enum.auto()
    StrikeOut = enum.auto()
    Underline = enum.auto()

    # A single rectangle. Some Apple tools misuse this type to render custom
    # highlights; no attempt is made to capture the text it covers.
    Square = enum.auto()


class Annotation(ObjectWithPos):
"""
Expand Down Expand Up @@ -268,19 +276,19 @@ def __init__(
self,
page: Page,
subtype: AnnotationType,
coords: typing.Optional[typing.Sequence[float]] = None,
quadpoints: typing.Optional[typing.Sequence[float]] = None,
rect: typing.Optional[BoxCoords] = None,
contents: typing.Optional[str] = None,
author: typing.Optional[str] = None,
created: typing.Optional[datetime.datetime] = None):

# Construct boxes from coords
# Construct boxes from quadpoints
boxes = []
if coords:
assert len(coords) % 8 == 0
while coords != []:
(x0, y0, x1, y1, x2, y2, x3, y3) = coords[:8]
coords = coords[8:]
if quadpoints is not None:
assert len(quadpoints) % 8 == 0
while quadpoints != []:
(x0, y0, x1, y1, x2, y2, x3, y3) = quadpoints[:8]
quadpoints = quadpoints[8:]
xvals = [x0, x1, x2, x3]
yvals = [y0, y1, y2, y3]
box = Box(min(xvals), min(yvals), max(xvals), max(yvals))
Expand Down
16 changes: 16 additions & 0 deletions tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,22 @@ def test(self) -> None:
self.assertEqual(a.gettext(), 'This is a sample statement.')


class Issue46(ExtractionTestBase):
    """Extraction test over 'issue46.pdf': two highlights around one Square annotation."""

    filename = 'issue46.pdf'

    def test(self) -> None:
        # Expected (subtype, captured text) per annotation, in document order.
        # The Square annotation is expected to yield no text.
        expected = [
            (AnnotationType.Highlight, 'C – Curate'),
            (AnnotationType.Square, None),
            (AnnotationType.Highlight, 'This was a novel idea at the time'),
        ]

        # Check the count first so the pairing below covers every annotation.
        self.assertEqual(len(self.annots), 3)

        for annot, (subtype, text) in zip(self.annots, expected):
            self.assertEqual(annot.subtype, subtype)
            self.assertEqual(annot.gettext(), text)


class Pr24(ExtractionTestBase):
filename = 'pr24.pdf'

Expand Down
Binary file added tests/issue46.pdf
Binary file not shown.