diff --git a/pdfannots/__init__.py b/pdfannots/__init__.py
index ff512f7..c9bafc0 100644
--- a/pdfannots/__init__.py
+++ b/pdfannots/__init__.py
@@ -62,9 +62,13 @@ def _mkannotation(
         # decode as string, normalise line endings, replace special characters
         contents = cleanup_text(pdfminer.utils.decode_text(contents))
 
-    coords = pdftypes.resolve1(pa.get('QuadPoints'))
+    # Rect defines the location of the annotation on the page
     rect = pdftypes.resolve1(pa.get('Rect'))
 
+    # QuadPoints are defined only for "markup" annotations (Highlight, Underline, StrikeOut,
+    # Squiggly), where they specify the quadrilaterals (boxes) covered by the annotation.
+    quadpoints = pdftypes.resolve1(pa.get('QuadPoints'))
+
     author = pdftypes.resolve1(pa.get('T'))
     if author is not None:
         author = pdfminer.utils.decode_text(author)
@@ -80,7 +84,7 @@ def _mkannotation(
         createds = pdfminer.utils.decode_text(createds)
         created = decode_datetime(createds)
 
-    return Annotation(page, annot_type, coords, rect,
+    return Annotation(page, annot_type, quadpoints, rect,
                       contents, author=author, created=created)
 
 
diff --git a/pdfannots/types.py b/pdfannots/types.py
index d641050..8bcfaf5 100644
--- a/pdfannots/types.py
+++ b/pdfannots/types.py
@@ -234,12 +234,20 @@ def update_pageseq(self, line: LTTextLine, pageseq: int) -> None:
 
 class AnnotationType(enum.Enum):
     """A supported PDF annotation type. Enumerant names match the Subtype names of the PDF spec."""
+
+    # A "sticky note" comment annotation.
     Text = enum.auto()
+
+    # Markup annotations that apply to one or more regions on the page.
     Highlight = enum.auto()
     Squiggly = enum.auto()
     StrikeOut = enum.auto()
     Underline = enum.auto()
 
+    # A single rectangle, that is abused by some Apple tools to render custom
+    # highlights. We do not attempt to capture the affected text.
+    Square = enum.auto()
+
 
 class Annotation(ObjectWithPos):
     """
@@ -268,19 +276,19 @@ def __init__(
             self,
             page: Page,
             subtype: AnnotationType,
-            coords: typing.Optional[typing.Sequence[float]] = None,
+            quadpoints: typing.Optional[typing.Sequence[float]] = None,
             rect: typing.Optional[BoxCoords] = None,
             contents: typing.Optional[str] = None,
             author: typing.Optional[str] = None,
             created: typing.Optional[datetime.datetime] = None):
 
-        # Construct boxes from coords
+        # Construct boxes from quadpoints
         boxes = []
-        if coords:
-            assert len(coords) % 8 == 0
-            while coords != []:
-                (x0, y0, x1, y1, x2, y2, x3, y3) = coords[:8]
-                coords = coords[8:]
+        if quadpoints is not None:
+            assert len(quadpoints) % 8 == 0
+            while quadpoints != []:
+                (x0, y0, x1, y1, x2, y2, x3, y3) = quadpoints[:8]
+                quadpoints = quadpoints[8:]
                 xvals = [x0, x1, x2, x3]
                 yvals = [y0, y1, y2, y3]
                 box = Box(min(xvals), min(yvals), max(xvals), max(yvals))
diff --git a/tests.py b/tests.py
index 7e018ee..4886cbd 100755
--- a/tests.py
+++ b/tests.py
@@ -133,6 +133,22 @@ def test(self) -> None:
         self.assertEqual(a.gettext(), 'This is a sample statement.')
 
 
+class Issue46(ExtractionTestBase):
+    filename = 'issue46.pdf'
+
+    def test(self) -> None:
+        self.assertEqual(len(self.annots), 3)
+
+        self.assertEqual(self.annots[0].subtype, AnnotationType.Highlight)
+        self.assertEqual(self.annots[0].gettext(), 'C – Curate')
+
+        self.assertEqual(self.annots[1].subtype, AnnotationType.Square)
+        self.assertEqual(self.annots[1].gettext(), None)
+
+        self.assertEqual(self.annots[2].subtype, AnnotationType.Highlight)
+        self.assertEqual(self.annots[2].gettext(), 'This was a novel idea at the time')
+
+
 class Pr24(ExtractionTestBase):
     filename = 'pr24.pdf'
 
diff --git a/tests/issue46.pdf b/tests/issue46.pdf
new file mode 100644
index 0000000..7f53e31
Binary files /dev/null and b/tests/issue46.pdf differ