From 9c6660a9812e3c7365ae8eaed08bfbcebc9d68a2 Mon Sep 17 00:00:00 2001 From: Andrew Baumann Date: Sat, 27 Nov 2021 13:09:00 -0800 Subject: [PATCH] sort annotations by proximity to *either* LTTextLine *or* LTFigure components issue #48 demonstrates a PDF where all text is chars within a figure, and there are no lines/boxes --- pdfannots/__init__.py | 21 +++++++++++---------- pdfannots/types.py | 16 ++++++++-------- 2 files changed, 19 insertions(+), 18 deletions(-) diff --git a/pdfannots/__init__.py b/pdfannots/__init__.py index 9771168..00001eb 100644 --- a/pdfannots/__init__.py +++ b/pdfannots/__init__.py @@ -12,8 +12,8 @@ from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter from pdfminer.pdfpage import PDFPage -from pdfminer.layout import ( - LAParams, LTContainer, LTAnno, LTChar, LTPage, LTTextBox, LTTextLine, LTItem, LTComponent) +from pdfminer.layout import (LAParams, LTAnno, LTChar, LTComponent, LTContainer, LTFigure, LTItem, + LTPage, LTTextBox, LTTextLine) from pdfminer.converter import PDFLayoutAnalyzer from pdfminer.pdfparser import PDFParser from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines @@ -213,7 +213,7 @@ class _PDFProcessor(PDFLayoutAnalyzer): # type:ignore page: typing.Optional[Page] # Page being processed. charseq: int # Character sequence number within the page. - lineseq: int # Line sequence number within the page. + compseq: int # Component sequence number within the page. recent_text: typing.Deque[str] # Rotating buffer of recent text, for context. _lasthit: typing.FrozenSet[Annotation] # Annotations hit by the most recent character. _curline: typing.Set[Annotation] # Annotations hit somewhere on the current line. @@ -233,7 +233,7 @@ def __init__(self, rsrcmgr: PDFResourceManager, laparams: LAParams): def clear(self) -> None: """Reset our internal per-page state.""" self.charseq = 0 - self.lineseq = 0 + self.compseq = 0 self.recent_text.clear() self.context_subscribers.clear() self._lasthit = frozenset() @@ -261,13 +261,13 @@ def receive_layout(self, ltpage: LTPage) -> None: self.page = None - def update_lineseq(self, line: LTTextLine) -> None: + def update_pageseq(self, component: LTComponent) -> None: """Assign sequence numbers for objects on the page based on the nearest line of text.""" assert self.page is not None - self.lineseq += 1 + self.compseq += 1 for x in itertools.chain(self.page.annots, self.page.outlines): - x.update_pageseq(line, self.lineseq) + x.update_pageseq(component, self.compseq) def test_boxes(self, item: LTComponent) -> None: """Update the set of annotations whose boxes intersect with the area of the given item.""" @@ -344,9 +344,10 @@ def render(self, item: LTItem) -> None: Ref: https://pdfminersix.readthedocs.io/en/latest/topic/converting_pdf_to_text.html """ - # Assign sequence numbers to items on the page based on their proximity to lines of text. - if isinstance(item, LTTextLine): - self.update_lineseq(item) + # Assign sequence numbers to items on the page based on their proximity to lines of text or + # to figures (which may contain bare LTChar elements). + if isinstance(item, (LTTextLine, LTFigure)): + self.update_pageseq(item) # If it's a container, recurse on nested items. if isinstance(item, LTContainer): diff --git a/pdfannots/types.py b/pdfannots/types.py index 8bcfaf5..27c22f4 100644 --- a/pdfannots/types.py +++ b/pdfannots/types.py @@ -4,7 +4,7 @@ import logging import typing -from pdfminer.layout import LTComponent, LTText, LTTextLine +from pdfminer.layout import LTComponent, LTText from pdfminer.pdftypes import PDFObjRef from .utils import merge_lines @@ -199,15 +199,15 @@ def item_hit(self, item: LTComponent) -> bool: and self.y >= item.y0 and self.y <= item.y1) - def update_pageseq(self, line: LTTextLine, pageseq: int) -> None: - """If close-enough to the text line, adopt its sequence number.""" + def update_pageseq(self, component: LTComponent, pageseq: int) -> None: + """If close-enough to the given component, adopt its sequence number.""" assert pageseq > 0 - if self.item_hit(line): - # This pos is inside the line area + if self.item_hit(component): + # This pos is inside the component area self._pageseq = pageseq self._pageseq_distance = 0 else: - d = Box.from_item(line).square_of_distance_to_closest_point((self.x, self.y)) + d = Box.from_item(component).square_of_distance_to_closest_point((self.x, self.y)) if self._pageseq == 0 or self._pageseq_distance > d: self._pageseq = pageseq self._pageseq_distance = d @@ -226,10 +226,10 @@ def __lt__(self, other: typing.Any) -> bool: return self.pos < other.pos return NotImplemented - def update_pageseq(self, line: LTTextLine, pageseq: int) -> None: + def update_pageseq(self, component: LTComponent, pageseq: int) -> None: """Delegates to Pos.update_pageseq""" if self.pos is not None: - self.pos.update_pageseq(line, pageseq) + self.pos.update_pageseq(component, pageseq) class AnnotationType(enum.Enum):