Skip to content

Commit

Permalink
sort annotations by proximity to *either* LTTextLine *or* LTFigure components
Browse files Browse the repository at this point in the history

issue #48 demonstrates a PDF where all text is chars within a figure, and there
are no lines/boxes
  • Loading branch information
0xabu committed Nov 27, 2021
1 parent 0d230bf commit 9c6660a
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 18 deletions.
21 changes: 11 additions & 10 deletions pdfannots/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@

from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.layout import (
LAParams, LTContainer, LTAnno, LTChar, LTPage, LTTextBox, LTTextLine, LTItem, LTComponent)
from pdfminer.layout import (LAParams, LTAnno, LTChar, LTComponent, LTContainer, LTFigure, LTItem,
LTPage, LTTextBox, LTTextLine)
from pdfminer.converter import PDFLayoutAnalyzer
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines
Expand Down Expand Up @@ -213,7 +213,7 @@ class _PDFProcessor(PDFLayoutAnalyzer): # type:ignore

page: typing.Optional[Page] # Page being processed.
charseq: int # Character sequence number within the page.
lineseq: int # Line sequence number within the page.
compseq: int # Component sequence number within the page.
recent_text: typing.Deque[str] # Rotating buffer of recent text, for context.
_lasthit: typing.FrozenSet[Annotation] # Annotations hit by the most recent character.
_curline: typing.Set[Annotation] # Annotations hit somewhere on the current line.
Expand All @@ -233,7 +233,7 @@ def __init__(self, rsrcmgr: PDFResourceManager, laparams: LAParams):
def clear(self) -> None:
"""Reset our internal per-page state."""
self.charseq = 0
self.lineseq = 0
self.compseq = 0
self.recent_text.clear()
self.context_subscribers.clear()
self._lasthit = frozenset()
Expand Down Expand Up @@ -261,13 +261,13 @@ def receive_layout(self, ltpage: LTPage) -> None:

self.page = None

def update_lineseq(self, line: LTTextLine) -> None:
def update_pageseq(self, component: LTComponent) -> None:
"""Assign sequence numbers for objects on the page based on the nearest line of text."""
assert self.page is not None
self.lineseq += 1
self.compseq += 1

for x in itertools.chain(self.page.annots, self.page.outlines):
x.update_pageseq(line, self.lineseq)
x.update_pageseq(component, self.compseq)

def test_boxes(self, item: LTComponent) -> None:
"""Update the set of annotations whose boxes intersect with the area of the given item."""
Expand Down Expand Up @@ -344,9 +344,10 @@ def render(self, item: LTItem) -> None:
Ref: https://pdfminersix.readthedocs.io/en/latest/topic/converting_pdf_to_text.html
"""
# Assign sequence numbers to items on the page based on their proximity to lines of text.
if isinstance(item, LTTextLine):
self.update_lineseq(item)
# Assign sequence numbers to items on the page based on their proximity to lines of text or
# to figures (which may contain bare LTChar elements).
if isinstance(item, (LTTextLine, LTFigure)):
self.update_pageseq(item)

# If it's a container, recurse on nested items.
if isinstance(item, LTContainer):
Expand Down
16 changes: 8 additions & 8 deletions pdfannots/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import logging
import typing

from pdfminer.layout import LTComponent, LTText, LTTextLine
from pdfminer.layout import LTComponent, LTText
from pdfminer.pdftypes import PDFObjRef

from .utils import merge_lines
Expand Down Expand Up @@ -199,15 +199,15 @@ def item_hit(self, item: LTComponent) -> bool:
and self.y >= item.y0
and self.y <= item.y1)

def update_pageseq(self, line: LTTextLine, pageseq: int) -> None:
"""If close-enough to the text line, adopt its sequence number."""
def update_pageseq(self, component: LTComponent, pageseq: int) -> None:
"""If close-enough to the given component, adopt its sequence number."""
assert pageseq > 0
if self.item_hit(line):
# This pos is inside the line area
if self.item_hit(component):
# This pos is inside the component area
self._pageseq = pageseq
self._pageseq_distance = 0
else:
d = Box.from_item(line).square_of_distance_to_closest_point((self.x, self.y))
d = Box.from_item(component).square_of_distance_to_closest_point((self.x, self.y))
if self._pageseq == 0 or self._pageseq_distance > d:
self._pageseq = pageseq
self._pageseq_distance = d
Expand All @@ -226,10 +226,10 @@ def __lt__(self, other: typing.Any) -> bool:
return self.pos < other.pos
return NotImplemented

def update_pageseq(self, line: LTTextLine, pageseq: int) -> None:
def update_pageseq(self, component: LTComponent, pageseq: int) -> None:
"""Delegates to Pos.update_pageseq"""
if self.pos is not None:
self.pos.update_pageseq(line, pageseq)
self.pos.update_pageseq(component, pageseq)


class AnnotationType(enum.Enum):
Expand Down

0 comments on commit 9c6660a

Please sign in to comment.