Skip to content

Commit

Permalink
BUG: Warn when visitor* arguments are ignored (#2845)
Browse files Browse the repository at this point in the history
The visitor_* function arguments are silently ignored when
extraction_mode="layout". Document this more clearly and emit a warning
whenever one of these arguments is passed but ignored.

Closes #2840.
  • Loading branch information
kaos-ocs authored Sep 14, 2024
1 parent c4e95bd commit 78baa8f
Show file tree
Hide file tree
Showing 2 changed files with 30 additions and 0 deletions.
14 changes: 14 additions & 0 deletions pypdf/_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -2172,19 +2172,24 @@ def extract_text(
default = (0, 90, 180, 270)
note: currently only 0 (up), 90 (turned left), 180 (upside down),
270 (turned right)
Silently ignored in "layout" mode.
space_width: force default space width
if not extracted from font (default: 200)
Silently ignored in "layout" mode.
visitor_operand_before: function to be called before processing an operation.
It has four arguments: operator, operand-arguments,
current transformation matrix and text matrix.
Ignored with a warning in "layout" mode.
visitor_operand_after: function to be called after processing an operation.
It has four arguments: operator, operand-arguments,
current transformation matrix and text matrix.
Ignored with a warning in "layout" mode.
visitor_text: function to be called when extracting some text at some position.
It has five arguments: text, current transformation matrix,
text matrix, font-dictionary and font-size.
The font-dictionary may be None in case of unknown fonts.
If not None it may e.g. contain key "/BaseFont" with value "/Arial,Bold".
Ignored with a warning in "layout" mode.
extraction_mode (Literal["plain", "layout"]): "plain" for legacy functionality,
"layout" for experimental layout mode functionality.
NOTE: orientations, space_width, and visitor_* parameters are NOT respected
Expand Down Expand Up @@ -2213,6 +2218,15 @@ def extract_text(
if extraction_mode not in ["plain", "layout"]:
raise ValueError(f"Invalid text extraction mode '{extraction_mode}'")
if extraction_mode == "layout":
for visitor in ("visitor_operand_before",
"visitor_operand_after",
"visitor_text",
):
if locals()[visitor]:
logger_warning(
f"Argument {visitor} is ignored in layout mode",
__name__,
)
return self._layout_mode_text(
space_vertically=kwargs.get("layout_mode_space_vertically", True),
scale_weight=kwargs.get("layout_mode_scale_weight", 1.25),
Expand Down
16 changes: 16 additions & 0 deletions tests/test_text_extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
"""
from io import BytesIO
from pathlib import Path
from unittest.mock import patch

import pytest

Expand Down Expand Up @@ -173,3 +174,18 @@ def test_layout_mode_indirect_sequence_font_widths():
with pytest.raises(ParseError) as exc:
reader.pages[0].extract_text(extraction_mode="layout")
assert str(exc.value).startswith("Invalid font width definition")

def dummy_visitor_text(text, ctm, tm, fd, fs):
    """No-op stand-in for a ``visitor_text`` callback.

    Accepts the five positional arguments of a text visitor (text, current
    transformation matrix, text matrix, font dictionary and font size),
    ignores all of them and returns ``None``.
    """
    return None

@patch("pypdf._page.logger_warning")
def test_layout_mode_warnings(mock_logger_warning):
    """Check that an ignored ``visitor_text`` argument triggers a warning.

    ``pypdf._page.logger_warning`` is replaced by a mock so the calls made
    while extracting text can be inspected.
    """
    pdf = PdfReader(RESOURCE_ROOT / "hello-world.pdf")
    first_page = pdf.pages[0]

    # Plain mode: passing a visitor must not produce any warning.
    first_page.extract_text(extraction_mode="plain", visitor_text=dummy_visitor_text)
    mock_logger_warning.assert_not_called()

    # Layout mode: the visitor is ignored, so the most recent logger call
    # must be the "ignored" warning for that argument.
    first_page.extract_text(extraction_mode="layout", visitor_text=dummy_visitor_text)
    expected_args = (
        "Argument visitor_text is ignored in layout mode",
        "pypdf._page",
    )
    mock_logger_warning.assert_called_with(*expected_args)

0 comments on commit 78baa8f

Please sign in to comment.