Skip to content

Commit

Permalink
ROB: Prevent excessive layout mode text output from Type3 fonts (#3082)
Browse files Browse the repository at this point in the history
Partially addresses #3081 by checking for a '/ToUnicode' map in Type3 font dictionaries. If no such map is present, the font's glyph names are checked against the standard Adobe glyph names. If any glyph name is non-standard, the font is marked as 'uninterpretable' and text content from any text operations associated with that font is not collected.
  • Loading branch information
shartzog authored Jan 27, 2025
1 parent cf09861 commit 633d188
Show file tree
Hide file tree
Showing 3 changed files with 43 additions and 0 deletions.
9 changes: 9 additions & 0 deletions pypdf/_text_extraction/_layout_mode/_fixed_width_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,8 @@ def recurs_to_target_op(
): # ... build text from new Tj operators
if strip_rotated and _tj.rotated:
continue
if not _tj.font.interpretable: # generates warning
continue
# if the y position of the text is greater than the font height, assume
# the text is on a new line and start a new group
if abs(_tj.ty - last_ty) > _tj.font_height:
Expand Down Expand Up @@ -272,6 +274,7 @@ def text_show_operations(
tj_debug: List[TextStateParams] = [] # Tj/TJ operator data (debug only)
try:
warned_rotation = False
warned_uninterpretable_font = False
while True:
operands, op = next(ops)
if op in (b"BT", b"q"):
Expand All @@ -290,6 +293,12 @@ def text_show_operations(
"Rotated text discovered. Layout will be degraded.",
__name__,
)
if not warned_uninterpretable_font and any(not tj.font.interpretable for tj in tjs):
warned_uninterpretable_font = True
logger_warning(
"PDF contains an uninterpretable font. Output will be incomplete.",
__name__,
)
bt_groups.extend(bts)
if debug: # pragma: no cover
tj_debug.extend(tjs)
Expand Down
19 changes: 19 additions & 0 deletions pypdf/_text_extraction/_layout_mode/_font.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from dataclasses import dataclass, field
from typing import Any, Dict, Sequence, Union, cast

from ..._codecs import adobe_glyphs
from ...errors import ParseError
from ...generic import IndirectObject
from ._font_widths import STANDARD_WIDTHS
Expand All @@ -19,6 +20,10 @@ class Font:
encoding (str | Dict[int, str]): font encoding
char_map (dict): character map
font_dictionary (dict): font dictionary
width_map (Dict[str, int]): mapping of characters to widths
interpretable (bool): Default True. If False, the font glyphs cannot
be translated to characters, e.g. Type3 fonts that do not define
a '/ToUnicode' mapping.
"""

Expand All @@ -28,8 +33,22 @@ class Font:
char_map: Dict[Any, Any]
font_dictionary: Dict[Any, Any]
width_map: Dict[str, int] = field(default_factory=dict, init=False)
interpretable: bool = True

def __post_init__(self) -> None:
# Type3 fonts that do not specify a "/ToUnicode" mapping cannot be
# reliably converted into character codes unless all named chars
# in /CharProcs map to a standard adobe glyph. See § 9.10.2 of the
# PDF 1.7 standard.
if self.subtype == "/Type3" and "/ToUnicode" not in self.font_dictionary:
self.interpretable = all(
cname in adobe_glyphs
for cname in self.font_dictionary.get("/CharProcs") or []
)

if not self.interpretable: # save some overhead if font is not interpretable
return

# TrueType fonts have a /Widths array mapping character codes to widths
if isinstance(self.encoding, dict) and "/Widths" in self.font_dictionary:
first_char = self.font_dictionary.get("/FirstChar", 0)
Expand Down
15 changes: 15 additions & 0 deletions tests/test_text_extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,9 +128,24 @@ def test_layout_mode_font_class_to_dict():
"space_width": 8,
"subtype": "foo",
"width_map": {},
"interpretable": True,
}


@pytest.mark.enable_socket
@patch("pypdf._text_extraction._layout_mode._fixed_width_page.logger_warning")
def test_uninterpretable_type3_font(mock_logger_warning):
    """Layout mode returns no text and logs a warning for the sample PDF."""
    # Fetch the sample document (requires network access, hence enable_socket).
    pdf_bytes = get_data_from_url(
        "https://github.com/user-attachments/files/18551904/UninterpretableType3Font.pdf",
        name="UninterpretableType3Font.pdf",
    )
    first_page = PdfReader(BytesIO(pdf_bytes)).pages[0]

    # The first page is expected to produce an empty layout-mode extraction.
    extracted = first_page.extract_text(extraction_mode="layout")
    assert extracted == ""

    # The patched logger must have been called with the uninterpretable-font
    # message and the emitting module's name.
    mock_logger_warning.assert_called_with(
        "PDF contains an uninterpretable font. Output will be incomplete.",
        "pypdf._text_extraction._layout_mode._fixed_width_page",
    )


@pytest.mark.enable_socket
def test_layout_mode_epic_page_fonts():
url = "https://github.com/py-pdf/pypdf/files/13836944/Epic.Page.PDF"
Expand Down

0 comments on commit 633d188

Please sign in to comment.