From 633d18815d86c76a07a867dddb092093bf6ba242 Mon Sep 17 00:00:00 2001 From: shartzog <47760929+shartzog@users.noreply.github.com> Date: Mon, 27 Jan 2025 11:04:55 -0500 Subject: [PATCH] ROB: Prevent excessive layout mode text output from Type3 fonts (#3082) Partially addresses #3081 by checking for a '/ToUnicode' map in Type3 font dictionaries. If no such map is present, check to see if the font is using standard Adobe glyph names. If not, mark the font as 'uninterpretable' and prevent collection of text content from any text operations associated with the font. --- .../_layout_mode/_fixed_width_page.py | 9 +++++++++ pypdf/_text_extraction/_layout_mode/_font.py | 19 +++++++++++++++++++ tests/test_text_extraction.py | 15 +++++++++++++++ 3 files changed, 43 insertions(+) diff --git a/pypdf/_text_extraction/_layout_mode/_fixed_width_page.py b/pypdf/_text_extraction/_layout_mode/_fixed_width_page.py index 4038dfab9..30e501f27 100644 --- a/pypdf/_text_extraction/_layout_mode/_fixed_width_page.py +++ b/pypdf/_text_extraction/_layout_mode/_fixed_width_page.py @@ -110,6 +110,8 @@ def recurs_to_target_op( ): # ... build text from new Tj operators if strip_rotated and _tj.rotated: continue + if not _tj.font.interpretable: # generates warning + continue # if the y position of the text is greater than the font height, assume # the text is on a new line and start a new group if abs(_tj.ty - last_ty) > _tj.font_height: @@ -272,6 +274,7 @@ def text_show_operations( tj_debug: List[TextStateParams] = [] # Tj/TJ operator data (debug only) try: warned_rotation = False + warned_uninterpretable_font = False while True: operands, op = next(ops) if op in (b"BT", b"q"): @@ -290,6 +293,12 @@ def text_show_operations( "Rotated text discovered. Layout will be degraded.", __name__, ) + if not warned_uninterpretable_font and any(not tj.font.interpretable for tj in tjs): + warned_uninterpretable_font = True + logger_warning( + "PDF contains an uninterpretable font. 
Output will be incomplete.", + __name__, + ) bt_groups.extend(bts) if debug: # pragma: no cover tj_debug.extend(tjs) diff --git a/pypdf/_text_extraction/_layout_mode/_font.py b/pypdf/_text_extraction/_layout_mode/_font.py index 4a9b27cad..08946f1d0 100644 --- a/pypdf/_text_extraction/_layout_mode/_font.py +++ b/pypdf/_text_extraction/_layout_mode/_font.py @@ -3,6 +3,7 @@ from dataclasses import dataclass, field from typing import Any, Dict, Sequence, Union, cast +from ..._codecs import adobe_glyphs from ...errors import ParseError from ...generic import IndirectObject from ._font_widths import STANDARD_WIDTHS @@ -19,6 +20,10 @@ class Font: encoding (str | Dict[int, str]): font encoding char_map (dict): character map font_dictionary (dict): font dictionary + width_map (Dict[str, int]): mapping of characters to widths + interpretable (bool): Default True. If False, the font glyphs cannot + be translated to characters, e.g. Type3 fonts that do not define + a '/ToUnicode' mapping. """ @@ -28,8 +33,22 @@ class Font: char_map: Dict[Any, Any] font_dictionary: Dict[Any, Any] width_map: Dict[str, int] = field(default_factory=dict, init=False) + interpretable: bool = True def __post_init__(self) -> None: + # Type3 fonts that do not specify a "/ToUnicode" mapping cannot be + # reliably converted into character codes unless all named chars + # in /CharProcs map to a standard adobe glyph. See § 9.10.2 of the + # PDF 1.7 standard. 
+ if self.subtype == "/Type3" and "/ToUnicode" not in self.font_dictionary: + self.interpretable = all( + cname in adobe_glyphs + for cname in self.font_dictionary.get("/CharProcs") or [] + ) + + if not self.interpretable: # save some overhead if font is not interpretable + return + # TrueType fonts have a /Widths array mapping character codes to widths if isinstance(self.encoding, dict) and "/Widths" in self.font_dictionary: first_char = self.font_dictionary.get("/FirstChar", 0) diff --git a/tests/test_text_extraction.py b/tests/test_text_extraction.py index 72a598363..729d00157 100644 --- a/tests/test_text_extraction.py +++ b/tests/test_text_extraction.py @@ -128,9 +128,24 @@ def test_layout_mode_font_class_to_dict(): "space_width": 8, "subtype": "foo", "width_map": {}, + "interpretable": True, } +@pytest.mark.enable_socket +@patch("pypdf._text_extraction._layout_mode._fixed_width_page.logger_warning") +def test_uninterpretable_type3_font(mock_logger_warning): + url = "https://github.com/user-attachments/files/18551904/UninterpretableType3Font.pdf" + name = "UninterpretableType3Font.pdf" + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + page = reader.pages[0] + assert page.extract_text(extraction_mode="layout") == "" + mock_logger_warning.assert_called_with( + "PDF contains an uninterpretable font. Output will be incomplete.", + "pypdf._text_extraction._layout_mode._fixed_width_page" + ) + + @pytest.mark.enable_socket def test_layout_mode_epic_page_fonts(): url = "https://github.com/py-pdf/pypdf/files/13836944/Epic.Page.PDF"