ROB: Prevent excessive layout mode text output from Type3 fonts (#3082)

Partially addresses #3081. For Type3 fonts, check whether the font dictionary defines a '/ToUnicode' map. If no such map is present, check whether every glyph name in '/CharProcs' is a standard Adobe glyph name. If not, mark the font as 'uninterpretable' and skip collection of text content from every text operation associated with the font.
py-pdf · Jan 27, 2025 · 633d188 · 633d188
1 parent cf09861
commit 633d188
Show file tree

Hide file tree

Showing 3 changed files with 43 additions and 0 deletions.
diff --git a/pypdf/_text_extraction/_layout_mode/_fixed_width_page.py b/pypdf/_text_extraction/_layout_mode/_fixed_width_page.py
@@ -110,6 +110,8 @@ def recurs_to_target_op(
                 ):  # ... build text from new Tj operators
                     if strip_rotated and _tj.rotated:
                         continue
+                    if not _tj.font.interpretable:  # generates warning
+                        continue
                     # if the y position of the text is greater than the font height, assume
                     # the text is on a new line and start a new group
                     if abs(_tj.ty - last_ty) > _tj.font_height:
@@ -272,6 +274,7 @@ def text_show_operations(
     tj_debug: List[TextStateParams] = []  # Tj/TJ operator data (debug only)
     try:
         warned_rotation = False
+        warned_uninterpretable_font = False
         while True:
             operands, op = next(ops)
             if op in (b"BT", b"q"):
@@ -290,6 +293,12 @@ def text_show_operations(
                             "Rotated text discovered. Layout will be degraded.",
                             __name__,
                         )
+                if not warned_uninterpretable_font and any(not tj.font.interpretable for tj in tjs):
+                    warned_uninterpretable_font = True
+                    logger_warning(
+                        "PDF contains an uninterpretable font. Output will be incomplete.",
+                        __name__,
+                    )
                 bt_groups.extend(bts)
                 if debug:  # pragma: no cover
                     tj_debug.extend(tjs)

diff --git a/pypdf/_text_extraction/_layout_mode/_font.py b/pypdf/_text_extraction/_layout_mode/_font.py
@@ -3,6 +3,7 @@
 from dataclasses import dataclass, field
 from typing import Any, Dict, Sequence, Union, cast
 
+from ..._codecs import adobe_glyphs
 from ...errors import ParseError
 from ...generic import IndirectObject
 from ._font_widths import STANDARD_WIDTHS
@@ -19,6 +20,10 @@ class Font:
         encoding (str | Dict[int, str]): font encoding
         char_map (dict): character map
         font_dictionary (dict): font dictionary
+        width_map (Dict[str, int]): mapping of characters to widths
+        interpretable (bool): Default True. If False, the font glyphs cannot
+            be translated to characters, e.g. Type3 fonts that lack a
+            '/ToUnicode' mapping and use non-standard glyph names.
 
     """
 
@@ -28,8 +33,22 @@ class Font:
     char_map: Dict[Any, Any]
     font_dictionary: Dict[Any, Any]
     width_map: Dict[str, int] = field(default_factory=dict, init=False)
+    interpretable: bool = True
 
     def __post_init__(self) -> None:
+        # Type3 fonts that do not specify a "/ToUnicode" mapping cannot be
+        # reliably converted into character codes unless all named chars
+        # in /CharProcs map to a standard adobe glyph. See § 9.10.2 of the
+        # PDF 1.7 standard.
+        if self.subtype == "/Type3" and "/ToUnicode" not in self.font_dictionary:
+            self.interpretable = all(
+                cname in adobe_glyphs
+                for cname in self.font_dictionary.get("/CharProcs") or []
+            )
+
+        if not self.interpretable:  # save some overhead if font is not interpretable
+            return
+
         # TrueType fonts have a /Widths array mapping character codes to widths
         if isinstance(self.encoding, dict) and "/Widths" in self.font_dictionary:
             first_char = self.font_dictionary.get("/FirstChar", 0)

diff --git a/tests/test_text_extraction.py b/tests/test_text_extraction.py
@@ -128,9 +128,24 @@ def test_layout_mode_font_class_to_dict():
         "space_width": 8,
         "subtype": "foo",
         "width_map": {},
+        "interpretable": True,
     }
 
 
+@pytest.mark.enable_socket
+@patch("pypdf._text_extraction._layout_mode._fixed_width_page.logger_warning")
+def test_uninterpretable_type3_font(mock_logger_warning):
+    url = "https://github.com/user-attachments/files/18551904/UninterpretableType3Font.pdf"
+    name = "UninterpretableType3Font.pdf"
+    reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
+    page = reader.pages[0]
+    assert page.extract_text(extraction_mode="layout") == ""
+    mock_logger_warning.assert_called_with(
+        "PDF contains an uninterpretable font. Output will be incomplete.",
+        "pypdf._text_extraction._layout_mode._fixed_width_page"
+    )
+
+
 @pytest.mark.enable_socket
 def test_layout_mode_epic_page_fonts():
     url = "https://github.com/py-pdf/pypdf/files/13836944/Epic.Page.PDF"