From 633d18815d86c76a07a867dddb092093bf6ba242 Mon Sep 17 00:00:00 2001
From: shartzog <47760929+shartzog@users.noreply.github.com>
Date: Mon, 27 Jan 2025 11:04:55 -0500
Subject: [PATCH] ROB: Prevent excessive layout mode text output from Type3
 fonts (#3082)

Partially addresses #3081. For Type3 fonts, check whether the font dictionary contains a '/ToUnicode' map. If no such map is present, check whether every glyph name in '/CharProcs' is a standard Adobe glyph name. If not, mark the font as 'uninterpretable', skip the text content of every text operation that uses the font, and log a warning that the output will be incomplete.
---
 .../_layout_mode/_fixed_width_page.py         |  9 +++++++++
 pypdf/_text_extraction/_layout_mode/_font.py  | 19 +++++++++++++++++++
 tests/test_text_extraction.py                 | 15 +++++++++++++++
 3 files changed, 43 insertions(+)

diff --git a/pypdf/_text_extraction/_layout_mode/_fixed_width_page.py b/pypdf/_text_extraction/_layout_mode/_fixed_width_page.py
index 4038dfab9..30e501f27 100644
--- a/pypdf/_text_extraction/_layout_mode/_fixed_width_page.py
+++ b/pypdf/_text_extraction/_layout_mode/_fixed_width_page.py
@@ -110,6 +110,8 @@ def recurs_to_target_op(
                 ):  # ... build text from new Tj operators
                     if strip_rotated and _tj.rotated:
                         continue
+                    if not _tj.font.interpretable:  # skipped; text_show_operations logs the warning
+                        continue
                     # if the y position of the text is greater than the font height, assume
                     # the text is on a new line and start a new group
                     if abs(_tj.ty - last_ty) > _tj.font_height:
@@ -272,6 +274,7 @@ def text_show_operations(
     tj_debug: List[TextStateParams] = []  # Tj/TJ operator data (debug only)
     try:
         warned_rotation = False
+        warned_uninterpretable_font = False
         while True:
             operands, op = next(ops)
             if op in (b"BT", b"q"):
@@ -290,6 +293,12 @@ def text_show_operations(
                             "Rotated text discovered. Layout will be degraded.",
                             __name__,
                         )
+                if not warned_uninterpretable_font and any(not tj.font.interpretable for tj in tjs):
+                    warned_uninterpretable_font = True
+                    logger_warning(
+                        "PDF contains an uninterpretable font. Output will be incomplete.",
+                        __name__,
+                    )
                 bt_groups.extend(bts)
                 if debug:  # pragma: no cover
                     tj_debug.extend(tjs)
diff --git a/pypdf/_text_extraction/_layout_mode/_font.py b/pypdf/_text_extraction/_layout_mode/_font.py
index 4a9b27cad..08946f1d0 100644
--- a/pypdf/_text_extraction/_layout_mode/_font.py
+++ b/pypdf/_text_extraction/_layout_mode/_font.py
@@ -3,6 +3,7 @@
 from dataclasses import dataclass, field
 from typing import Any, Dict, Sequence, Union, cast
 
+from ..._codecs import adobe_glyphs
 from ...errors import ParseError
 from ...generic import IndirectObject
 from ._font_widths import STANDARD_WIDTHS
@@ -19,6 +20,10 @@ class Font:
         encoding (str | Dict[int, str]): font encoding
         char_map (dict): character map
         font_dictionary (dict): font dictionary
+        width_map (Dict[str, int]): mapping of characters to widths
+        interpretable (bool): Default True. If False, the font glyphs cannot
+            be translated to characters, e.g. Type3 fonts with no '/ToUnicode'
+            mapping whose '/CharProcs' names are not all standard Adobe glyphs.
 
     """
 
@@ -28,8 +33,22 @@ class Font:
     char_map: Dict[Any, Any]
     font_dictionary: Dict[Any, Any]
     width_map: Dict[str, int] = field(default_factory=dict, init=False)
+    interpretable: bool = True
 
     def __post_init__(self) -> None:
+        # Type3 fonts that do not specify a "/ToUnicode" mapping cannot be
+        # reliably converted into character codes unless all named chars
+        # in /CharProcs map to a standard adobe glyph. See § 9.10.2 of the
+        # PDF 1.7 standard.
+        if self.subtype == "/Type3" and "/ToUnicode" not in self.font_dictionary:
+            self.interpretable = all(
+                cname in adobe_glyphs
+                for cname in self.font_dictionary.get("/CharProcs") or []
+            )
+
+        if not self.interpretable:  # save some overhead if font is not interpretable
+            return
+
         # TrueType fonts have a /Widths array mapping character codes to widths
         if isinstance(self.encoding, dict) and "/Widths" in self.font_dictionary:
             first_char = self.font_dictionary.get("/FirstChar", 0)
diff --git a/tests/test_text_extraction.py b/tests/test_text_extraction.py
index 72a598363..729d00157 100644
--- a/tests/test_text_extraction.py
+++ b/tests/test_text_extraction.py
@@ -128,9 +128,24 @@ def test_layout_mode_font_class_to_dict():
         "space_width": 8,
         "subtype": "foo",
         "width_map": {},
+        "interpretable": True,
     }
 
 
+@pytest.mark.enable_socket
+@patch("pypdf._text_extraction._layout_mode._fixed_width_page.logger_warning")
+def test_uninterpretable_type3_font(mock_logger_warning):
+    url = "https://github.com/user-attachments/files/18551904/UninterpretableType3Font.pdf"
+    name = "UninterpretableType3Font.pdf"
+    reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
+    page = reader.pages[0]
+    assert page.extract_text(extraction_mode="layout") == ""
+    mock_logger_warning.assert_called_with(
+        "PDF contains an uninterpretable font. Output will be incomplete.",
+        "pypdf._text_extraction._layout_mode._fixed_width_page"
+    )
+
+
 @pytest.mark.enable_socket
 def test_layout_mode_epic_page_fonts():
     url = "https://github.com/py-pdf/pypdf/files/13836944/Epic.Page.PDF"