py-pdf · stefan6419846 · Jan 27, 2025 · Jan 26, 2025 · Jan 27, 2025 · Jan 27, 2025
diff --git a/pypdf/_text_extraction/_layout_mode/_fixed_width_page.py b/pypdf/_text_extraction/_layout_mode/_fixed_width_page.py
@@ -110,6 +110,8 @@ def recurs_to_target_op(
                 ):  # ... build text from new Tj operators
                     if strip_rotated and _tj.rotated:
                         continue
+                    if not _tj.font.interpretable:  # skipped; text_show_operations logs the warning
+                        continue
                     # if the y position of the text is greater than the font height, assume
                     # the text is on a new line and start a new group
                     if abs(_tj.ty - last_ty) > _tj.font_height:
@@ -272,6 +274,7 @@ def text_show_operations(
     tj_debug: List[TextStateParams] = []  # Tj/TJ operator data (debug only)
     try:
         warned_rotation = False
+        warned_uninterpretable_font = False
         while True:
             operands, op = next(ops)
             if op in (b"BT", b"q"):
@@ -290,6 +293,12 @@ def text_show_operations(
                             "Rotated text discovered. Layout will be degraded.",
                             __name__,
                         )
+                if not warned_uninterpretable_font and any(not tj.font.interpretable for tj in tjs):
+                    warned_uninterpretable_font = True
+                    logger_warning(
+                        "PDF contains an uninterpretable font. Output will be incomplete.",
+                        __name__,
+                    )
                 bt_groups.extend(bts)
                 if debug:  # pragma: no cover
                     tj_debug.extend(tjs)

diff --git a/pypdf/_text_extraction/_layout_mode/_font.py b/pypdf/_text_extraction/_layout_mode/_font.py
@@ -3,6 +3,7 @@
 from dataclasses import dataclass, field
 from typing import Any, Dict, Sequence, Union, cast
 
+from ..._codecs import adobe_glyphs
 from ...errors import ParseError
 from ...generic import IndirectObject
 from ._font_widths import STANDARD_WIDTHS
@@ -19,6 +20,10 @@ class Font:
         encoding (str | Dict[int, str]): font encoding
         char_map (dict): character map
         font_dictionary (dict): font dictionary
+        width_map (Dict[str, int]): mapping of characters to widths
+        interpretable (bool): Default True. If False, the font glyphs cannot
+            be translated to characters, e.g. Type3 fonts with no '/ToUnicode'
+            mapping and non-standard glyph names in '/CharProcs'.
 
     """
 
@@ -28,8 +33,22 @@ class Font:
     char_map: Dict[Any, Any]
     font_dictionary: Dict[Any, Any]
     width_map: Dict[str, int] = field(default_factory=dict, init=False)
+    interpretable: bool = True
 
     def __post_init__(self) -> None:
+        # Type3 fonts that do not specify a "/ToUnicode" mapping cannot be
+        # reliably converted into Unicode characters unless every glyph name
+        # in /CharProcs is a standard Adobe glyph name. See § 9.10.2 of the
+        # PDF 1.7 standard.
+        if self.subtype == "/Type3" and "/ToUnicode" not in self.font_dictionary:
+            self.interpretable = all(
+                cname in adobe_glyphs
+                for cname in self.font_dictionary.get("/CharProcs") or []
+            )
+
+        if not self.interpretable:  # save some overhead if font is not interpretable
+            return
+
         # TrueType fonts have a /Widths array mapping character codes to widths
         if isinstance(self.encoding, dict) and "/Widths" in self.font_dictionary:
             first_char = self.font_dictionary.get("/FirstChar", 0)

diff --git a/tests/test_text_extraction.py b/tests/test_text_extraction.py
@@ -128,9 +128,24 @@ def test_layout_mode_font_class_to_dict():
         "space_width": 8,
         "subtype": "foo",
         "width_map": {},
+        "interpretable": True,
     }
 
 
+@pytest.mark.enable_socket  # the sample PDF is fetched through get_data_from_url
+@patch("pypdf._text_extraction._layout_mode._fixed_width_page.logger_warning")
+def test_uninterpretable_type3_font(mock_logger_warning):  # layout mode: uninterpretable font -> no text, one warning
+    url = "https://github.com/user-attachments/files/18551904/UninterpretableType3Font.pdf"
+    name = "UninterpretableType3Font.pdf"
+    reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
+    page = reader.pages[0]
+    assert page.extract_text(extraction_mode="layout") == ""  # text shown with an uninterpretable font is skipped
+    mock_logger_warning.assert_called_with(  # asserts on the last call to the patched logger_warning
+        "PDF contains an uninterpretable font. Output will be incomplete.",
+        "pypdf._text_extraction._layout_mode._fixed_width_page"
+    )
+
+
 @pytest.mark.enable_socket
 def test_layout_mode_epic_page_fonts():
     url = "https://github.com/py-pdf/pypdf/files/13836944/Epic.Page.PDF"