Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ROB: Prevent excessive layout mode text output from Type3 fonts #3082

Merged
merged 5 commits into from
Jan 27, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions pypdf/_text_extraction/_layout_mode/_fixed_width_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,8 @@ def recurs_to_target_op(
): # ... build text from new Tj operators
if strip_rotated and _tj.rotated:
continue
if not _tj.font.interpretable: # generates warning
continue
# if the y position of the text differs from the previous y position by more
# than the font height, assume the text is on a new line and start a new group
if abs(_tj.ty - last_ty) > _tj.font_height:
Expand Down Expand Up @@ -272,6 +274,7 @@ def text_show_operations(
tj_debug: List[TextStateParams] = [] # Tj/TJ operator data (debug only)
try:
warned_rotation = False
warned_uninterpretable_font = False
while True:
operands, op = next(ops)
if op in (b"BT", b"q"):
Expand All @@ -290,6 +293,12 @@ def text_show_operations(
"Rotated text discovered. Layout will be degraded.",
__name__,
)
if not warned_uninterpretable_font and any(not tj.font.interpretable for tj in tjs):
warned_uninterpretable_font = True
logger_warning(
"PDF contains an uninterpretable font. Output will be incomplete.",
__name__,
)
bt_groups.extend(bts)
if debug: # pragma: no cover
tj_debug.extend(tjs)
Expand Down
19 changes: 19 additions & 0 deletions pypdf/_text_extraction/_layout_mode/_font.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from dataclasses import dataclass, field
from typing import Any, Dict, Sequence, Union, cast

from ..._codecs import adobe_glyphs
from ...errors import ParseError
from ...generic import IndirectObject
from ._font_widths import STANDARD_WIDTHS
Expand All @@ -19,6 +20,10 @@ class Font:
encoding (str | Dict[int, str]): font encoding
char_map (dict): character map
font_dictionary (dict): font dictionary
width_map (Dict[str, int]): mapping of characters to widths
interpretable (bool): Default True. If False, the font glyphs cannot
be translated to characters, e.g. Type3 fonts that do not define
a '/ToUnicode' mapping.

"""

Expand All @@ -28,8 +33,22 @@ class Font:
char_map: Dict[Any, Any]
font_dictionary: Dict[Any, Any]
width_map: Dict[str, int] = field(default_factory=dict, init=False)
interpretable: bool = True

def __post_init__(self) -> None:
# Type3 fonts that do not specify a "/ToUnicode" mapping cannot be
# reliably translated to characters unless all named chars in
# /CharProcs map to a standard Adobe glyph name. See § 9.10.2 of the
# PDF 1.7 standard.
if self.subtype == "/Type3" and "/ToUnicode" not in self.font_dictionary:
self.interpretable = all(
cname in adobe_glyphs
for cname in self.font_dictionary.get("/CharProcs") or []
)

if not self.interpretable: # save some overhead if font is not interpretable
return

# TrueType fonts have a /Widths array mapping character codes to widths
if isinstance(self.encoding, dict) and "/Widths" in self.font_dictionary:
first_char = self.font_dictionary.get("/FirstChar", 0)
Expand Down
15 changes: 15 additions & 0 deletions tests/test_text_extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,9 +128,24 @@ def test_layout_mode_font_class_to_dict():
"space_width": 8,
"subtype": "foo",
"width_map": {},
"interpretable": True,
}


@pytest.mark.enable_socket
@patch("pypdf._text_extraction._layout_mode._fixed_width_page.logger_warning")
def test_uninterpretable_type3_font(mock_logger_warning):
    """Layout mode yields no text and warns for an uninterpretable Type3 font."""
    pdf_bytes = get_data_from_url(
        "https://github.com/user-attachments/files/18551904/UninterpretableType3Font.pdf",
        name="UninterpretableType3Font.pdf",
    )
    first_page = PdfReader(BytesIO(pdf_bytes)).pages[0]

    # Nothing on the page can be translated to text, so the output is empty.
    layout_text = first_page.extract_text(extraction_mode="layout")
    assert layout_text == ""

    # The warning is logged under the patched module's name.
    mock_logger_warning.assert_called_with(
        "PDF contains an uninterpretable font. Output will be incomplete.",
        "pypdf._text_extraction._layout_mode._fixed_width_page",
    )


@pytest.mark.enable_socket
def test_layout_mode_epic_page_fonts():
url = "https://github.com/py-pdf/pypdf/files/13836944/Epic.Page.PDF"
Expand Down
Loading