Skip to content

Commit 3e17b82

Browse files
authored
Merge pull request #312 from pymupdf/v0.1.7
Version 0.1.7
2 parents 09093c3 + 8ea59e6 commit 3e17b82

File tree

9 files changed

+1833
-25
lines changed

9 files changed

+1833
-25
lines changed

pdf4llm/setup.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,26 +6,29 @@
66
with open(os.path.join(setup_py_cwd, "README.md"), encoding="utf-8") as f:
77
readme = f.read()
88

9+
version = "0.1.7"
10+
911
classifiers = [
1012
"Development Status :: 5 - Production/Stable",
1113
"Environment :: Console",
1214
"Intended Audience :: Developers",
1315
"Programming Language :: Python :: 3",
1416
"Topic :: Utilities",
1517
]
16-
requires = ["pymupdf4llm==0.0.28"]
18+
19+
requires = [f"pymupdf4llm=={version}"]
1720

1821
setuptools.setup(
1922
name="pdf4llm",
20-
version="0.0.28",
23+
version=version,
2124
author="Artifex",
2225
author_email="[email protected]",
2326
description="PyMuPDF Utilities for LLM/RAG",
2427
packages=setuptools.find_packages(),
2528
long_description=readme,
2629
long_description_content_type="text/markdown",
2730
install_requires=requires,
28-
python_requires=">=3.9",
31+
python_requires=">=3.10",
2932
license="Dual Licensed - GNU AFFERO GPL 3.0 or Artifex Commercial License",
3033
url="https://github.com/pymupdf/RAG",
3134
classifiers=classifiers,

pymupdf4llm/pymupdf4llm/__init__.py

Lines changed: 121 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,132 @@
1-
import pymupdf
2-
from .helpers.pymupdf_rag import IdentifyHeaders, TocHeaders, to_markdown
1+
try:
2+
import pymupdf.layout
3+
except ImportError:
4+
import pymupdf
5+
36
from .versions_file import MINIMUM_PYMUPDF_VERSION, VERSION
47

58
if tuple(map(int, pymupdf.__version__.split("."))) < MINIMUM_PYMUPDF_VERSION:
6-
raise ImportError(f"Requires PyMuPDF v. {MINIMUM_PYMUPDF_VERSION}, but you have {pymupdf.__version__}")
9+
raise ImportError(
10+
f"Requires PyMuPDF v. {MINIMUM_PYMUPDF_VERSION}, but you have {pymupdf.__version__}"
11+
)
712

813
__version__ = VERSION
914
version = VERSION
1015
version_tuple = tuple(map(int, version.split(".")))
1116

17+
if callable(pymupdf._get_layout):
    # Layout-based code path: the public API is defined here as thin
    # wrappers around the document_layout helper module.
    from .helpers import document_layout as DL

    def parse_document(
        doc,
        filename="",
        image_dpi=150,
        image_format="png",
        image_path="",
        pages=None,
    ):
        """Parse a document via ``document_layout.parse_document``.

        All arguments are forwarded unchanged (the options as keywords).
        Returns whatever the helper returns.
        """
        return DL.parse_document(
            doc,
            filename=filename,
            image_dpi=image_dpi,
            image_format=image_format,
            image_path=image_path,
            pages=pages,
        )

    def to_markdown(
        doc,
        *,
        header=True,
        footer=True,
        pages=None,
        hdr_info=None,
        write_images=False,
        embed_images=False,
        ignore_images=False,
        ignore_graphics=False,
        detect_bg_color=True,
        image_path="",
        image_format="png",
        image_size_limit=0.05,
        filename="",
        force_text=True,
        page_chunks=False,
        page_separators=False,
        margins=0,
        dpi=150,
        page_width=612,
        page_height=None,
        table_strategy="lines_strict",
        graphics_limit=None,
        fontsize_limit=3,
        ignore_code=False,
        extract_words=False,
        show_progress=False,
        use_glyphs=False,
        ignore_alpha=False,
    ):
        """Parse ``doc`` and render the parsed document as Markdown.

        Only ``filename``, ``dpi`` (passed on as ``image_dpi``),
        ``image_format``, ``image_path`` and ``pages`` reach
        ``parse_document``; only ``header``, ``footer``, ``write_images``,
        ``embed_images`` and ``ignore_code`` reach the parsed document's
        ``to_markdown``. Every other keyword is accepted but not used in
        this code path.
        """
        layout_doc = parse_document(
            doc,
            filename=filename,
            image_dpi=dpi,
            image_format=image_format,
            image_path=image_path,
            pages=pages,
        )
        return layout_doc.to_markdown(
            header=header,
            footer=footer,
            write_images=write_images,
            embed_images=embed_images,
            ignore_code=ignore_code,
        )

    def to_json(
        doc,
        header=True,
        footer=True,
        image_dpi=150,
        image_format="png",
        image_path="",
        pages=None,
    ):
        """Parse ``doc`` and return the parsed document's ``to_json()`` result.

        ``header`` and ``footer`` are accepted but not used here; the
        remaining options are forwarded to ``parse_document``.
        """
        layout_doc = parse_document(
            doc,
            image_dpi=image_dpi,
            image_format=image_format,
            image_path=image_path,
            pages=pages,
        )
        return layout_doc.to_json()

    def to_text(
        doc,
        filename="",
        header=True,
        footer=True,
        pages=None,
        ignore_code=False,
    ):
        """Parse ``doc`` and return the parsed document's ``to_text()`` result.

        The image options for parsing are fixed (150 dpi, "png", empty
        image path); ``header``, ``footer`` and ``ignore_code`` are passed
        to ``to_text``.
        """
        layout_doc = parse_document(
            doc,
            filename=filename,
            image_dpi=150,
            image_format="png",
            image_path="",
            pages=pages,
        )
        return layout_doc.to_text(
            header=header,
            footer=footer,
            ignore_code=ignore_code,
        )

else:
    # No callable layout hook: expose the pymupdf_rag implementation.
    from .helpers.pymupdf_rag import IdentifyHeaders, TocHeaders, to_markdown

    pymupdf._warn_layout_once()  # recommend pymupdf_layout
129+
12130

13131
def LlamaMarkdownReader(*args, **kwargs):
14132
from .llama import pdf_markdown_reader
Lines changed: 204 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,204 @@
1+
import pymupdf # PyMuPDF
2+
import numpy as np
3+
import cv2
4+
5+
6+
# Characters treated as white space: every code point below 33 (control
# characters and the ASCII space) plus a selection of Unicode spaces.
WHITE_CHARS = {chr(code) for code in range(33)} | {
    "\u00a0",  # Non-breaking space
    "\u2000",  # En quad
    "\u2001",  # Em quad
    "\u2002",  # En space
    "\u2003",  # Em space
    "\u2004",  # Three-per-em space
    "\u2005",  # Four-per-em space
    "\u2006",  # Six-per-em space
    "\u2007",  # Figure space
    "\u2008",  # Punctuation space
    "\u2009",  # Thin space
    "\u200a",  # Hair space
    "\u202f",  # Narrow no-break space
    "\u205f",  # Medium mathematical space
    "\u3000",  # Ideographic space
}
26+
27+
28+
def detect_qr_codes(img):
    """Detect and decode a QR code in an image using OpenCV.

    Args:
        img: Image handed unchanged to
            ``cv2.QRCodeDetector().detectAndDecode``.

    Returns:
        A dict ``{"data": <decoded string>, "bbox": <nested list of ints>}``
        when the detector returned points and a non-empty payload,
        otherwise ``None``.
    """
    payload, corners, _ = cv2.QRCodeDetector().detectAndDecode(img)
    if corners is None or not payload:
        return None
    # Only the first entry of the returned points array is reported.
    return {"data": payload, "bbox": corners[0].astype(int).tolist()}
36+
37+
38+
def detect_barcodes(img):
    """Decode all barcodes found in an image using pyzbar.

    Args:
        img: Image handed unchanged to ``pyzbar.pyzbar.decode``.

    Returns:
        A list with one dict per decoded barcode, each holding
        ``"type"`` (the barcode type reported by pyzbar), ``"data"``
        (the payload decoded as UTF-8) and ``"bbox"`` (a list of
        ``(x, y)`` tuples taken from the barcode polygon).

    Raises:
        ImportError: If pyzbar cannot be imported. The original import
            failure is attached as the exception's cause.
    """
    # pyzbar is an optional dependency, so it is imported on demand.
    try:
        from pyzbar.pyzbar import decode as barcode_decode
    except ImportError as err:
        # Chain the original error so the real reason (missing package or
        # missing shared library) stays visible in the traceback.
        raise ImportError("pyzbar is required for barcode detection") from err

    return [
        {
            "type": barcode.type,
            "data": barcode.data.decode("utf-8"),
            "bbox": [(p.x, p.y) for p in barcode.polygon],
        }
        for barcode in barcode_decode(img)
    ]
56+
57+
58+
def get_page_image(page, dpi=150):
    """Rasterize a page and return it as a grayscale array.

    Args:
        page: Page object providing ``get_pixmap`` and ``rect``.
        dpi: Resolution passed to ``page.get_pixmap``.

    Returns:
        A tuple ``(gray, matrix, pix)``: the grayscale image produced by
        ``cv2.cvtColor(..., cv2.COLOR_RGB2GRAY)``, the matrix obtained from
        ``Rect(pix.irect).torect(page.rect)`` (pixmap rectangle to page
        rectangle), and the pixmap itself.
    """
    pix = page.get_pixmap(dpi=dpi)
    matrix = pymupdf.Rect(pix.irect).torect(page.rect)
    # View the sample buffer as a (height, width, components) byte array.
    # NOTE(review): the RGB->gray conversion assumes a 3-component pixmap.
    shape = (pix.height, pix.width, pix.n)
    samples = np.frombuffer(pix.samples, dtype=np.uint8).reshape(shape)
    return cv2.cvtColor(samples, cv2.COLOR_RGB2GRAY), matrix, pix
66+
67+
68+
def detect_lines(img, min_length=50, max_gap=10, matrix=pymupdf.Identity):
    """Detect straight line segments in an image.

    Edges are computed with ``cv2.Canny`` and segments are extracted with
    the probabilistic Hough transform (``cv2.HoughLinesP``).

    Args:
        img: Image handed to ``cv2.Canny``.
        min_length: Passed as ``minLineLength`` to ``cv2.HoughLinesP``.
        max_gap: Passed as ``maxLineGap`` to ``cv2.HoughLinesP``.
        matrix: Matrix applied to both end points of every segment.

    Returns:
        A list of ``(point1, point2)`` tuples of ``pymupdf.Point`` objects;
        an empty list when no segment was detected.
    """
    edges = cv2.Canny(img, 50, 150, apertureSize=3)
    pix_lines = cv2.HoughLinesP(
        edges,
        1,
        np.pi / 180,
        threshold=100,
        minLineLength=min_length,
        maxLineGap=max_gap,
    )
    # HoughLinesP yields None (not an empty array) when nothing is found.
    if pix_lines is None:
        return []

    lines = []
    # Flatten to rows of four coordinates (x0, y0, x1, y1).
    for x0, y0, x1, y1 in pix_lines.reshape(-1, 4):
        p0 = pymupdf.Point(x0, y0) * matrix
        p1 = pymupdf.Point(x1, y1) * matrix
        lines.append((p0, p1))
    return lines  # array of (point1, point2)
86+
87+
88+
def detect_curves(img, matrix=pymupdf.Identity):
    """Fit ellipses to the contours found in an image.

    The image is binarized with an inverted threshold at 180, all contours
    are collected, and ``cv2.fitEllipse`` is applied to every contour
    having more than five points.

    Args:
        img: Image handed to ``cv2.threshold``.
        matrix: Accepted but not used by this function.

    Returns:
        A list with the ``cv2.fitEllipse`` result for each qualifying contour.
    """
    _, binary = cv2.threshold(img, 180, 255, cv2.THRESH_BINARY_INV)
    contours, _ = cv2.findContours(binary, cv2.RETR_LIST, cv2.CHAIN_APPROX_NONE)
    return [cv2.fitEllipse(contour) for contour in contours if len(contour) > 5]
99+
100+
101+
def detect_rectangles(img, min_area=1000, matrix=pymupdf.Identity):
    """Detect four-cornered shapes in an image.

    The image is binarized with an inverted threshold at 180 and the
    external contours are approximated by polygons. A contour counts as a
    rectangle when its approximation has exactly four vertices and its
    area exceeds ``min_area``.

    Args:
        img: Image handed to ``cv2.threshold``.
        min_area: Minimum contour area (``cv2.contourArea``) to accept.
        matrix: Matrix applied to every resulting rectangle.

    Returns:
        A list of ``pymupdf.Rect`` objects: the bounding box of each
        accepted contour, multiplied by ``matrix``.
    """
    # Bind the working image explicitly; a bare ``gray`` expression here
    # would raise NameError because the name was never assigned.
    gray = img
    _, thresh = cv2.threshold(gray, 180, 255, cv2.THRESH_BINARY_INV)
    contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    rectangles = []
    for cnt in contours:
        approx = cv2.approxPolyDP(cnt, 0.02 * cv2.arcLength(cnt, True), True)
        if len(approx) == 4 and cv2.contourArea(cnt) > min_area:
            # ``approx`` is an array of four corner points, not four
            # numbers, so derive the rectangle from its bounding box.
            x, y, w, h = cv2.boundingRect(approx)
            rectangles.append(pymupdf.Rect(x, y, x + w, y + h) * matrix)
    return rectangles
113+
114+
115+
def should_ocr_page(
    page,
    dpi=150,
    edge_thresh=0.015,
    vector_thresh=500,
    image_coverage_thresh=0.9,
    text_readability_thresh=0.9,
):
    """
    Collect indicators telling whether a PyMuPDF page should be OCR'd.

    Parameters:
        page: PyMuPDF page object
        dpi: DPI used when rasterizing the page
        edge_thresh: edge density above which the page counts as having content
        vector_thresh: number of vector paths (wider or higher than 3) from
            which the page counts as having vector drawings
        image_coverage_thresh: fraction of the page area that the joined
            image rectangle must reach to count as covering the page
        text_readability_thresh: minimum fraction of characters that are not
            U+FFFD for the text to count as readable

    Returns:
        dict with the OCR decision ("should_ocr"), the diagnostic flags and
        the rasterization results ("transform", "pixmap", "image")
    """
    page_rect = page.rect
    page_area = abs(page_rect)  # size of the full page

    # --- text: anything beyond white space, and how much of it is readable
    text = page.get_text(flags=0)
    has_text = not WHITE_CHARS.issuperset(text)
    readable_text = False
    if has_text:
        unreadable = text.count(chr(0xFFFD))
        readable_text = (1 - unreadable / len(text)) >= text_readability_thresh

    # --- text entries of the bbox log whose type is exactly "ignore-text"
    text_entries = [b for b in page.get_bboxlog() if "text" in b[0]]
    has_ocr_text = any(b[0] == "ignore-text" for b in text_entries)

    # --- image coverage: join all image boxes (clipped to the page)
    clipped = [page_rect & info["bbox"] for info in page.get_image_info()]
    joined = pymupdf.EMPTY_RECT()
    for rect in clipped:
        joined |= rect
    joined_area = abs(joined)
    images_cover = joined_area / page_area if joined_area else 0.0
    image_covers_page = images_cover >= image_coverage_thresh

    # --- vector drawings, ignoring paths that are tiny in both directions
    drawings = [
        path
        for path in page.get_drawings()
        if path["rect"].width > 3 or path["rect"].height > 3
    ]
    vector_count = len(drawings)
    has_vector_drawings = vector_count >= vector_thresh

    # --- rasterize the page and measure the share of edge pixels
    img, matrix, pix = get_page_image(page, dpi=dpi)
    edges = cv2.Canny(img, 100, 200)
    edge_density = np.sum(edges > 0) / edges.size

    # --- final decision: no text at all, but visible content of some kind
    lacks_text = not has_text and not readable_text
    shows_content = (
        image_covers_page or has_vector_drawings or edge_density > edge_thresh
    )

    return {
        "should_ocr": bool(lacks_text and shows_content),
        "has_ocr_text": has_ocr_text,
        "has_text": has_text,
        "readable_text": readable_text,
        "image_covers_page": image_covers_page,
        "has_vector_drawings": has_vector_drawings,
        "transform": matrix,
        "pixmap": pix,
        "image": img,
        "edge_density": edge_density,
        "vector_count": vector_count,
    }

0 commit comments

Comments
 (0)