Skip to content

Commit c9229b7

Browse files
authored
Merge pull request #315 from pymupdf/v0.2.0
Version 0.2.0
2 parents 327e61c + 7043955 commit c9229b7

File tree

6 files changed

+103
-41
lines changed

6 files changed

+103
-41
lines changed

pdf4llm/setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
with open(os.path.join(setup_py_cwd, "README.md"), encoding="utf-8") as f:
77
readme = f.read()
88

9-
version = "0.1.9"
9+
version = "0.2.0"
1010

1111
classifiers = [
1212
"Development Status :: 5 - Production/Stable",

pymupdf4llm/pymupdf4llm/__init__.py

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,4 @@
1-
try:
2-
import pymupdf.layout
3-
except ImportError:
4-
import pymupdf
1+
import pymupdf
52

63
from .versions_file import MINIMUM_PYMUPDF_VERSION, VERSION
74

pymupdf4llm/pymupdf4llm/helpers/document_layout.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -713,7 +713,9 @@ def parse_document(
713713
utils.clean_pictures(page, blocks)
714714
utils.add_image_orphans(page, blocks)
715715
utils.clean_tables(page, blocks)
716-
page.layout_information = utils.find_reading_order(page.layout_information)
716+
page.layout_information = utils.find_reading_order(
717+
page.rect, blocks, page.layout_information
718+
)
717719

718720
# identify vector graphics to help find tables
719721
all_lines, all_boxes = utils.complete_table_structure(page)

pymupdf4llm/pymupdf4llm/helpers/utils.py

Lines changed: 96 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -210,19 +210,23 @@ def add_image_orphans(page, blocks):
210210
"""
211211

212212

213-
def cluster_stripes(boxes, vertical_gap: float = 12):
213+
def cluster_stripes(boxes, joined_boxes, vectors, vertical_gap=12):
214214
"""
215215
Divide page into horizontal stripes based on vertical gaps.
216216
217217
Args:
218-
boxes (list): List of bounding boxes, each defined as (x0, y0, x1, y1).
218+
boxes (list): List of bounding boxes.
219219
vertical_gap (float): Minimum vertical gap to separate stripes.
220220
221221
Returns:
222222
List of disjoint horizontal stripes. Each stripe is a list of boxes.
223223
"""
224224

225225
def is_multi_column_layout(boxes):
226+
"""Check if the boxes have a clean multi-column layout.
227+
228+
Used to early exit from stripe clustering.
229+
"""
226230
sorted_boxes = sorted(boxes, key=lambda b: b[0])
227231
columns = []
228232
current_column = [sorted_boxes[0]]
@@ -236,58 +240,86 @@ def is_multi_column_layout(boxes):
236240
columns.append(current_column)
237241
return len(columns) > 1
238242

243+
def divider(y, box, vertical_gap):
244+
"""Create a rectangle of box width and vertical_gap height below y."""
245+
r = pymupdf.Rect(box[0], y, box[2], y + vertical_gap)
246+
return r
247+
239248
# Sort top to bottom
240-
sorted_boxes = sorted(boxes, key=lambda b: b[1])
249+
sorted_boxes = sorted(boxes, key=lambda b: b[3])
241250
stripes = []
251+
252+
# exit if no boxes
242253
if not sorted_boxes:
243254
return stripes
244255

245-
# Early exit for clean multi-column layouts
246-
if is_multi_column_layout(sorted_boxes):
256+
# Exit if clean multi-column layout: treat full page as single stripe.
257+
if is_multi_column_layout(boxes):
247258
return [boxes]
248259

249-
current_stripe = [sorted_boxes[0]]
250-
251-
for box in sorted_boxes[1:]:
252-
prev_bottom = max(b[3] for b in current_stripe)
253-
if box[1] - prev_bottom > vertical_gap:
260+
# y-borders of horizontal stripes
261+
y_values = {joined_boxes.y1}
262+
for box in sorted_boxes:
263+
# find empty horizontal dividers of minimum height 'vertical_gap'
264+
y = box[3]
265+
if y >= joined_boxes.y1:
266+
continue
267+
div = divider(y, joined_boxes, vertical_gap)
268+
if not any(div.intersects(pymupdf.Rect(b[:4])) for b in boxes):
269+
# look for next bbox below the divider
270+
y0 = min(b[1] for b in sorted_boxes if b[1] >= div.y1)
271+
div.y1 = y0 # divider has this bottom now
272+
inter_count = 0 # counts intersections with vectors
273+
274+
# if divider is fully contained in more than one vector's stripe
275+
# we don't consider it.
276+
for vr in vectors:
277+
if div.intersects(vr) and vr.y0 <= div.y0 and div.y1 <= vr.y1:
278+
inter_count += 1
279+
if inter_count <= 1:
280+
y_values.add(div.y1)
281+
y_values = sorted(y_values)
282+
current_stripe = []
283+
for y in y_values:
284+
while sorted_boxes and sorted_boxes[0][3] <= y:
285+
current_stripe.append(sorted_boxes.pop(0))
286+
if current_stripe:
254287
stripes.append(current_stripe)
255-
current_stripe = [box]
256-
else:
257-
current_stripe.append(box)
258-
259-
stripes.append(current_stripe)
288+
current_stripe = []
260289
return stripes
261290

262291

263-
def cluster_columns_in_stripe(stripe: list):
292+
def cluster_columns_in_stripe(stripe):
264293
"""
265294
Within a stripe, group boxes into columns based on horizontal proximity.
266295
296+
We use a small horizontal gap threshold to decide when a new column starts.
297+
267298
Args:
268-
stripe (list): List of boxes within a stripe.
299+
stripe (list): List of boxes we look at here.
269300
270301
Returns:
271302
list: List of columns, each column is a list of boxes.
272303
"""
304+
HORIZONTAL_GAP = 1 # allowable gap to start a new column
273305
# Sort left to right
274306
sorted_boxes = sorted(stripe, key=lambda b: b[0])
275307
columns = []
276308
current_column = [sorted_boxes[0]]
277309

278310
for box in sorted_boxes[1:]:
279311
prev_right = max([b[2] for b in current_column])
280-
if box[0] - prev_right > 1:
281-
columns.append(sorted(current_column, key=lambda b: b[3]))
312+
if box[0] - prev_right > HORIZONTAL_GAP:
313+
columns.append(sorted(current_column, key=lambda b: b[1]))
282314
current_column = [box]
283315
else:
284316
current_column.append(box)
285317

286-
columns.append(sorted(current_column, key=lambda b: b[3]))
318+
columns.append(sorted(current_column, key=lambda b: b[1]))
287319
return columns
288320

289321

290-
def compute_reading_order(boxes, vertical_gap: float = 12):
322+
def compute_reading_order(boxes, joined_boxes, vectors, vertical_gap=12):
291323
"""
292324
Compute reading order of boxes delivered by PyMuPDF-Layout.
293325
@@ -298,12 +330,12 @@ def compute_reading_order(boxes, vertical_gap: float = 12):
298330
Returns:
299331
list: List of boxes in reading order.
300332
"""
301-
# compute adequate vertical_gap based height of union of bboxes
302-
temp = pymupdf.EMPTY_RECT()
303-
for b in boxes:
304-
temp |= pymupdf.Rect(b[:4])
305-
this_vertical_gap = vertical_gap * temp.height / 800
306-
stripes = cluster_stripes(boxes, vertical_gap=this_vertical_gap)
333+
stripes = cluster_stripes(
334+
boxes,
335+
joined_boxes,
336+
vectors,
337+
vertical_gap=vertical_gap,
338+
)
307339
ordered = []
308340
for stripe in stripes:
309341
columns = cluster_columns_in_stripe(stripe)
@@ -312,7 +344,7 @@ def compute_reading_order(boxes, vertical_gap: float = 12):
312344
return ordered
313345

314346

315-
def find_reading_order(boxes, vertical_gap: float = 36) -> list:
347+
def find_reading_order(page_rect, blocks, boxes, vertical_gap: float = 12) -> list:
316348
"""Given page layout information, return the boxes in reading order.
317349
318350
Args:
@@ -326,6 +358,9 @@ def find_reading_order(boxes, vertical_gap: float = 36) -> list:
326358
List of boxes in reading order.
327359
"""
328360

361+
# compute adequate vertical_gap based on the height of the page rectangle
362+
this_vertical_gap = vertical_gap * page_rect.height / 800
363+
329364
def is_contained(inner, outer) -> bool:
330365
"""Check if inner box is fully contained within outer box."""
331366
return (
@@ -369,9 +404,28 @@ def filter_contained(boxes) -> list:
369404
else:
370405
body_boxes.append(box)
371406

372-
# bring body into reading order
373-
ordered = compute_reading_order(body_boxes, vertical_gap=vertical_gap)
407+
# compute joined boxes of body
408+
joined_boxes = pymupdf.Rect(
409+
min(b[0] for b in body_boxes),
410+
min(b[1] for b in body_boxes),
411+
max(b[2] for b in body_boxes),
412+
max(b[3] for b in body_boxes),
413+
)
374414

415+
# extract vectors contained in the TextPage
416+
min_bbox_height = min(b[3] - b[1] for b in body_boxes)
417+
vectors = [
418+
pymupdf.Rect(b["bbox"])
419+
for b in blocks
420+
if b["bbox"][3] - b["bbox"][1] >= min_bbox_height and b["bbox"] in joined_boxes
421+
]
422+
# bring body into reading order
423+
ordered = compute_reading_order(
424+
body_boxes,
425+
joined_boxes,
426+
vectors,
427+
vertical_gap=this_vertical_gap,
428+
)
375429
# Final full boxes list. We do simple sorts for non-body boxes.
376430
final = (
377431
sorted(page_headers, key=lambda r: (r[1], r[0]))
@@ -382,6 +436,8 @@ def filter_contained(boxes) -> list:
382436

383437

384438
def simplify_vectors(vectors):
439+
"""Join vectors that are horizontally adjacent and vertically aligned."""
440+
Y_TOLERANCE = 1 # allowable top / bottom difference
385441
new_vectors = []
386442
if not vectors:
387443
return new_vectors
@@ -390,8 +446,8 @@ def simplify_vectors(vectors):
390446
last_v = new_vectors[-1]
391447
if (
392448
1
393-
and abs(v["bbox"][1] - last_v["bbox"][1]) < 1
394-
and abs(v["bbox"][3] - last_v["bbox"][3]) < 1
449+
and abs(v["bbox"][1] - last_v["bbox"][1]) < Y_TOLERANCE
450+
and abs(v["bbox"][3] - last_v["bbox"][3]) < Y_TOLERANCE
395451
and v["bbox"][0] <= last_v["bbox"][2] + 1
396452
):
397453
# merge horizontally
@@ -408,7 +464,14 @@ def simplify_vectors(vectors):
408464

409465

410466
def find_virtual_lines(page, table_bbox, words, vectors, link_rects):
411-
"""Return virtual lines for a given table bbox."""
467+
"""Return virtual lines for a given table bbox.
468+
469+
This utility looks for:
470+
* horizontal non-stroke vectors and uses their top and bottom edges
471+
as virtual lines. Should work for tables with alternating row colors.
472+
* horizontal thin lines and uses their left x-coordinate as column
473+
borders.
474+
"""
412475

413476
def make_vertical(table_bbox, line_bbox, word_boxes):
414477
# default top and bottom point of vertical line

pymupdf4llm/pymupdf4llm/versions_file.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
# Generated file - do not edit.
22
MINIMUM_PYMUPDF_VERSION = (1, 26, 6)
3-
VERSION = '0.1.9'
3+
VERSION = '0.2.0'

pymupdf4llm/setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
"Topic :: Utilities",
1515
]
1616

17-
version = "0.1.9"
17+
version = "0.2.0"
1818
requires = ["pymupdf>=1.26.6", "tabulate"]
1919

2020
text = requires[0].split("=")[1]

0 commit comments

Comments
 (0)