@@ -210,19 +210,23 @@ def add_image_orphans(page, blocks):
210210"""
211211
212212
213- def cluster_stripes (boxes , vertical_gap : float = 12 ):
213+ def cluster_stripes (boxes , joined_boxes , vectors , vertical_gap = 12 ):
214214 """
215215 Divide page into horizontal stripes based on vertical gaps.
216216
217217 Args:
218- boxes (list): List of bounding boxes, each defined as (x0, y0, x1, y1) .
218+ boxes (list): List of bounding boxes.
219219 vertical_gap (float): Minimum vertical gap to separate stripes.
220220
221221 Returns:
222222 List of disjoint horizontal stripes. Each stripe is a list of boxes.
223223 """
224224
225225 def is_multi_column_layout (boxes ):
226+ """Check if the boxes have a clean multi-column layout.
227+
228+ Used to early exit from stripe clustering.
229+ """
226230 sorted_boxes = sorted (boxes , key = lambda b : b [0 ])
227231 columns = []
228232 current_column = [sorted_boxes [0 ]]
@@ -236,58 +240,86 @@ def is_multi_column_layout(boxes):
236240 columns .append (current_column )
237241 return len (columns ) > 1
238242
243+ def divider (y , box , vertical_gap ):
244+ """Create a rectangle of box width and vertical_gap height below y."""
245+ r = pymupdf .Rect (box [0 ], y , box [2 ], y + vertical_gap )
246+ return r
247+
239248 # Sort top to bottom
240- sorted_boxes = sorted (boxes , key = lambda b : b [1 ])
249+ sorted_boxes = sorted (boxes , key = lambda b : b [3 ])
241250 stripes = []
251+
252+ # exit if no boxes
242253 if not sorted_boxes :
243254 return stripes
244255
245- # Early exit for clean multi-column layouts
246- if is_multi_column_layout (sorted_boxes ):
256+ # Exit if clean multi-column layout: treat full page as single stripe.
257+ if is_multi_column_layout (boxes ):
247258 return [boxes ]
248259
249- current_stripe = [sorted_boxes [0 ]]
250-
251- for box in sorted_boxes [1 :]:
252- prev_bottom = max (b [3 ] for b in current_stripe )
253- if box [1 ] - prev_bottom > vertical_gap :
260+ # y-borders of horizontal stripes
261+ y_values = {joined_boxes .y1 }
262+ for box in sorted_boxes :
263+ # find empty horizontal dividers of minimum height 'vertical_gap'
264+ y = box [3 ]
265+ if y >= joined_boxes .y1 :
266+ continue
267+ div = divider (y , joined_boxes , vertical_gap )
268+ if not any (div .intersects (pymupdf .Rect (b [:4 ])) for b in boxes ):
269+ # look for next bbox below the divider
270+ y0 = min (b [1 ] for b in sorted_boxes if b [1 ] >= div .y1 )
271+ div .y1 = y0 # divider has this bottom now
272+ inter_count = 0 # counts intersections with vectors
273+
274+ # if divider is fully contained in more than one vector's stripe
275+ # we don't consider it.
276+ for vr in vectors :
277+ if div .intersects (vr ) and vr .y0 <= div .y0 and div .y1 <= vr .y1 :
278+ inter_count += 1
279+ if inter_count <= 1 :
280+ y_values .add (div .y1 )
281+ y_values = sorted (y_values )
282+ current_stripe = []
283+ for y in y_values :
284+ while sorted_boxes and sorted_boxes [0 ][3 ] <= y :
285+ current_stripe .append (sorted_boxes .pop (0 ))
286+ if current_stripe :
254287 stripes .append (current_stripe )
255- current_stripe = [box ]
256- else :
257- current_stripe .append (box )
258-
259- stripes .append (current_stripe )
288+ current_stripe = []
260289 return stripes
261290
262291
def cluster_columns_in_stripe(stripe, horizontal_gap=1):
    """
    Within a stripe, group boxes into columns based on horizontal proximity.

    Boxes are visited from left to right (by their left edge). A box opens
    a new column when its left edge lies more than ``horizontal_gap`` to the
    right of the rightmost edge of the column collected so far; otherwise it
    joins the current column.

    Args:
        stripe (list): List of boxes we look at here. Each box must be
            indexable with at least ``box[0]`` (left), ``box[1]`` (top) and
            ``box[2]`` (right).
        horizontal_gap (float): Largest horizontal distance that is still
            tolerated inside one column. Defaults to 1.

    Returns:
        list: List of columns in left-to-right order. Each column is a list
        of boxes sorted by their top coordinate. An empty stripe yields an
        empty list.
    """
    # Nothing to cluster: avoid indexing into an empty list below.
    if not stripe:
        return []

    def top_of(box):
        return box[1]

    # Sort left to right
    sorted_boxes = sorted(stripe, key=lambda b: b[0])

    columns = []
    current_column = [sorted_boxes[0]]
    # Running right edge of the current column. Updating it per box is
    # equivalent to taking max() over the whole column each time.
    column_right = sorted_boxes[0][2]

    for box in sorted_boxes[1:]:
        if box[0] - column_right > horizontal_gap:
            # Gap is wide enough: close the current column (top to bottom)
            # and start a new one with this box.
            columns.append(sorted(current_column, key=top_of))
            current_column = [box]
            column_right = box[2]
        else:
            current_column.append(box)
            column_right = max(column_right, box[2])

    columns.append(sorted(current_column, key=top_of))
    return columns
288320
289321
290- def compute_reading_order (boxes , vertical_gap : float = 12 ):
322+ def compute_reading_order (boxes , joined_boxes , vectors , vertical_gap = 12 ):
291323 """
292324 Compute reading order of boxes delivered by PyMuPDF-Layout.
293325
@@ -298,12 +330,12 @@ def compute_reading_order(boxes, vertical_gap: float = 12):
298330 Returns:
299331 list: List of boxes in reading order.
300332 """
301- # compute adequate vertical_gap based height of union of bboxes
302- temp = pymupdf . EMPTY_RECT ()
303- for b in boxes :
304- temp |= pymupdf . Rect ( b [: 4 ])
305- this_vertical_gap = vertical_gap * temp . height / 800
306- stripes = cluster_stripes ( boxes , vertical_gap = this_vertical_gap )
333+ stripes = cluster_stripes (
334+ boxes ,
335+ joined_boxes ,
336+ vectors ,
337+ vertical_gap = vertical_gap ,
338+ )
307339 ordered = []
308340 for stripe in stripes :
309341 columns = cluster_columns_in_stripe (stripe )
@@ -312,7 +344,7 @@ def compute_reading_order(boxes, vertical_gap: float = 12):
312344 return ordered
313345
314346
315- def find_reading_order (boxes , vertical_gap : float = 36 ) -> list :
347+ def find_reading_order (page_rect , blocks , boxes , vertical_gap : float = 12 ) -> list :
316348 """Given page layout information, return the boxes in reading order.
317349
318350 Args:
@@ -326,6 +358,9 @@ def find_reading_order(boxes, vertical_gap: float = 36) -> list:
326358 List of boxes in reading order.
327359 """
328360
361+ # compute adequate vertical_gap based on the height of the page rectangle
362+ this_vertical_gap = vertical_gap * page_rect .height / 800
363+
329364 def is_contained (inner , outer ) -> bool :
330365 """Check if inner box is fully contained within outer box."""
331366 return (
@@ -369,9 +404,28 @@ def filter_contained(boxes) -> list:
369404 else :
370405 body_boxes .append (box )
371406
372- # bring body into reading order
373- ordered = compute_reading_order (body_boxes , vertical_gap = vertical_gap )
407+ # compute joined boxes of body
408+ joined_boxes = pymupdf .Rect (
409+ min (b [0 ] for b in body_boxes ),
410+ min (b [1 ] for b in body_boxes ),
411+ max (b [2 ] for b in body_boxes ),
412+ max (b [3 ] for b in body_boxes ),
413+ )
374414
415+ # extract vectors contained in the TextPage
416+ min_bbox_height = min (b [3 ] - b [1 ] for b in body_boxes )
417+ vectors = [
418+ pymupdf .Rect (b ["bbox" ])
419+ for b in blocks
420+ if b ["bbox" ][3 ] - b ["bbox" ][1 ] >= min_bbox_height and b ["bbox" ] in joined_boxes
421+ ]
422+ # bring body into reading order
423+ ordered = compute_reading_order (
424+ body_boxes ,
425+ joined_boxes ,
426+ vectors ,
427+ vertical_gap = this_vertical_gap ,
428+ )
375429 # Final full boxes list. We do simple sorts for non-body boxes.
376430 final = (
377431 sorted (page_headers , key = lambda r : (r [1 ], r [0 ]))
@@ -382,6 +436,8 @@ def filter_contained(boxes) -> list:
382436
383437
384438def simplify_vectors (vectors ):
439+ """Join vectors that are horizontally adjacent and vertically aligned."""
440+ Y_TOLERANCE = 1 # allowable top / bottom difference
385441 new_vectors = []
386442 if not vectors :
387443 return new_vectors
@@ -390,8 +446,8 @@ def simplify_vectors(vectors):
390446 last_v = new_vectors [- 1 ]
391447 if (
392448 1
393- and abs (v ["bbox" ][1 ] - last_v ["bbox" ][1 ]) < 1
394- and abs (v ["bbox" ][3 ] - last_v ["bbox" ][3 ]) < 1
449+ and abs (v ["bbox" ][1 ] - last_v ["bbox" ][1 ]) < Y_TOLERANCE
450+ and abs (v ["bbox" ][3 ] - last_v ["bbox" ][3 ]) < Y_TOLERANCE
395451 and v ["bbox" ][0 ] <= last_v ["bbox" ][2 ] + 1
396452 ):
397453 # merge horizontally
@@ -408,7 +464,14 @@ def simplify_vectors(vectors):
408464
409465
410466def find_virtual_lines (page , table_bbox , words , vectors , link_rects ):
411- """Return virtual lines for a given table bbox."""
467+ """Return virtual lines for a given table bbox.
468+
469+ This utility looks for:
470+ * horizontal non-stroke vectors and uses their top and bottom edges
471+ as virtual lines. Should work for tables with alternating row colors.
472+ * horizontal thin lines and uses their left x-coordinate as column
473+ borders.
474+ """
412475
413476 def make_vertical (table_bbox , line_bbox , word_boxes ):
414477 # default top and bottom point of vertical line
0 commit comments