4343 # for each page execute
4444 bboxes = column_boxes(page, footer_margin=50, no_image_text=True)
4545
46- bboxes is a list of fitz .IRect objects, that are sorted ascending by their
46+ bboxes is a list of pymupdf.IRect objects, sorted ascending by their
4747 y0, then x0 coordinates. Their text content can be extracted by any of the
4848 PyMuPDF get_text() variants, for instance as follows:
4949 for rect in bboxes:
6262
6363import string
6464
65- try :
66- import pymupdf as fitz
67- except ImportError :
68- import fitz
65+ import pymupdf
6966
7067
7168def column_boxes (
@@ -103,7 +100,7 @@ def is_white(text):
103100 paths = page .get_drawings ()
104101
105102 if textpage is None :
106- textpage = page .get_textpage (clip = clip , flags = fitz .TEXTFLAGS_TEXT )
103+ textpage = page .get_textpage (clip = clip , flags = pymupdf .TEXTFLAGS_TEXT )
107104
108105 bboxes = []
109106
@@ -151,44 +148,6 @@ def can_extend(temp, bb, bboxlist, vert_bboxes):
151148
152149 return True
153150
154- # def extend_right(bboxes, width, path_bboxes, vert_bboxes, img_bboxes):
155- # """Extend a bbox to the right page border.
156-
157- # Whenever there is no text to the right of a bbox, enlarge it up
158- # to the right page border.
159-
160- # Args:
161- # bboxes: (list[IRect]) bboxes to check
162- # width: (int) page width
163- # path_bboxes: (list[IRect]) bboxes with a background color
164- # vert_bboxes: (list[IRect]) bboxes with vertical text
165- # img_bboxes: (list[IRect]) bboxes of images
166- # Returns:
167- # Potentially modified bboxes.
168- # """
169- # for i, bb in enumerate(bboxes):
170- # # do not extend text with background color
171- # if in_bbox(bb, path_bboxes):
172- # continue
173-
174- # # do not extend text in images
175- # if in_bbox(bb, img_bboxes):
176- # continue
177-
178- # # temp extends bb to the right page border
179- # temp = +bb
180- # temp.x1 = width
181-
182- # # do not cut through colored background or images
183- # if intersects_bboxes(temp, path_bboxes + vert_bboxes + img_bboxes):
184- # continue
185-
186- # # also, do not intersect other text bboxes
187- # check = can_extend(temp, bb, bboxes, vert_bboxes)
188- # if check:
189- # bboxes[i] = temp # replace with enlarged bbox
190-
191- # return [b for b in bboxes if b != None]
192151
193152 def join_rects_phase1 (bboxes ):
194153 """Postprocess identified text blocks, phase 1.
@@ -336,7 +295,7 @@ def clean_nblocks(nblocks):
336295
337296 # Make block rectangles, ignoring non-horizontal text
338297 for b in blocks :
339- bbox = fitz .IRect (b ["bbox" ]) # bbox of the block
298+ bbox = pymupdf .IRect (b ["bbox" ]) # bbox of the block
340299
341300 # ignore text written upon images
342301 if no_image_text and in_bbox (bbox , img_bboxes ):
@@ -352,9 +311,9 @@ def clean_nblocks(nblocks):
352311 vert_bboxes .append (bbox )
353312 continue
354313
355- srect = fitz .EMPTY_IRECT ()
314+ srect = pymupdf .EMPTY_IRECT ()
356315 for line in b ["lines" ]:
357- lbbox = fitz .IRect (line ["bbox" ])
316+ lbbox = pymupdf .IRect (line ["bbox" ])
358317 text = "" .join ([s ["text" ].strip () for s in line ["spans" ]])
359318 if len (text ) > 1 :
360319 srect |= lbbox
@@ -435,7 +394,7 @@ def clean_nblocks(nblocks):
435394 """
436395 import sys
437396
438- RED = fitz .pdfcolor ["red" ]
397+ RED = pymupdf .pdfcolor ["red" ]
439398 # get the file name
440399 filename = sys .argv [1 ]
441400
@@ -452,7 +411,7 @@ def clean_nblocks(nblocks):
452411 header_margin = 50
453412
454413 # open document
455- doc = fitz .open (filename )
414+ doc = pymupdf .open (filename )
456415
457416 # iterate over the pages
458417 for page in doc :
0 commit comments