@@ -171,10 +171,10 @@ class TocHeaders:
171171 full document to identify font sizes, it uses the document's Table Of
172172 Contents (TOC) to identify headers on pages.
173173 Like IdentifyHeaders, this also is no guarantee to find headers, but it
174- is a good change for appropriately build documents. In such cases, this
175- method can be very much faster and more accurate, because we can use the
176- hierarchy level of TOC items directly to ientify the header level.
177- Examples where this approach works very well are the Adobe PDF documents.
174+ represents a good chance for appropriately built documents. In such cases,
175+ this method can be very much faster and more accurate, because we can
176+ directly use the hierarchy level of TOC items to identify the header level.
177+ Examples where this works very well are the Adobe PDF documents.
178178 """
179179
180180 def __init__ (self , doc : str ):
@@ -195,14 +195,15 @@ def get_header_id(self, span: dict, page=None) -> str:
195195 Given a text span from a "dict"/"rawdict" extraction, determine the
196196 markdown header prefix string of 0 to n concatenated '#' characters.
197197 """
198- if page is None :
198+ if not page :
199199 return ""
200200 # check if this page has TOC entries with an actual title
201201 my_toc = [t for t in self .TOC if t [1 ] and t [- 1 ] == page .number + 1 ]
202- if not my_toc :
202+ if not my_toc : # no TOC items present on this page
203203 return ""
204- # check if the span matches a TOC entry
205- text = span ["text" ].strip ()
204+ # Check if the span matches a TOC entry. This must be done in the
205+ # most forgiving way: exact matches are rare animals.
206+ text = span ["text" ].strip () # remove leading and trailing whitespace
206207 for t in my_toc :
207208 title = t [1 ].strip () # title of TOC entry
208209 lvl = t [0 ] # level of TOC entry
@@ -321,6 +322,7 @@ def to_markdown(
321322 extract_words = False ,
322323 show_progress = False ,
323324 use_glyphs = False ,
325+ ignore_alpha = False ,
324326) -> str :
325327 """Process the document and return the text of the selected pages.
326328
@@ -341,9 +343,10 @@ def to_markdown(
341343 table_strategy: choose table detection strategy
342344 graphics_limit: (int) if vector graphics count exceeds this, ignore all.
343345 ignore_code: (bool) suppress code-like formatting (mono-space fonts)
344- extract_words: (bool) include "words"-like output in page chunks
345- show_progress: (bool) print progress as each page is processed.
346- use_glyphs: (bool) replace the Invalid Unicode by glyph numbers.
346+ extract_words: (bool, False) include "words"-like output in page chunks
347+ show_progress: (bool, False) print progress as each page is processed.
348+ use_glyphs: (bool, False) replace the Invalid Unicode by glyph numbers.
349+ ignore_alpha: (bool, False) if True, also accept text with alpha = 0 (transparent).
347350
348351 """
349352 if write_images is False and embed_images is False and force_text is False :
@@ -372,6 +375,8 @@ def to_markdown(
372375 FONTSIZE_LIMIT = fontsize_limit
373376 IGNORE_IMAGES = ignore_images
374377 IGNORE_GRAPHICS = ignore_graphics
378+ if doc .is_form_pdf or doc .has_annots ():
379+ doc .bake ()
375380
376381 # for reflowable documents allow making 1 page for the whole document
377382 if doc .is_reflowable :
@@ -394,7 +399,7 @@ def to_markdown(
394399 margins = (0 , margins [0 ], 0 , margins [1 ])
395400 if len (margins ) != 4 :
396401 raise ValueError ("margins must be one, two or four floats" )
397- elif not all ([ hasattr (m , "__float__" ) for m in margins ] ):
402+ elif not all (hasattr (m , "__float__" ) for m in margins ):
398403 raise ValueError ("margin values must be floats" )
399404
400405 # If "hdr_info" is not an object with a method "get_header_id", scan the
@@ -587,44 +592,28 @@ def write_text(
587592 # make text string for the full line
588593 text = " " .join ([s ["text" ] for s in spans ])
589594
590- # if line is a header, this will return multiple "#" characters,
591- # otherwise an empty string
592- hdr_string = max_header_id (spans , page = parms .page ) # a header?
593-
594595 # full line strikeout?
595596 all_strikeout = all ([s ["char_flags" ] & 1 for s in spans ])
596597 # full line italic?
597598 all_italic = all ([s ["flags" ] & 2 for s in spans ])
598599 # full line bold?
599- all_bold = all ([s ["flags" ] & 16 or s ["char_flags" ] & 8 for s in spans ])
600-
600+ all_bold = all ([(s ["flags" ] & 16 ) or (s ["char_flags" ] & 8 ) for s in spans ])
601601 # full line mono-spaced?
602- if not IGNORE_CODE :
603- all_mono = all ([s ["flags" ] & 8 for s in spans ])
604- else :
605- all_mono = False
602+ all_mono = all ([s ["flags" ] & 8 for s in spans ])
606603
607- if all_mono and not hdr_string :
608- if not code : # if not already in code output mode:
609- out_string += "```\n " # switch on "code" mode
610- code = True
611- # compute approx. distance from left - assuming a width
612- # of 0.5*fontsize.
613- delta = int ((lrect .x0 - clip .x0 ) / (spans [0 ]["size" ] * 0.5 ))
614- indent = " " * delta
615-
616- out_string += indent + text + "\n "
617- continue # done with this line
604+ # if line is a header, this will return multiple "#" characters,
605+ # otherwise an empty string
606+ hdr_string = max_header_id (spans , page = parms .page ) # a header?
618607
619608 if hdr_string : # if a header line skip the rest
620609 if all_mono :
621610 text = "`" + text + "`"
622- if all_strikeout :
623- text = "~~" + text + "~~"
624611 if all_italic :
625- text = "* " + text + "* "
612+ text = "_ " + text + "_ "
626613 if all_bold :
627614 text = "**" + text + "**"
615+ if all_strikeout :
616+ text = "~~" + text + "~~"
628617 if hdr_string != prev_hdr_string :
629618 out_string += hdr_string + text + "\n "
630619 else :
@@ -637,6 +626,23 @@ def write_text(
637626
638627 prev_hdr_string = hdr_string
639628
629+ # start or extend a code block
630+ if all_mono and not IGNORE_CODE :
631+ if not code : # if not already in code output mode:
632+ out_string += "```\n " # switch on "code" mode
633+ code = True
634+ # compute approx. distance from left - assuming a width
635+ # of 0.5*fontsize.
636+ delta = int ((lrect .x0 - clip .x0 ) / (spans [0 ]["size" ] * 0.5 ))
637+ indent = " " * delta
638+
639+ out_string += indent + text + "\n "
640+ continue # done with this line
641+
642+ if code and not all_mono :
643+ out_string += "```\n " # switch off code mode
644+ code = False
645+
640646 span0 = spans [0 ]
641647 bno = span0 ["block" ] # block number of line
642648 if bno != prev_bno :
@@ -660,30 +666,30 @@ def write_text(
660666
661667 for i , s in enumerate (spans ): # iterate spans of the line
662668 # decode font properties
663- mono = s ["flags" ] & 8 and IGNORE_CODE is False
669+ mono = s ["flags" ] & 8
664670 bold = s ["flags" ] & 16 or s ["char_flags" ] & 8
665671 italic = s ["flags" ] & 2
666672 strikeout = s ["char_flags" ] & 1
667673
668- if mono :
669- # this is text in some monospaced font
670- out_string += f"`{ s ['text' ].strip ()} ` "
671- continue
674+ # if mono:
675+ # # this is text in some monospaced font
676+ # out_string += f"`{s['text'].strip()}` "
677+ # continue
672678
673679 prefix = ""
674680 suffix = ""
681+ if mono :
682+ prefix = "`" + prefix
683+ suffix += "`"
675684 if bold :
676685 prefix = "**" + prefix
677686 suffix += "**"
678687 if italic :
679- prefix = "* " + prefix
680- suffix += "* "
688+ prefix = "_ " + prefix
689+ suffix += "_ "
681690 if strikeout :
682691 prefix = "~~" + prefix
683692 suffix += "~~"
684- if mono :
685- prefix = "`" + prefix
686- suffix += "`"
687693
688694 # convert intersecting link to markdown syntax
689695 ltext = resolve_links (parms .links , s )
@@ -831,9 +837,12 @@ def page_is_ocr(page):
831837
832838 For this to be true, all text must be written as "ignore-text".
833839 """
834- text_types = set ([b [0 ] for b in page .get_bboxlog () if "text" in b [0 ]])
835- if text_types == {"ignore-text" }:
836- return True
840+ try :
841+ text_types = set ([b [0 ] for b in page .get_bboxlog () if "text" in b [0 ]])
842+ if text_types == {"ignore-text" }:
843+ return True
844+ except :
845+ pass
837846 return False
838847
839848 def get_bg_color (page ):
@@ -934,7 +943,9 @@ def get_page_output(
934943 parms .graphics = []
935944 parms .words = []
936945 parms .line_rects = []
937- parms .accept_invisible = page_is_ocr (page ) # accept invisible text
946+ parms .accept_invisible = (
947+ page_is_ocr (page ) or ignore_alpha
948+ ) # accept invisible text
938949
939950 # determine background color
940951 parms .bg_color = get_bg_color (page )
@@ -958,6 +969,8 @@ def get_page_output(
958969 img_info = []
959970 for i in range (len (img_info )):
960971 img_info [i ]["bbox" ] = pymupdf .Rect (img_info [i ]["bbox" ])
972+
973+ # filter out images that are too small or outside the clip
961974 img_info = [
962975 i
963976 for i in img_info
@@ -967,8 +980,19 @@ def get_page_output(
967980 and i ["bbox" ].width > 3
968981 and i ["bbox" ].height > 3
969982 ]
983+
970984 # sort descending by image area size
971985 img_info .sort (key = lambda i : abs (i ["bbox" ]), reverse = True )
986+
987+ # subset of images truly inside the clip
988+ sane = [i for i in img_info if parms .clip not in i ["bbox" ].irect ]
989+ if len (sane ) < len (img_info ): # found some
990+ img_info = sane # use those images instead
991+ # output full page image
992+ name = save_image (parms , parms .clip , "full" )
993+ if name :
994+ parms .md_string += GRAPHICS_TEXT % name
995+
972996 img_info = img_info [:30 ] # only accept the largest up to 30 images
973997 # run from back to front (= small to large)
974998 for i in range (len (img_info ) - 1 , 0 , - 1 ):
@@ -1152,7 +1176,7 @@ def get_page_output(
11521176 0
11531177 | mupdf .FZ_STEXT_CLIP
11541178 | mupdf .FZ_STEXT_ACCURATE_BBOXES
1155- | mupdf .FZ_STEXT_IGNORE_ACTUALTEXT
1179+ # | mupdf.FZ_STEXT_IGNORE_ACTUALTEXT
11561180 | 32768 # mupdf.FZ_STEXT_COLLECT_STYLES
11571181 )
11581182 # optionally replace 0xFFFD by glyph number
@@ -1253,7 +1277,7 @@ def extract_images_on_page_simple_drop(page, parms, image_size_limit):
12531277 import time
12541278
12551279 try :
1256- filename = "sample_document.pdf"
1280+ filename = sys . argv [ 1 ]
12571281 except IndexError :
12581282 print (f"Usage:\n python { os .path .basename (__file__ )} input.pdf" )
12591283 sys .exit ()
@@ -1284,11 +1308,6 @@ def extract_images_on_page_simple_drop(page, parms, image_size_limit):
12841308 md_string = to_markdown (
12851309 doc ,
12861310 pages = pages ,
1287- # write_images=True,
1288- force_text = True ,
1289- ignore_images = True ,
1290- ignore_graphics = True ,
1291- table_strategy = None ,
12921311 )
12931312 FILENAME = doc .name
12941313 # output to a text file with extension ".md"
0 commit comments