Merge pull request #242 from pymupdf/Version-0.0.20

JorjMcKie · web-flow · commit 266ee8279ddc · 2025-04-04T15:41:00.000-04:00
Changes Version 0.0.20
diff --git a/CHANGES.md b/CHANGES.md
@@ -1,5 +1,23 @@
 # Change Log
 
+## Changes in version 0.0.20
+
+### Fixes:
+
+* [171](https://github.com/pymupdf/RAG/issues/171) - Text rects overlap with tables and images that should be excluded.
+* [189](https://github.com/pymupdf/RAG/issues/189) - The position of the extracted image is incorrect.
+* [238](https://github.com/pymupdf/RAG/issues/238) - When text is laid out around the picture, text extraction is missing.
+
+### Other Changes:
+
+* Added **_new parameter_** `ignore_images`: (bool), optional. `True` will not consider images in any way. May be useful for pages where a plethora of images prevents meaningful layout analysis. Typical examples are PowerPoint slides and derived / similar pages.
+
+* Added **_new parameter_** `ignore_graphics`: (bool), optional. `True` will not consider graphics except for table detection. May be useful for pages where a plethora of vector graphics prevents meaningful layout analysis. Typical examples are PowerPoint slides and derived / similar pages.
+
+* Added **_new parameter_** to class `IdentifyHeaders`: Use `max_levels` (integer <= 6) to limit the generation of header tag levels. For example, `headers = pymupdf4llm.IdentifyHeaders(doc, max_levels=3)` ensures that only up to 3 header levels will ever be generated. Any text with a font size less than the value of `###` will be body text. In this case, the markdown generation itself would be coded as `md = pymupdf4llm.to_markdown(doc, hdr_info=headers, ...)`.
+
+* Changed parameter `table_strategy`: When specifying `None`, no effort to detect tables will be made. This can be useful when tables are of no interest or known to not exist in a given file. This will speed up processing significantly. Be prepared to see more changes and extensions here.
+
 
 ## Changes in version 0.0.19
 
diff --git a/pdf4llm/setup.py b/pdf4llm/setup.py
@@ -13,11 +13,11 @@
     "Programming Language :: Python :: 3",
     "Topic :: Utilities",
 ]
-requires = ["pymupdf4llm>=0.0.19"]
+requires = ["pymupdf4llm==0.0.20"]
 
 setuptools.setup(
     name="pdf4llm",
-    version="0.0.19",
+    version="0.0.20",
     author="Artifex",
     author_email="support@artifex.com",
     description="PyMuPDF Utilities for LLM/RAG",
diff --git a/pymupdf4llm/pymupdf4llm/__init__.py b/pymupdf4llm/pymupdf4llm/__init__.py
@@ -1,6 +1,6 @@
 from .helpers.pymupdf_rag import IdentifyHeaders, to_markdown
 
-__version__ = "0.0.19"
+__version__ = "0.0.20"
 version = __version__
 version_tuple = tuple(map(int, version.split(".")))
 
diff --git a/pymupdf4llm/pymupdf4llm/helpers/multi_column.py b/pymupdf4llm/pymupdf4llm/helpers/multi_column.py
@@ -76,6 +76,7 @@ def column_boxes(
     textpage=None,
     paths=None,
     avoid=None,
+    ignore_images=False,
 ):
     """Determine bboxes which wrap a column on the page.
 
@@ -261,7 +262,9 @@ def join_rects_phase3(bboxes, path_rects, cache):
                         continue
 
                     # do not join different backgrounds
-                    if in_bbox_using_cache(prect0, path_rects, cache) != in_bbox_using_cache(prect1, path_rects, cache):
+                    if in_bbox_using_cache(
+                        prect0, path_rects, cache
+                    ) != in_bbox_using_cache(prect1, path_rects, cache):
                         continue
                     temp = prect0 | prect1
                     test = set(
@@ -333,11 +336,12 @@ def join_rects_phase3(bboxes, path_rects, cache):
     clip.y1 -= footer_margin  # Remove footer area
     clip.y0 += header_margin  # Remove header area
 
-    paths = [
-        p
-        for p in page.get_drawings()
-        if p["rect"].width < clip.width and p["rect"].height < clip.height
-    ]
+    if paths is None:
+        paths = [
+            p
+            for p in page.get_drawings()
+            if p["rect"].width < clip.width and p["rect"].height < clip.height
+        ]
 
     if textpage is None:
         textpage = page.get_textpage(clip=clip, flags=pymupdf.TEXTFLAGS_TEXT)
@@ -371,8 +375,9 @@ def join_rects_phase3(bboxes, path_rects, cache):
     path_rects.sort(key=lambda b: (b.y0, b.x0))
 
     # bboxes of images on page, no need to sort them
-    for item in page.get_images():
-        img_bboxes.extend(page.get_image_rects(item[0]))
+    if ignore_images is False:
+        for item in page.get_images():
+            img_bboxes.extend(page.get_image_rects(item[0]))
 
     # blocks of text on page
     blocks = textpage.extractDICT()["blocks"]
@@ -433,7 +438,9 @@ def join_rects_phase3(bboxes, path_rects, cache):
                 continue
 
             # never join across different background colors
-            if in_bbox_using_cache(nbb, path_rects, cache) != in_bbox_using_cache(bb, path_rects, cache):
+            if in_bbox_using_cache(nbb, path_rects, cache) != in_bbox_using_cache(
+                bb, path_rects, cache
+            ):
                 continue
 
             temp = bb | nbb  # temporary extension of new block
diff --git a/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py b/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py
diff --git a/pymupdf4llm/setup.py b/pymupdf4llm/setup.py