Merge pull request #108 from pymupdf/v0.0.11

JorjMcKie · web-flow · commit acc4001e19dc · 2024-08-22T12:45:46.000-04:00
Some fixes
diff --git a/docs/src/changes.rst b/docs/src/changes.rst
@@ -0,0 +1,103 @@
+.. include:: header.rst
+
+
+Change Log
+===========================================================================
+
+Changes in version 0.0.11
+--------------------------
+
+Fixes:
+~~~~~~~
+
+* `90 <https://github.com/pymupdf/RAG/issues/90>`_ "'Quad' object has no attribute 'tl'"
+* `88 <https://github.com/pymupdf/RAG/issues/88>`_ "Bug in is_significant function"
+
+
+Improvements:
+~~~~~~~~~~~~~~
+* Extended the list of known bullet point characters.
+
+
+Changes in version 0.0.10
+--------------------------
+
+Fixes:
+~~~~~~~
+
+* `73 <https://github.com/pymupdf/RAG/issues/73>`_ "bug in to_markdown internal function"
+* `74 <https://github.com/pymupdf/RAG/issues/74>`_ "minimum area for images & vector graphics"
+* `75 <https://github.com/pymupdf/RAG/issues/75>`_ "Poor Markdown Generation for Particular PDF"
+* `76 <https://github.com/pymupdf/RAG/issues/76>`_ "suggestion on useful api parameters"
+
+
+Improvements:
+~~~~~~~~~~~~~~
+* Improved recognition of "insignificant" vector graphics. Graphics like text highlights or borders will be ignored.
+* The format of saved images can now be controlled via new parameter `image_format`.
+* Images can be stored in a specific folder via the new parameter `image_path`.
+* Images are **not stored if contained** in another image on the same page.
+* Images are **not stored if too small:** if width or height is less than 5% of the corresponding page dimension.
+* All text is always written. If `write_images=True`, text on images / graphics can be suppressed by setting `force_text=False`.
+
+
+Changes in version 0.0.9
+--------------------------
+
+Fixes:
+~~~~~~~
+
+* `71 <https://github.com/pymupdf/RAG/issues/71>`_ "Unexpected results in pymupdf4llm but pymupdf works"
+* `68 <https://github.com/pymupdf/RAG/issues/68>`_ "Issue with text extraction near footer of page"
+
+
+Improvements:
+~~~~~~~~~~~~~~
+* Improved identification of scattered text span particles. This should address most issues with out-of-sequence situations.
+* We now correctly process rotated pages (see issue #68).
+
+
+Changes in version 0.0.8
+--------------------------
+
+Fixes:
+~~~~~~~
+
+* `65 <https://github.com/pymupdf/RAG/issues/65>`_ Fix typo in `pymupdf_rag.py`.
+
+
+Changes in version 0.0.7
+--------------------------
+
+Fixes:
+~~~~~~~
+
+* `54 <https://github.com/pymupdf/RAG/issues/54>`_ "Mistakes in orchestrating sentences". Additional fix: text extraction no longer uses the TEXT_DEHYPHENATE flag bit.
+
+Improvements:
+~~~~~~~~~~~~~~~~
+
+* Improved the algorithm dealing with vector graphics. Vector graphics are now more reliably classified as irrelevant: We now detect when "strokes" only exist in the neighborhood of the graphic's bounding box border itself. This is quite often the case for code snippets.
+
+
+Changes in version 0.0.6
+--------------------------
+
+Fixes:
+~~~~~~~
+
+* `55 <https://github.com/pymupdf/RAG/issues/55>`_ "Bug in helpers/multi_column.py - IndexError: list index out of range"
+* `54 <https://github.com/pymupdf/RAG/issues/54>`_ "Mistakes in orchestrating sentences"
+* `52 <https://github.com/pymupdf/RAG/issues/52>`_ "Chunking of text files"
+* Partial fix for `41 <https://github.com/pymupdf/RAG/issues/41>`_ / `40 <https://github.com/pymupdf/RAG/issues/40>`_. Improved page column detection, but still no silver bullet for overly complex page layouts.
+
+Improvements:
+~~~~~~~~~~~~~~~~
+
+* New parameter `dpi` to specify the resolution of images.
+* New parameters `page_width` / `page_height` for easily processing reflowable documents (Text, Office, e-books).
+* New parameter `graphics_limit` to avoid spending runtime on value-less content.
+* New parameter `table_strategy` to directly control the table detection strategy.
+
+.. include:: footer.rst
+
diff --git a/pymupdf4llm/pymupdf4llm/helpers/get_text_lines.py b/pymupdf4llm/pymupdf4llm/helpers/get_text_lines.py
@@ -69,7 +69,9 @@ def sanitize_spans(line):
         Returns:
             A list of sorted, and potentially cleaned-up spans
         """
-        line.sort(key=lambda s: s["bbox"].x0)  # sort left to right
+        # sort ascending horizontally
+        line.sort(key=lambda s: s["bbox"].x0)
+        # join spans, delete duplicates
         for i in range(len(line) - 1, 0, -1):  # iterate back to front
             s0 = line[i - 1]
             s1 = line[i]
@@ -78,13 +80,17 @@ def sanitize_spans(line):
             delta = s1["size"] * 0.1
             if s0["bbox"].x1 + delta < s1["bbox"].x0:
                 continue  # all good: no joining neded
+
+            # We need to join bbox and text of two consecutive spans
+            # On occasion, spans may also be duplicated.
+            if s0["text"] != s1["text"] or s0["bbox"] != s1["bbox"]:
+                s0["text"] += s1["text"]
             s0["bbox"] |= s1["bbox"]  # join boundary boxes
-            s0["text"] += s1["text"]  # join the text
             del line[i]  # delete the joined-in span
             line[i - 1] = s0  # update the span
         return line
 
-    if clip is None:  # use TextPage if not provided
+    if clip is None:  # use TextPage rect if not provided
         clip = textpage.rect
     # extract text blocks - if bbox is not empty
     blocks = [
@@ -126,10 +132,7 @@ def sanitize_spans(line):
         sbbox = s["bbox"]  # this bbox
         sbbox0 = line[-1]["bbox"]  # previous bbox
         # if any of top or bottom coordinates are close enough, join...
-        if (
-            abs(sbbox.y1 - sbbox0.y1) <= y_delta
-            or abs(sbbox.y0 - sbbox0.y0) <= y_delta
-        ):
+        if abs(sbbox.y1 - sbbox0.y1) <= y_delta or abs(sbbox.y0 - sbbox0.y0) <= y_delta:
             line.append(s)  # append to this line
             lrect |= sbbox  # extend line rectangle
             continue
@@ -150,9 +153,7 @@ def sanitize_spans(line):
     return nlines
 
 
-def get_text_lines(
-    page, *, textpage=None, clip=None, sep="\t", tolerance=3, ocr=False
-):
+def get_text_lines(page, *, textpage=None, clip=None, sep="\t", tolerance=3, ocr=False):
     """Extract text by line keeping natural reading sequence.
 
     Notes:
diff --git a/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py b/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py
@@ -40,15 +40,15 @@
 if fitz.pymupdf_version_tuple < (1, 24, 2):
     raise NotImplementedError("PyMuPDF version 1.24.2 or later is needed.")
 
-bullet = (
+bullet = [
     "- ",
     "* ",
     chr(0xF0A7),
     chr(0xF0B7),
     chr(0xB7),
     chr(8226),
-    chr(9679),
-)
+] + list(map(chr, range(9642, 9680)))
+
 GRAPHICS_TEXT = "\n![](%s)\n"
 
 
@@ -193,7 +193,7 @@ def is_significant(box, paths):
         for itm in p["items"]:
             if itm[0] in ("l", "c"):  # line or curve
                 points.extend(itm[1:])  # append all the points
-            elif itm[0] == "q":  # quad
+            elif itm[0] == "qu":  # quad
                 q = itm[1]
                 # follow corners anti-clockwise
                 points.extend([q.ul, q.ll, q.lr, q.ur, q.ul])