Skip to content

Commit 70c90a4

Browse files
authored
Merge pull request #313 from pymupdf/v0.1.8
Version 0.1.8
2 parents 3e17b82 + b796b75 commit 70c90a4

File tree

6 files changed

+52
-19
lines changed

6 files changed

+52
-19
lines changed

pdf4llm/setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
with open(os.path.join(setup_py_cwd, "README.md"), encoding="utf-8") as f:
77
readme = f.read()
88

9-
version = "0.1.7"
9+
version = "0.1.8"
1010

1111
classifiers = [
1212
"Development Status :: 5 - Production/Stable",

pymupdf4llm/pymupdf4llm/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
version = VERSION
1515
version_tuple = tuple(map(int, version.split(".")))
1616

17-
if not callable(pymupdf._get_layout):
17+
if pymupdf._get_layout is None:
1818
from .helpers.pymupdf_rag import IdentifyHeaders, TocHeaders, to_markdown
1919

2020
pymupdf._warn_layout_once() # recommend pymupdf_layout

pymupdf4llm/pymupdf4llm/helpers/document_layout.py

Lines changed: 14 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -764,23 +764,31 @@ def parse_document(
764764
if pymupdf.table._iou(tab.bbox, clip) > 0.6
765765
][0]
766766
cells = [[c for c in row.cells] for row in table.rows]
767-
767+
row_count = table.row_count
768768
if table.header.external: # if the header is outside the table
769769
cells.insert(0, table.header.cells) # insert a row
770-
table.row_count += 1 # increase row count
770+
row_count += 1 # increase row count
771771

772772
layoutbox.table = {
773773
"bbox": list(table.bbox),
774-
"row_count": table.row_count,
774+
"row_count": row_count,
775775
"col_count": table.col_count,
776776
"cells": cells,
777-
"extract": table.extract(),
778777
}
778+
779+
layoutbox.table["extract"] = utils.table_extract(
780+
textpage,
781+
layoutbox,
782+
)
783+
779784
layoutbox.table["markdown"] = utils.table_to_markdown(
780-
textpage, layoutbox, markdown=True
785+
textpage,
786+
layoutbox,
787+
markdown=True,
781788
)
789+
782790
except Exception as e:
783-
print(f"table detection error '{e}'")
791+
print(f"table detection error '{e}' on page {page.number+1}")
784792
# table structure not detected: treat like an image
785793
pix = page.get_pixmap(clip=clip, dpi=document.image_dpi)
786794
layoutbox.image = pix.tobytes(document.image_format)

pymupdf4llm/pymupdf4llm/helpers/utils.py

Lines changed: 34 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -514,25 +514,34 @@ def extract_cells(textpage, cell, markdown=False):
514514
Returns:
515515
A string with the text extracted from the cell.
516516
"""
517+
518+
def outside_cell(bbox, cell):
    """Tell whether rectangle ``bbox`` lies completely outside ``cell``.

    Both arguments are 4-item sequences; items 0/2 are compared against
    each other and items 1/3 are compared against each other
    (presumably ``(x0, y0, x1, y1)`` -- confirm against the callers).
    The comparisons are inclusive, so rectangles that merely share an
    edge are reported as outside.

    Returns:
        True if the rectangles are separated along either axis,
        False otherwise.
    """
    # separated along the first axis (items 0 and 2)
    if bbox[0] >= cell[2] or bbox[2] <= cell[0]:
        return True
    # otherwise the second axis (items 1 and 3) decides
    return bbox[1] >= cell[3] or bbox[3] <= cell[1]
526+
517527
text = ""
518528
for block in textpage.extractRAWDICT()["blocks"]:
519529
if block["type"] != 0:
520530
continue
531+
if outside_cell(block["bbox"], cell):
532+
continue
521533
for line in block["lines"]:
522-
new_line = True
534+
if outside_cell(line["bbox"], cell):
535+
continue
523536
if text: # must be a new line in the cell
524-
if text.endswith("$"):
525-
text += " "
526-
elif text.endswith("$ "):
527-
pass
528-
else:
529-
text += "<br>" if markdown else "\n"
537+
text += "<br>" if markdown else "\n"
530538

531539
# strikeout detection only works with horizontal text
532540
horizontal = line["dir"] == (0, 1) or line["dir"] == (1, 0)
533541

534542
for span in line["spans"]:
535-
sbbox = span["bbox"]
543+
if outside_cell(span["bbox"], cell):
544+
continue
536545
# only include chars with more than 50% bbox overlap
537546
span_text = ""
538547
for char in span["chars"]:
@@ -576,7 +585,7 @@ def extract_cells(textpage, cell, markdown=False):
576585
text += " "
577586
else:
578587
text += prefix + span_text + suffix
579-
588+
text = text.replace("$<br>", "$ ").replace(" $ <br>", "$ ")
580589
return text.strip()
581590

582591

@@ -635,3 +644,19 @@ def table_to_markdown(textpage, table_item, markdown=True):
635644
line += "\n"
636645
output += line
637646
return output + "\n"
647+
648+
649+
def table_extract(textpage, table_item):
    """Build a grid of plain-text cell contents for a table.

    Args:
        textpage: passed through unchanged to ``extract_cells``.
        table_item: object whose ``table`` attribute is a dict providing
            ``"row_count"``, ``"col_count"`` and ``"cells"`` (rows of
            cell boxes, where an entry may be ``None``).

    Returns:
        A list of ``row_count`` rows, each a list of ``col_count``
        entries. An entry is the result of
        ``extract_cells(..., markdown=False)`` for the matching cell box,
        or ``None`` where the cell box is ``None`` or absent.
    """
    info = table_item.table
    n_rows = info["row_count"]
    n_cols = info["col_count"]
    boxes = info["cells"]

    # pre-fill the result grid with None placeholders
    grid = [[None for _ in range(n_cols)] for _ in range(n_rows)]

    for r, box_row in enumerate(boxes):
        for c, box in enumerate(box_row):
            if box is None:
                continue  # keep the placeholder for missing cells
            grid[r][c] = extract_cells(textpage, box, markdown=False)

    return grid
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
# Generated file - do not edit.
22
MINIMUM_PYMUPDF_VERSION = (1, 26, 6)
3-
VERSION = '0.1.7'
3+
VERSION = '0.1.8'

pymupdf4llm/setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
"Topic :: Utilities",
1515
]
1616

17-
version = "0.1.7"
17+
version = "0.1.8"
1818
requires = ["pymupdf>=1.26.6", "tabulate"]
1919

2020
text = requires[0].split("=")[1]

0 commit comments

Comments
 (0)