Skip to content

Commit d5df039

Browse files
authored
Merge pull request #123 from pymupdf/v0.0.13
Version 0.0.13
2 parents 78952f1 + 1e0f226 commit d5df039

File tree

7 files changed

+116
-117
lines changed

7 files changed

+116
-117
lines changed

docs/src/changes.rst

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,22 @@
44
Change Log
55
===========================================================================
66

7+
Changes in version 0.0.13
8+
--------------------------
9+
10+
Fixes:
11+
~~~~~~~
12+
13+
* `112 <https://github.com/pymupdf/RAG/issues/112>`_ "Invalid bandwriter header dimensions/setup"
14+
15+
16+
Improvements:
17+
~~~~~~~~~~~~~~
18+
* New parameter `ignore_code` suppresses special formatting of text in mono-spaced fonts.
19+
* New parameter `extract_words` enforces `page_chunks=True` and adds a "words" list to each page dictionary.
20+
21+
22+
723
Changes in version 0.0.11
824
--------------------------
925

pymupdf4llm/pymupdf4llm/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
from .helpers.pymupdf_rag import IdentifyHeaders, to_markdown
22

3-
__version__ = "0.0.12"
3+
__version__ = "0.0.13"
44
version = __version__
55
version_tuple = tuple(map(int, version.split(".")))
66

pymupdf4llm/pymupdf4llm/helpers/get_text_lines.py

Lines changed: 7 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -15,10 +15,7 @@
1515
import string
1616
import sys
1717

18-
try:
19-
import pymupdf as fitz # available with v1.24.3
20-
except ImportError:
21-
import fitz
18+
import pymupdf
2219

2320
WHITE = set(string.whitespace)
2421

@@ -96,13 +93,13 @@ def sanitize_spans(line):
9693
blocks = [
9794
b
9895
for b in textpage.extractDICT()["blocks"]
99-
if b["type"] == 0 and not fitz.Rect(b["bbox"]).is_empty
96+
if b["type"] == 0 and not pymupdf.Rect(b["bbox"]).is_empty
10097
]
10198
spans = [] # all spans in TextPage here
10299
for bno, b in enumerate(blocks): # the numbered blocks
103100
for lno, line in enumerate(b["lines"]): # the numbered lines
104101
for sno, s in enumerate(line["spans"]): # the numbered spans
105-
sbbox = fitz.Rect(s["bbox"]) # span bbox as a Rect
102+
sbbox = pymupdf.Rect(s["bbox"]) # span bbox as a Rect
106103
mpoint = (sbbox.tl + sbbox.br) / 2 # middle point
107104
if mpoint not in clip:
108105
continue
@@ -165,16 +162,16 @@ def get_text_lines(page, *, textpage=None, clip=None, sep="\t", tolerance=3, ocr
165162
cases of text replaced by way of redaction annotations.
166163
167164
Args:
168-
page: (fitz.Page)
165+
page: (pymupdf.Page)
169166
textpage: (TextPage) if None, a temporary one is created.
170167
clip: (rect-like) only consider spans inside this area
171168
sep: (str) use this string when joining multiple MuPDF lines.
172169
Returns:
173170
String of plain text in reading sequence.
174171
"""
175-
textflags = fitz.TEXT_MEDIABOX_CLIP
172+
textflags = pymupdf.TEXT_MEDIABOX_CLIP
176173
page.remove_rotation()
177-
prect = page.rect if not clip else fitz.Rect(clip) # area to consider
174+
prect = page.rect if not clip else pymupdf.Rect(clip) # area to consider
178175

179176
xsep = sep if sep == "|" else ""
180177

@@ -255,7 +252,7 @@ def get_text_lines(page, *, textpage=None, clip=None, sep="\t", tolerance=3, ocr
255252
import pathlib
256253

257254
filename = sys.argv[1]
258-
doc = fitz.open(filename)
255+
doc = pymupdf.open(filename)
259256
text = ""
260257
for page in doc:
261258
text += get_text_lines(page, sep=" ") + "\n" + chr(12) + "\n"

pymupdf4llm/pymupdf4llm/helpers/multi_column.py

Lines changed: 8 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@
4343
# for each page execute
4444
bboxes = column_boxes(page, footer_margin=50, no_image_text=True)
4545
46-
bboxes is a list of fitz.IRect objects, that are sorted ascending by their
46+
bboxes is a list of pymupdf.IRect objects that are sorted ascending by their
4747
y0, then x0 coordinates. Their text content can be extracted by all PyMuPDF
4848
get_text() variants, like for instance the following:
4949
for rect in bboxes:
@@ -62,10 +62,7 @@
6262

6363
import string
6464

65-
try:
66-
import pymupdf as fitz
67-
except ImportError:
68-
import fitz
65+
import pymupdf
6966

7067

7168
def column_boxes(
@@ -103,7 +100,7 @@ def is_white(text):
103100
paths = page.get_drawings()
104101

105102
if textpage is None:
106-
textpage = page.get_textpage(clip=clip, flags=fitz.TEXTFLAGS_TEXT)
103+
textpage = page.get_textpage(clip=clip, flags=pymupdf.TEXTFLAGS_TEXT)
107104

108105
bboxes = []
109106

@@ -151,44 +148,6 @@ def can_extend(temp, bb, bboxlist, vert_bboxes):
151148

152149
return True
153150

154-
# def extend_right(bboxes, width, path_bboxes, vert_bboxes, img_bboxes):
155-
# """Extend a bbox to the right page border.
156-
157-
# Whenever there is no text to the right of a bbox, enlarge it up
158-
# to the right page border.
159-
160-
# Args:
161-
# bboxes: (list[IRect]) bboxes to check
162-
# width: (int) page width
163-
# path_bboxes: (list[IRect]) bboxes with a background color
164-
# vert_bboxes: (list[IRect]) bboxes with vertical text
165-
# img_bboxes: (list[IRect]) bboxes of images
166-
# Returns:
167-
# Potentially modified bboxes.
168-
# """
169-
# for i, bb in enumerate(bboxes):
170-
# # do not extend text with background color
171-
# if in_bbox(bb, path_bboxes):
172-
# continue
173-
174-
# # do not extend text in images
175-
# if in_bbox(bb, img_bboxes):
176-
# continue
177-
178-
# # temp extends bb to the right page border
179-
# temp = +bb
180-
# temp.x1 = width
181-
182-
# # do not cut through colored background or images
183-
# if intersects_bboxes(temp, path_bboxes + vert_bboxes + img_bboxes):
184-
# continue
185-
186-
# # also, do not intersect other text bboxes
187-
# check = can_extend(temp, bb, bboxes, vert_bboxes)
188-
# if check:
189-
# bboxes[i] = temp # replace with enlarged bbox
190-
191-
# return [b for b in bboxes if b != None]
192151

193152
def join_rects_phase1(bboxes):
194153
"""Postprocess identified text blocks, phase 1.
@@ -336,7 +295,7 @@ def clean_nblocks(nblocks):
336295

337296
# Make block rectangles, ignoring non-horizontal text
338297
for b in blocks:
339-
bbox = fitz.IRect(b["bbox"]) # bbox of the block
298+
bbox = pymupdf.IRect(b["bbox"]) # bbox of the block
340299

341300
# ignore text written upon images
342301
if no_image_text and in_bbox(bbox, img_bboxes):
@@ -352,9 +311,9 @@ def clean_nblocks(nblocks):
352311
vert_bboxes.append(bbox)
353312
continue
354313

355-
srect = fitz.EMPTY_IRECT()
314+
srect = pymupdf.EMPTY_IRECT()
356315
for line in b["lines"]:
357-
lbbox = fitz.IRect(line["bbox"])
316+
lbbox = pymupdf.IRect(line["bbox"])
358317
text = "".join([s["text"].strip() for s in line["spans"]])
359318
if len(text) > 1:
360319
srect |= lbbox
@@ -435,7 +394,7 @@ def clean_nblocks(nblocks):
435394
"""
436395
import sys
437396

438-
RED = fitz.pdfcolor["red"]
397+
RED = pymupdf.pdfcolor["red"]
439398
# get the file name
440399
filename = sys.argv[1]
441400

@@ -452,7 +411,7 @@ def clean_nblocks(nblocks):
452411
header_margin = 50
453412

454413
# open document
455-
doc = fitz.open(filename)
414+
doc = pymupdf.open(filename)
456415

457416
# iterate over the pages
458417
for page in doc:

0 commit comments

Comments
 (0)