Skip to content

Commit

Permalink
Merge branch 'master' of https://github.com/myhloli/Magic-PDF
Browse files Browse the repository at this point in the history
  • Loading branch information
liusilu committed Mar 27, 2024
2 parents e736062 + 7162deb commit fffee0a
Show file tree
Hide file tree
Showing 19 changed files with 2,350 additions and 342 deletions.
2 changes: 1 addition & 1 deletion demo/demo_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ def get_json_from_local_or_s3(book_name=None):
s3_config = get_s3_config(json_path)
file_content = read_file(json_path, s3_config)
json_str = file_content.decode("utf-8")
logger.info(json_str)
# logger.info(json_str)
json_object = json.loads(json_str)
return json_object

Expand Down
97 changes: 57 additions & 40 deletions demo/ocr_demo.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,16 @@
from loguru import logger
from pathlib import Path

from app.common.s3 import get_s3_config
from demo.demo_test import get_json_from_local_or_s3
from magic_pdf.dict2md.ocr_mkcontent import ocr_mk_mm_markdown_with_para, ocr_mk_nlp_markdown, ocr_mk_mm_markdown, ocr_mk_mm_standard_format
from magic_pdf.dict2md.ocr_mkcontent import (
ocr_mk_mm_markdown_with_para,
ocr_mk_nlp_markdown,
ocr_mk_mm_markdown,
ocr_mk_mm_standard_format,
ocr_mk_mm_markdown_with_para_and_pagination,
make_standard_format_with_para
)
from magic_pdf.libs.commons import join_path
from magic_pdf.pdf_parse_by_ocr import parse_pdf_by_ocr

Expand Down Expand Up @@ -35,50 +43,59 @@ def ocr_local_parse(ocr_pdf_path, ocr_json_file_path):
ocr_pdf_model_info = read_json_file(ocr_json_file_path)
pth = Path(ocr_json_file_path)
book_name = pth.name
save_tmp_path = os.path.join(os.path.dirname(__file__), "../..", "tmp", "unittest")
save_path = join_path(save_tmp_path, "md")
save_path_with_bookname = os.path.join(save_path, book_name)
text_content_save_path = f"{save_path_with_bookname}/book.md"
pdf_info_dict = parse_pdf_by_ocr(
ocr_pdf_path,
None,
ocr_pdf_model_info,
save_path,
book_name,
debug_mode=True)

parent_dir = os.path.dirname(text_content_save_path)
if not os.path.exists(parent_dir):
os.makedirs(parent_dir)

# markdown_content = mk_nlp_markdown(pdf_info_dict)
markdown_content = ocr_mk_mm_markdown_with_para(pdf_info_dict)

with open(text_content_save_path, "w", encoding="utf-8") as f:
f.write(markdown_content)

standard_format = ocr_mk_mm_standard_format(pdf_info_dict)
standard_format_save_path = f"{save_path_with_bookname}/standard_format.txt"
with open(standard_format_save_path, "w", encoding="utf-8") as f:
f.write(str(standard_format))

# logger.info(markdown_content)
# save_markdown(markdown_text, ocr_json_file_path)
ocr_parse_core(book_name, ocr_pdf_path, ocr_pdf_model_info)
except Exception as e:
logger.exception(e)


def ocr_online_parse(book_name, start_page_id=0, debug_mode=True):
    """Parse a book whose description JSON is looked up by *book_name*.

    The JSON object returned by ``get_json_from_local_or_s3`` supplies the
    PDF location (key ``"file_location"``) and the model output (key
    ``"doc_layout_result"``); both are handed to ``ocr_parse_core``.

    Args:
        book_name: Identifier passed to ``get_json_from_local_or_s3`` and
            used by ``ocr_parse_core`` as the output sub-directory name.
        start_page_id: Forwarded to ``ocr_parse_core``.
        debug_mode: Accepted for interface compatibility but not used here;
            ``ocr_parse_core`` has no such parameter.

    Any exception is logged with its traceback and swallowed, so this
    function never raises and always returns ``None``.
    """
    try:
        json_object = get_json_from_local_or_s3(book_name)
        s3_pdf_path = json_object["file_location"]
        s3_config = get_s3_config(s3_pdf_path)
        ocr_pdf_model_info = json_object.get("doc_layout_result")
        ocr_parse_core(
            book_name,
            s3_pdf_path,
            ocr_pdf_model_info,
            # Previously dropped: the caller's value never reached the core.
            start_page_id=start_page_id,
            s3_config=s3_config,
        )
    except Exception as e:
        logger.exception(e)


def ocr_parse_core(book_name, ocr_pdf_path, ocr_pdf_model_info, start_page_id=0, s3_config=None):
    """Run the OCR parse for one book and write its outputs to disk.

    Two files are written under
    ``<directory of this file>/../../tmp/unittest/md/<book_name>/``:

    * ``book.md`` - the result of ``ocr_mk_mm_markdown_with_para``.
    * ``standard_format.txt`` - the result of
      ``make_standard_format_with_para``, serialized as JSON.

    Args:
        book_name: Name of the output sub-directory; also forwarded to
            ``parse_pdf_by_ocr``.
        ocr_pdf_path: PDF location forwarded to ``parse_pdf_by_ocr``
            (callers in this file pass either a local path or an S3 path).
        ocr_pdf_model_info: Model output forwarded to ``parse_pdf_by_ocr``.
        start_page_id: Currently unused.
            NOTE(review): presumably meant to be forwarded to
            ``parse_pdf_by_ocr`` - confirm against its signature.
        s3_config: Forwarded to ``parse_pdf_by_ocr``; ``None`` by default.
    """
    save_tmp_path = os.path.join(os.path.dirname(__file__), "../..", "tmp", "unittest")
    save_path = join_path(save_tmp_path, "md")
    save_path_with_bookname = os.path.join(save_path, book_name)
    text_content_save_path = f"{save_path_with_bookname}/book.md"
    pdf_info_dict = parse_pdf_by_ocr(
        ocr_pdf_path,
        s3_config,
        ocr_pdf_model_info,
        save_path,
        book_name,
        debug_mode=True)

    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(os.path.dirname(text_content_save_path), exist_ok=True)

    markdown_content = ocr_mk_mm_markdown_with_para(pdf_info_dict)
    with open(text_content_save_path, "w", encoding="utf-8") as f:
        f.write(markdown_content)

    standard_format = make_standard_format_with_para(pdf_info_dict)
    standard_format_save_path = f"{save_path_with_bookname}/standard_format.txt"
    with open(standard_format_save_path, "w", encoding="utf-8") as f:
        # Dump standard_format as JSON text; keep non-ASCII characters readable.
        f.write(json.dumps(standard_format, ensure_ascii=False))


if __name__ == '__main__':
#ocr_pdf_path = r"D:\project\20231108code-clean\ocr\new\双栏\s0043-1354(02)00581-x.pdf"
#ocr_json_file_path = r"D:\project\20231108code-clean\ocr\new\双栏\s0043-1354(02)00581-x.json"
# ocr_pdf_path = r"D:\project\20231108code-clean\ocr\new\双栏\j.1540-627x.2006.00176.x.pdf"
# ocr_json_file_path = r"D:\project\20231108code-clean\ocr\new\双栏\j.1540-627x.2006.00176.x.json"
ocr_pdf_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/ocr_1.pdf"
ocr_json_file_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/ocr_1.json"
ocr_online_parse(book_name="数学新星网/edu_00001236")
ocr_local_parse(ocr_pdf_path, ocr_json_file_path)
pdf_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/j.1540-627x.2006.00176.x.pdf"
json_file_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/j.1540-627x.2006.00176.x.json"
# ocr_local_parse(pdf_path, json_file_path)
book_name = "科数网/edu_00011318"
ocr_online_parse(book_name)

pass
105 changes: 89 additions & 16 deletions magic_pdf/dict2md/ocr_mkcontent.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,19 @@
from magic_pdf.libs.commons import s3_image_save_path, join_path
from magic_pdf.libs.markdown_utils import ocr_escape_special_markdown_char
from magic_pdf.libs.ocr_content_type import ContentType
import wordninja
import re


def split_long_words(text):
    """Break over-long ASCII word runs in *text* into separate words.

    The text is split on single spaces. Within each segment, every run of
    word characters longer than 15 characters is replaced by the
    space-joined result of ``wordninja.split``. All other characters,
    including punctuation, tabs and newlines, are kept unchanged.

    Args:
        text: The string to process.

    Returns:
        The processed string; identical to *text* when no run qualifies.
    """
    segments = text.split(' ')
    for i, segment in enumerate(segments):
        # ``\W`` keeps whitespace such as "\n" or "\t" as its own token.
        # The earlier ``[^\w\s]`` dropped it and fused the adjacent words.
        tokens = re.findall(r'\w+|\W', segment)
        for j, token in enumerate(tokens):
            # NOTE(review): wordninja is understood to be an English-only
            # segmenter that discards non-ASCII characters - confirm. Long
            # non-ASCII runs (e.g. CJK) are therefore left untouched.
            if len(token) > 15 and token.isascii():
                tokens[j] = ' '.join(wordninja.split(token))
        segments[i] = ''.join(tokens)
    return ' '.join(segments)


def ocr_mk_nlp_markdown(pdf_info_dict: dict):
Expand Down Expand Up @@ -58,37 +71,96 @@ def ocr_mk_mm_markdown(pdf_info_dict: dict):
def ocr_mk_mm_markdown_with_para(pdf_info_dict: dict):
    """Build multimodal ("mm") Markdown for a whole document.

    Args:
        pdf_info_dict: Mapping of page key to page info; each page's
            ``"para_blocks"`` entry is rendered by
            ``ocr_mk_mm_markdown_with_para_core`` in ``"mm"`` mode.

    Returns:
        All paragraph strings of all pages joined with blank lines
        (``"\\n\\n"``); an empty string when there is nothing to render.
    """
    markdown = []
    for page_info in pdf_info_dict.values():
        paras_of_layout = page_info.get("para_blocks")
        # A page without paragraph blocks yields None (or an empty list);
        # skip it instead of handing None to the core, which iterates it.
        if not paras_of_layout:
            continue
        markdown.extend(ocr_mk_mm_markdown_with_para_core(paras_of_layout, "mm"))
    return '\n\n'.join(markdown)


def ocr_mk_nlp_markdown_with_para(pdf_info_dict: dict):
    """Build text-only ("nlp") Markdown for a whole document.

    Args:
        pdf_info_dict: Mapping of page key to page info; each page's
            ``"para_blocks"`` entry is rendered by
            ``ocr_mk_mm_markdown_with_para_core`` in ``"nlp"`` mode, in
            which image and table spans produce no output.

    Returns:
        All paragraph strings of all pages joined with blank lines
        (``"\\n\\n"``); an empty string when there is nothing to render.
    """
    markdown = []
    for page_info in pdf_info_dict.values():
        paras_of_layout = page_info.get("para_blocks")
        # Same guard as the pagination variant: pages without paragraph
        # blocks are skipped rather than passed on as None.
        if not paras_of_layout:
            continue
        markdown.extend(ocr_mk_mm_markdown_with_para_core(paras_of_layout, "nlp"))
    return '\n\n'.join(markdown)

def ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_dict: dict):
    """Render multimodal Markdown page by page.

    Args:
        pdf_info_dict: Mapping of page key to page info.

    Returns:
        A list with one dict per page that has non-empty ``"para_blocks"``,
        in mapping order. Each dict holds ``'page_no'`` (the page's key in
        *pdf_info_dict*) and ``'md_content'`` (that page's paragraphs joined
        with ``"\\n\\n"``). Pages without paragraph blocks are omitted.
    """
    pages = []
    for page_key, info in pdf_info_dict.items():
        layout_paras = info.get("para_blocks")
        if layout_paras:
            rendered = ocr_mk_mm_markdown_with_para_core(layout_paras, "mm")
            page_entry = {
                'page_no': page_key,
                'md_content': '\n\n'.join(rendered)
            }
            pages.append(page_entry)
    return pages


def ocr_mk_mm_markdown_with_para_core(paras_of_layout, mode):
    """Render one page's paragraph blocks into a list of Markdown strings.

    Args:
        paras_of_layout: Iterable of layouts; each layout is an iterable of
            paragraphs, each paragraph an iterable of lines, and each line
            a dict with a ``'spans'`` list. ``None`` or an empty value
            yields an empty result.
        mode: ``'mm'`` emits image/table spans as Markdown image links;
            ``'nlp'`` (or any other value) emits nothing for them.

    Returns:
        One string per non-blank paragraph: the paragraph's rendered spans
        separated by single spaces, stripped, followed by one trailing
        space.
    """
    page_markdown = []
    # Callers obtain this via page_info.get("para_blocks"), which is None
    # for pages without paragraph blocks.
    if not paras_of_layout:
        return page_markdown

    for paras in paras_of_layout:
        for para in paras:
            parts = []
            for line in para:
                for span in line['spans']:
                    span_type = span.get('type')
                    content = ''
                    if span_type == ContentType.Text:
                        content = ocr_escape_special_markdown_char(split_long_words(span['content']))
                    elif span_type == ContentType.InlineEquation:
                        content = f"${ocr_escape_special_markdown_char(span['content'])}$"
                    elif span_type == ContentType.InterlineEquation:
                        content = f"\n$$\n{ocr_escape_special_markdown_char(span['content'])}\n$$\n"
                    elif span_type in [ContentType.Image, ContentType.Table]:
                        if mode == 'mm':
                            content = f"\n![]({join_path(s3_image_save_path, span['image_path'])})\n"
                    if content != '':
                        parts.append(content)
            # Joining with ' ' and stripping gives the same text as appending
            # "content + ' '" per span and stripping the result.
            para_text = ' '.join(parts).strip()
            if para_text:
                page_markdown.append(para_text + ' ')
    return page_markdown

return '\n\n'.join(markdown)

def para_to_standard_format(para):
    """Convert one paragraph (a list of lines) to a standard-format dict.

    Args:
        para: List of line dicts, each with a ``'spans'`` list.

    Returns:
        * ``{}`` for an empty paragraph.
        * The result of ``line_to_standard_format`` for a single-line
          paragraph.
        * For a multi-line paragraph, a dict with ``'type': 'text'``,
          ``'text'`` (text and inline-equation spans, each followed by one
          space) and ``'inline_equation_num'`` (number of inline-equation
          spans).
    """
    if not para:
        return {}
    if len(para) == 1:
        return line_to_standard_format(para[0])

    para_text = ''
    inline_equation_num = 0
    for line in para:
        for span in line['spans']:
            span_type = span.get('type')
            if span_type == ContentType.Text:
                content = ocr_escape_special_markdown_char(split_long_words(span['content']))
            elif span_type == ContentType.InlineEquation:
                content = f"${ocr_escape_special_markdown_char(span['content'])}$"
                inline_equation_num += 1
            else:
                # Any other span type has no text form here. Without this
                # branch `content` was unbound on the first span, or the
                # previous span's text was appended a second time.
                continue
            para_text += content + ' '
    return {
        'type': 'text',
        'text': para_text,
        'inline_equation_num': inline_equation_num
    }

def make_standard_format_with_para(pdf_info_dict: dict):
    """Collect the standard-format entry of every paragraph in the document.

    Args:
        pdf_info_dict: Mapping of page key to page info; pages whose
            ``"para_blocks"`` entry is missing or empty are skipped.

    Returns:
        A flat list with one ``para_to_standard_format`` result per
        paragraph, in page, layout and paragraph order.
    """
    collected = []
    for page_info in pdf_info_dict.values():
        layouts = page_info.get("para_blocks")
        if layouts:
            for layout_paras in layouts:
                collected.extend(
                    para_to_standard_format(single_para) for single_para in layout_paras
                )
    return collected


Expand Down Expand Up @@ -125,7 +197,8 @@ def line_to_standard_format(line):
line_text += f"${inline_equation}$"
inline_equation_num += 1
elif span['type'] == ContentType.Text:
line_text += span['content']
text_content = ocr_escape_special_markdown_char(span['content']) # 转义特殊符号
line_text += text_content
content = {
'type': 'text',
'text': line_text,
Expand Down
27 changes: 27 additions & 0 deletions magic_pdf/libs/boxbase.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,33 @@ def _is_in_or_part_overlap(box1, box2) -> bool:
y1_1 < y0_2 or # box1在box2的上边
y0_1 > y1_2) # box1在box2的下边

def _is_in_or_part_overlap_with_area_ratio(box1, box2, area_ratio_threshold=0.6):
"""
判断box1是否在box2里面,或者box1和box2有部分重叠,且重叠面积占box1的比例超过area_ratio_threshold
"""
if box1 is None or box2 is None:
return False

x0_1, y0_1, x1_1, y1_1 = box1
x0_2, y0_2, x1_2, y1_2 = box2

if not _is_in_or_part_overlap(box1, box2):
return False

# 计算重叠面积
x_left = max(x0_1, x0_2)
y_top = max(y0_1, y0_2)
x_right = min(x1_1, x1_2)
y_bottom = min(y1_1, y1_2)
overlap_area = (x_right - x_left) * (y_bottom - y_top)

# 计算box1的面积
box1_area = (x1_1 - x0_1) * (y1_1 - y0_1)

return overlap_area / box1_area > area_ratio_threshold


def _is_in(box1, box2) -> bool:
"""
box1是否完全在box2里面
Expand Down
18 changes: 8 additions & 10 deletions magic_pdf/libs/draw_bbox.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ def draw_bbox_with_number(i, bbox_list, page, rgb_config):
page.insert_text((x0, y0), str(j + 1), fontsize=10, color=new_rgb) # Insert the index at the top left corner of the rectangle


def draw_layout_bbox(pdf_info_dict, input_path, out_path):
def draw_layout_bbox(pdf_info_dict, pdf_bytes, out_path):
layout_bbox_list = []
dropped_bbox_list = []
for page in pdf_info_dict.values():
Expand All @@ -40,15 +40,14 @@ def draw_layout_bbox(pdf_info_dict, input_path, out_path):
for dropped_bbox in dropped_bboxes:
page_dropped_list.append(dropped_bbox)
dropped_bbox_list.append(page_dropped_list)

doc = fitz.open(input_path)
for i, page in enumerate(doc):
pdf_docs = fitz.open("pdf", pdf_bytes)
for i, page in enumerate(pdf_docs):
draw_bbox_with_number(i, layout_bbox_list, page, [255, 0, 0])
draw_bbox_without_number(i, dropped_bbox_list, page, [0, 255, 0])
# Save the PDF
doc.save(f"{out_path}/layout.pdf")
pdf_docs.save(f"{out_path}/layout.pdf")

def draw_text_bbox(pdf_info_dict, input_path, out_path):
def draw_text_bbox(pdf_info_dict, pdf_bytes, out_path):
text_list = []
inline_equation_list = []
interline_equation_list = []
Expand All @@ -68,13 +67,12 @@ def draw_text_bbox(pdf_info_dict, input_path, out_path):
text_list.append(page_text_list)
inline_equation_list.append(page_inline_equation_list)
interline_equation_list.append(page_interline_equation_list)

doc = fitz.open(input_path)
for i, page in enumerate(doc):
pdf_docs = fitz.open("pdf", pdf_bytes)
for i, page in enumerate(pdf_docs):
# 获取当前页面的数据
draw_bbox_without_number(i, text_list, page, [255, 0, 0])
draw_bbox_without_number(i, inline_equation_list, page, [0, 255, 0])
draw_bbox_without_number(i, interline_equation_list, page, [0, 0, 255])

# Save the PDF
doc.save(f"{out_path}/text.pdf")
pdf_docs.save(f"{out_path}/text.pdf")
Loading

0 comments on commit fffee0a

Please sign in to comment.