Skip to content

Commit fffee0a

Browse files
author
liusilu
committed
Merge branch 'master' of https://github.com/myhloli/Magic-PDF
2 parents e736062 + 7162deb commit fffee0a

19 files changed

+2350
-342
lines changed

demo/demo_test.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ def get_json_from_local_or_s3(book_name=None):
3434
s3_config = get_s3_config(json_path)
3535
file_content = read_file(json_path, s3_config)
3636
json_str = file_content.decode("utf-8")
37-
logger.info(json_str)
37+
# logger.info(json_str)
3838
json_object = json.loads(json_str)
3939
return json_object
4040

demo/ocr_demo.py

+57-40
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,16 @@
44
from loguru import logger
55
from pathlib import Path
66

7+
from app.common.s3 import get_s3_config
78
from demo.demo_test import get_json_from_local_or_s3
8-
from magic_pdf.dict2md.ocr_mkcontent import ocr_mk_mm_markdown_with_para, ocr_mk_nlp_markdown, ocr_mk_mm_markdown, ocr_mk_mm_standard_format
9+
from magic_pdf.dict2md.ocr_mkcontent import (
10+
ocr_mk_mm_markdown_with_para,
11+
ocr_mk_nlp_markdown,
12+
ocr_mk_mm_markdown,
13+
ocr_mk_mm_standard_format,
14+
ocr_mk_mm_markdown_with_para_and_pagination,
15+
make_standard_format_with_para
16+
)
917
from magic_pdf.libs.commons import join_path
1018
from magic_pdf.pdf_parse_by_ocr import parse_pdf_by_ocr
1119

@@ -35,50 +43,59 @@ def ocr_local_parse(ocr_pdf_path, ocr_json_file_path):
3543
ocr_pdf_model_info = read_json_file(ocr_json_file_path)
3644
pth = Path(ocr_json_file_path)
3745
book_name = pth.name
38-
save_tmp_path = os.path.join(os.path.dirname(__file__), "../..", "tmp", "unittest")
39-
save_path = join_path(save_tmp_path, "md")
40-
save_path_with_bookname = os.path.join(save_path, book_name)
41-
text_content_save_path = f"{save_path_with_bookname}/book.md"
42-
pdf_info_dict = parse_pdf_by_ocr(
43-
ocr_pdf_path,
44-
None,
45-
ocr_pdf_model_info,
46-
save_path,
47-
book_name,
48-
debug_mode=True)
49-
50-
parent_dir = os.path.dirname(text_content_save_path)
51-
if not os.path.exists(parent_dir):
52-
os.makedirs(parent_dir)
53-
54-
# markdown_content = mk_nlp_markdown(pdf_info_dict)
55-
markdown_content = ocr_mk_mm_markdown_with_para(pdf_info_dict)
56-
57-
with open(text_content_save_path, "w", encoding="utf-8") as f:
58-
f.write(markdown_content)
59-
60-
standard_format = ocr_mk_mm_standard_format(pdf_info_dict)
61-
standard_format_save_path = f"{save_path_with_bookname}/standard_format.txt"
62-
with open(standard_format_save_path, "w", encoding="utf-8") as f:
63-
f.write(str(standard_format))
64-
65-
# logger.info(markdown_content)
66-
# save_markdown(markdown_text, ocr_json_file_path)
46+
ocr_parse_core(book_name, ocr_pdf_path, ocr_pdf_model_info)
6747
except Exception as e:
6848
logger.exception(e)
6949

7050

7151
def ocr_online_parse(book_name, start_page_id=0, debug_mode=True):
    """Parse a book whose OCR record is looked up by name, then write its outputs.

    The JSON record for ``book_name`` is loaded with
    ``get_json_from_local_or_s3``. The PDF location is read from its
    ``"file_location"`` key and the model output from its
    ``"doc_layout_result"`` key; both are handed to ``ocr_parse_core``.

    Args:
        book_name: Name passed to ``get_json_from_local_or_s3`` and on to
            ``ocr_parse_core``.
        start_page_id: Forwarded to ``ocr_parse_core`` (previously it was
            accepted here but silently dropped).
        debug_mode: Kept for backward compatibility; not used by this function.

    Any exception is logged with its traceback and swallowed, so a failure
    never propagates to the caller.
    """
    try:
        json_object = get_json_from_local_or_s3(book_name)
        s3_pdf_path = json_object["file_location"]
        s3_config = get_s3_config(s3_pdf_path)
        ocr_pdf_model_info = json_object.get("doc_layout_result")
        ocr_parse_core(
            book_name,
            s3_pdf_path,
            ocr_pdf_model_info,
            start_page_id=start_page_id,
            s3_config=s3_config,
        )
    except Exception as e:
        logger.exception(e)
61+
62+
63+
def ocr_parse_core(book_name, ocr_pdf_path, ocr_pdf_model_info, start_page_id=0, s3_config=None):
    """Run the OCR parse for one book and write its markdown and standard-format files.

    Outputs go under ``<this file's dir>/../../tmp/unittest/md/<book_name>/``:
    ``book.md`` holds the markdown and ``standard_format.txt`` holds the
    standard-format content serialised as JSON.

    Args:
        book_name: Name of the book; used as the output sub-directory and
            passed to ``parse_pdf_by_ocr``.
        ocr_pdf_path: PDF location passed to ``parse_pdf_by_ocr``.
        ocr_pdf_model_info: Model output passed to ``parse_pdf_by_ocr``.
        start_page_id: Accepted for interface compatibility; not used by this
            function.
        s3_config: Passed to ``parse_pdf_by_ocr``; ``None`` by default.
    """
    save_tmp_path = os.path.join(os.path.dirname(__file__), "../..", "tmp", "unittest")
    save_path = join_path(save_tmp_path, "md")
    save_path_with_bookname = os.path.join(save_path, book_name)
    text_content_save_path = f"{save_path_with_bookname}/book.md"
    standard_format_save_path = f"{save_path_with_bookname}/standard_format.txt"

    pdf_info_dict = parse_pdf_by_ocr(
        ocr_pdf_path,
        s3_config,
        ocr_pdf_model_info,
        save_path,
        book_name,
        debug_mode=True)

    # exist_ok avoids the check-then-create race of os.path.exists + os.makedirs.
    os.makedirs(os.path.dirname(text_content_save_path), exist_ok=True)

    markdown_content = ocr_mk_mm_markdown_with_para(pdf_info_dict)
    with open(text_content_save_path, "w", encoding="utf-8") as f:
        f.write(markdown_content)

    standard_format = make_standard_format_with_para(pdf_info_dict)
    with open(standard_format_save_path, "w", encoding="utf-8") as f:
        # Dump standard_format as JSON text; ensure_ascii=False keeps non-ASCII
        # characters readable instead of \u-escaping them.
        f.write(json.dumps(standard_format, ensure_ascii=False))
92+
7493

7594
if __name__ == '__main__':
    # Local sample inputs; only needed when the ocr_local_parse call is enabled.
    pdf_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/j.1540-627x.2006.00176.x.pdf"
    json_file_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/j.1540-627x.2006.00176.x.json"
    # ocr_local_parse(pdf_path, json_file_path)

    # Online run: the record is looked up by book name.
    book_name = "科数网/edu_00011318"
    ocr_online_parse(book_name)

magic_pdf/dict2md/ocr_mkcontent.py

+89-16
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,19 @@
11
from magic_pdf.libs.commons import s3_image_save_path, join_path
22
from magic_pdf.libs.markdown_utils import ocr_escape_special_markdown_char
33
from magic_pdf.libs.ocr_content_type import ContentType
4+
import wordninja
5+
import re
6+
7+
8+
def split_long_words(text, max_word_len=15):
    """Break over-long word runs in ``text`` into space-separated pieces.

    Every maximal run of word characters longer than ``max_word_len`` is passed
    to ``wordninja.split`` and replaced by the returned pieces joined with
    single spaces. Everything else (shorter words, punctuation and all
    whitespace, including tabs and newlines) is left exactly as it was.

    Args:
        text: The string to process.
        max_word_len: Runs of up to this many characters are left untouched.

    Returns:
        The processed string.
    """
    def _split_run(match):
        word = match.group()
        if len(word) <= max_word_len:
            return word
        pieces = wordninja.split(word)
        # Keep the original run rather than deleting text when the splitter
        # yields nothing (presumably possible for runs without ASCII letters
        # or digits -- confirm against wordninja).
        return ' '.join(pieces) if pieces else word

    # Substituting in place (instead of tokenising and re-joining) is what
    # preserves non-space whitespace between tokens.
    return re.sub(r'\w+', _split_run, text)
417

518

619
def ocr_mk_nlp_markdown(pdf_info_dict: dict):
@@ -58,37 +71,96 @@ def ocr_mk_mm_markdown(pdf_info_dict: dict):
5871
def ocr_mk_mm_markdown_with_para(pdf_info_dict: dict):
    """Build one markdown string ("mm" mode) from every page's paragraph blocks.

    Args:
        pdf_info_dict: Mapping whose values are page-info dicts; each page's
            paragraphs are read from its ``"para_blocks"`` key.

    Returns:
        The rendered paragraphs of all pages joined with blank lines.
    """
    markdown = []
    for page_info in pdf_info_dict.values():
        paras_of_layout = page_info.get("para_blocks")
        # A page without paragraph blocks yields None or an empty value;
        # skip it instead of letting the core function iterate over None.
        if not paras_of_layout:
            continue
        markdown.extend(ocr_mk_mm_markdown_with_para_core(paras_of_layout, "mm"))
    return '\n\n'.join(markdown)
78+
79+
80+
def ocr_mk_nlp_markdown_with_para(pdf_info_dict: dict):
    """Build one markdown string ("nlp" mode) from every page's paragraph blocks.

    Args:
        pdf_info_dict: Mapping whose values are page-info dicts; each page's
            paragraphs are read from its ``"para_blocks"`` key.

    Returns:
        The rendered paragraphs of all pages joined with blank lines.
    """
    markdown = []
    for page_info in pdf_info_dict.values():
        paras_of_layout = page_info.get("para_blocks")
        # A page without paragraph blocks yields None or an empty value;
        # skip it instead of letting the core function iterate over None.
        if not paras_of_layout:
            continue
        markdown.extend(ocr_mk_mm_markdown_with_para_core(paras_of_layout, "nlp"))
    return '\n\n'.join(markdown)
87+
88+
def ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_dict: dict):
    """Render "mm"-mode markdown page by page.

    Args:
        pdf_info_dict: Mapping of page key to page-info dict; paragraphs are
            read from each page's ``"para_blocks"`` key.

    Returns:
        A list with one ``{'page_no': ..., 'md_content': ...}`` dict per page
        that has paragraph blocks. Pages without them are left out.
    """
    pages = []
    for page_no, page_info in pdf_info_dict.items():
        layout_paras = page_info.get("para_blocks")
        if layout_paras:
            md_content = '\n\n'.join(
                ocr_mk_mm_markdown_with_para_core(layout_paras, "mm")
            )
            pages.append({'page_no': page_no, 'md_content': md_content})
    return pages
100+
101+
102+
def ocr_mk_mm_markdown_with_para_core(paras_of_layout, mode):
    """Render the paragraphs of one page into a list of markdown strings.

    Args:
        paras_of_layout: Iterable of layouts. Each layout is an iterable of
            paragraphs, each paragraph an iterable of line dicts carrying a
            ``'spans'`` list.
        mode: ``'mm'`` emits image links for image and table spans; with any
            other value (e.g. ``'nlp'``) those spans produce no output.

    Returns:
        One string per non-empty paragraph, stripped and followed by a single
        trailing space.
    """
    rendered_paras = []
    for layout_paras in paras_of_layout:
        for para in layout_paras:
            pieces = []
            for line in para:
                for span in line['spans']:
                    kind = span.get('type')
                    if kind == ContentType.Text:
                        piece = ocr_escape_special_markdown_char(split_long_words(span['content']))
                    elif kind == ContentType.InlineEquation:
                        piece = f"${ocr_escape_special_markdown_char(span['content'])}$"
                    elif kind == ContentType.InterlineEquation:
                        piece = f"\n$$\n{ocr_escape_special_markdown_char(span['content'])}\n$$\n"
                    elif kind in [ContentType.Image, ContentType.Table] and mode == 'mm':
                        piece = f"\n![]({join_path(s3_image_save_path, span['image_path'])})\n"
                    else:
                        # Unknown span types, and image/table spans outside
                        # 'mm' mode, contribute nothing.
                        piece = ''
                    if piece != '':
                        pieces.append(piece + ' ')
            stripped = ''.join(pieces).strip()
            if stripped != '':
                rendered_paras.append(stripped + ' ')
    return rendered_paras
78129

79-
return '\n\n'.join(markdown)
80130

131+
def para_to_standard_format(para):
    """Convert one paragraph (a list of lines) into a standard-format dict.

    Args:
        para: List of line dicts, each carrying a ``'spans'`` list.

    Returns:
        ``{}`` for an empty paragraph. For a single-line paragraph, whatever
        ``line_to_standard_format`` returns for that line. Otherwise a dict
        with ``'type': 'text'``, the concatenated ``'text'`` and the
        ``'inline_equation_num'`` count.
    """
    if not para:
        return {}
    if len(para) == 1:
        return line_to_standard_format(para[0])

    parts = []
    inline_equation_num = 0
    for line in para:
        for span in line['spans']:
            span_type = span.get('type')
            if span_type == ContentType.Text:
                content = ocr_escape_special_markdown_char(split_long_words(span['content']))
            elif span_type == ContentType.InlineEquation:
                content = f"${ocr_escape_special_markdown_char(span['content'])}$"
                inline_equation_num += 1
            else:
                # Other span types are not rendered here. Skipping them avoids
                # re-appending the previous span's content, and avoids an
                # unbound variable when such a span comes first.
                continue
            parts.append(content + ' ')
    return {
        'type': 'text',
        'text': ''.join(parts),
        'inline_equation_num': inline_equation_num
    }
81153

82154
def make_standard_format_with_para(pdf_info_dict: dict):
    """Collect the standard-format entry of every paragraph on every page.

    Args:
        pdf_info_dict: Mapping whose values are page-info dicts; paragraphs
            are read from each page's ``"para_blocks"`` key.

    Returns:
        A flat list holding the result of ``para_to_standard_format`` for
        each paragraph, in page, layout and paragraph order. Pages without
        paragraph blocks contribute nothing.
    """
    content_list = []
    for page_info in pdf_info_dict.values():
        layouts = page_info.get("para_blocks")
        if layouts:
            for layout_paras in layouts:
                content_list.extend(
                    para_to_standard_format(para) for para in layout_paras
                )
    return content_list
93165

94166

@@ -125,7 +197,8 @@ def line_to_standard_format(line):
125197
line_text += f"${inline_equation}$"
126198
inline_equation_num += 1
127199
elif span['type'] == ContentType.Text:
128-
line_text += span['content']
200+
text_content = ocr_escape_special_markdown_char(span['content']) # 转义特殊符号
201+
line_text += text_content
129202
content = {
130203
'type': 'text',
131204
'text': line_text,

magic_pdf/libs/boxbase.py

+27
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,33 @@ def _is_in_or_part_overlap(box1, box2) -> bool:
1818
y1_1 < y0_2 or # box1在box2的上边
1919
y0_1 > y1_2) # box1在box2的下边
2020

21+
def _is_in_or_part_overlap_with_area_ratio(box1, box2, area_ratio_threshold=0.6):
22+
"""
23+
判断box1是否在box2里面,或者box1和box2有部分重叠,且重叠面积占box1的比例超过area_ratio_threshold
24+
25+
"""
26+
if box1 is None or box2 is None:
27+
return False
28+
29+
x0_1, y0_1, x1_1, y1_1 = box1
30+
x0_2, y0_2, x1_2, y1_2 = box2
31+
32+
if not _is_in_or_part_overlap(box1, box2):
33+
return False
34+
35+
# 计算重叠面积
36+
x_left = max(x0_1, x0_2)
37+
y_top = max(y0_1, y0_2)
38+
x_right = min(x1_1, x1_2)
39+
y_bottom = min(y1_1, y1_2)
40+
overlap_area = (x_right - x_left) * (y_bottom - y_top)
41+
42+
# 计算box1的面积
43+
box1_area = (x1_1 - x0_1) * (y1_1 - y0_1)
44+
45+
return overlap_area / box1_area > area_ratio_threshold
46+
47+
2148
def _is_in(box1, box2) -> bool:
2249
"""
2350
box1是否完全在box2里面

magic_pdf/libs/draw_bbox.py

+8-10
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ def draw_bbox_with_number(i, bbox_list, page, rgb_config):
2727
page.insert_text((x0, y0), str(j + 1), fontsize=10, color=new_rgb) # Insert the index at the top left corner of the rectangle
2828

2929

30-
def draw_layout_bbox(pdf_info_dict, input_path, out_path):
30+
def draw_layout_bbox(pdf_info_dict, pdf_bytes, out_path):
3131
layout_bbox_list = []
3232
dropped_bbox_list = []
3333
for page in pdf_info_dict.values():
@@ -40,15 +40,14 @@ def draw_layout_bbox(pdf_info_dict, input_path, out_path):
4040
for dropped_bbox in dropped_bboxes:
4141
page_dropped_list.append(dropped_bbox)
4242
dropped_bbox_list.append(page_dropped_list)
43-
44-
doc = fitz.open(input_path)
45-
for i, page in enumerate(doc):
43+
pdf_docs = fitz.open("pdf", pdf_bytes)
44+
for i, page in enumerate(pdf_docs):
4645
draw_bbox_with_number(i, layout_bbox_list, page, [255, 0, 0])
4746
draw_bbox_without_number(i, dropped_bbox_list, page, [0, 255, 0])
4847
# Save the PDF
49-
doc.save(f"{out_path}/layout.pdf")
48+
pdf_docs.save(f"{out_path}/layout.pdf")
5049

51-
def draw_text_bbox(pdf_info_dict, input_path, out_path):
50+
def draw_text_bbox(pdf_info_dict, pdf_bytes, out_path):
5251
text_list = []
5352
inline_equation_list = []
5453
interline_equation_list = []
@@ -68,13 +67,12 @@ def draw_text_bbox(pdf_info_dict, input_path, out_path):
6867
text_list.append(page_text_list)
6968
inline_equation_list.append(page_inline_equation_list)
7069
interline_equation_list.append(page_interline_equation_list)
71-
72-
doc = fitz.open(input_path)
73-
for i, page in enumerate(doc):
70+
pdf_docs = fitz.open("pdf", pdf_bytes)
71+
for i, page in enumerate(pdf_docs):
7472
# 获取当前页面的数据
7573
draw_bbox_without_number(i, text_list, page, [255, 0, 0])
7674
draw_bbox_without_number(i, inline_equation_list, page, [0, 255, 0])
7775
draw_bbox_without_number(i, interline_equation_list, page, [0, 0, 255])
7876

7977
# Save the PDF
80-
doc.save(f"{out_path}/text.pdf")
78+
pdf_docs.save(f"{out_path}/text.pdf")

0 commit comments

Comments
 (0)