Skip to content

Commit 701f384

Browse files
committed
增加ocr版本解析功能
1 parent 2e487ca commit 701f384

File tree

5 files changed

+214
-1
lines changed

5 files changed

+214
-1
lines changed

demo/ocr_demo.py

+29
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
import os
2+
3+
from loguru import logger
4+
5+
from magic_pdf.dict2md.ocr_mkcontent import mk_nlp_markdown
6+
from magic_pdf.pdf_parse_by_ocr import parse_pdf_by_ocr
7+
8+
9+
def save_markdown(markdown_text, input_filepath):
    """Write Markdown text next to the input file, using a ``.md`` extension.

    The output file lives in the same directory as ``input_filepath`` and
    shares its base name, with the extension replaced by ``.md``.

    Args:
        markdown_text: Markdown content to write.
        input_filepath: Path of the file the Markdown was derived from.

    Returns:
        The path of the Markdown file that was written.
    """
    # Directory that holds the input file
    directory = os.path.dirname(input_filepath)
    # Input file name without its extension
    base_name = os.path.basename(input_filepath)
    file_name_without_ext = os.path.splitext(base_name)[0]
    # Output path: same directory, same stem, ".md" extension
    output_filepath = os.path.join(directory, f"{file_name_without_ext}.md")

    # Write the Markdown text to the .md file
    with open(output_filepath, 'w', encoding='utf-8') as file:
        file.write(markdown_text)

    # Let callers know where the result ended up
    return output_filepath
21+
22+
23+
if __name__ == '__main__':
    import sys

    # Demo entry point: parse an OCR result JSON, render it to Markdown,
    # log the result and save it next to the input file.
    # The input path can be passed as the first command-line argument;
    # without one, the original sample path is used.
    default_ocr_json_file_path = r"D:\project\20231108code-clean\ocr\new\demo_4\ocr_0.json"
    ocr_json_file_path = sys.argv[1] if len(sys.argv) > 1 else default_ocr_json_file_path

    pdf_info_dict = parse_pdf_by_ocr(ocr_json_file_path)
    markdown_text = mk_nlp_markdown(pdf_info_dict)
    logger.info(markdown_text)
    save_markdown(markdown_text, ocr_json_file_path)
29+

magic_pdf/dict2md/ocr_mkcontent.py

+21
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
def mk_nlp_markdown(pdf_info_dict: dict):
    """Render parsed page info into a single Markdown string.

    Every line of every block under a page's ``"preproc_blocks"`` becomes one
    Markdown line. Spans within a line are joined with single spaces; spans
    of type ``inline_equation`` are wrapped in ``$...$`` and spans of type
    ``displayed_equation`` in ``$$`` fences. Pages without blocks are skipped.

    Args:
        pdf_info_dict: Mapping of page key to page info. Each page info is
            read via ``page_info.get("preproc_blocks")``; blocks must provide
            ``block['lines']``, lines ``line['spans']`` and spans the keys
            ``'content'`` and ``'type'``.

    Returns:
        The Markdown text, one rendered line per input line, joined by
        newlines.
    """
    markdown = []

    for page_info in pdf_info_dict.values():
        blocks = page_info.get("preproc_blocks")
        if not blocks:
            continue
        for block in blocks:
            for line in block['lines']:
                pieces = []
                for span in line['spans']:
                    # Escape literal "$" so it is not read as a math delimiter
                    content = span['content'].replace('$', '\\$')
                    span_type = span['type']
                    if span_type == 'inline_equation':
                        content = f"${content}$"
                    elif span_type == 'displayed_equation':
                        content = f"$$\n{content}\n$$"
                    pieces.append(content)
                # Two trailing spaces force a hard line break in Markdown
                markdown.append(' '.join(pieces).strip() + '  ')

    return '\n'.join(markdown)

magic_pdf/libs/boxbase.py

+33-1
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,20 @@ def __overlap_y(Ay1, Ay2, By1, By2):
119119
return x0_1<=x0_2<=x1_1 and vertical_overlap_cond
120120

121121

122+
def __is_overlaps_y_exceeds_threshold(bbox1, bbox2, overlap_ratio_threshold=0.8):
123+
"""检查两个bbox在y轴上是否有重叠,并且该重叠区域的高度占两个bbox高度更低的那个超过80%"""
124+
_, y0_1, _, y1_1 = bbox1
125+
_, y0_2, _, y1_2 = bbox2
126+
127+
overlap = max(0, min(y1_1, y1_2) - max(y0_1, y0_2))
128+
height1, height2 = y1_1 - y0_1, y1_2 - y0_2
129+
max_height = max(height1, height2)
130+
min_height = min(height1, height2)
131+
132+
return (overlap / min_height) > overlap_ratio_threshold
133+
134+
135+
122136
def calculate_iou(bbox1, bbox2):
123137
# Determine the coordinates of the intersection rectangle
124138
x_left = max(bbox1[0], bbox2[0])
@@ -163,7 +177,25 @@ def calculate_overlap_area_2_minbox_area_ratio(bbox1, bbox2):
163177
else:
164178
return intersection_area / min_box_area
165179

166-
180+
181+
def get_minbox_if_overlap_by_ratio(bbox1, bbox2, ratio):
    """Return the smaller of two bboxes when they overlap strongly enough.

    The overlap measure comes from
    ``calculate_overlap_area_2_minbox_area_ratio``. When it is greater than
    ``ratio``, the bbox with the strictly smaller area is returned; in every
    other case (not enough overlap, or equal areas) the result is None.
    """
    left1, top1, right1, bottom1 = bbox1
    left2, top2, right2, bottom2 = bbox2
    first_area = (right1 - left1) * (bottom1 - top1)
    second_area = (right2 - left2) * (bottom2 - top2)

    overlap_ratio = calculate_overlap_area_2_minbox_area_ratio(bbox1, bbox2)
    if not overlap_ratio > ratio:
        return None

    # Enough overlap: hand back whichever box is strictly smaller
    if first_area < second_area:
        return bbox1
    if second_area < first_area:
        return bbox2
    return None
198+
167199
def get_bbox_in_boundry(bboxes:list, boundry:tuple)-> list:
168200
x0, y0, x1, y1 = boundry
169201
new_boxes = [box for box in bboxes if box[0] >= x0 and box[1] >= y0 and box[2] <= x1 and box[3] <= y1]

magic_pdf/libs/ocr_dict_merge.py

+46
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
from magic_pdf.libs.boxbase import __is_overlaps_y_exceeds_threshold
2+
3+
4+
def merge_spans(spans):
    """Group spans into text lines based on their vertical overlap.

    Spans are sorted by ``bbox[1]`` (y0) and walked in order. A span joins
    the current line when ``__is_overlaps_y_exceeds_threshold`` reports enough
    y overlap with the line's last span. A ``displayed_equation`` span always
    sits on a line of its own: it starts a new line, and so does whatever
    span follows it.

    Note: ``spans`` is sorted in place.

    Args:
        spans: List of span dicts, each with a ``'bbox'`` of
            ``[x0, y0, x1, y1]`` and a ``'type'``.

    Returns:
        A list of line dicts ``{"bbox": [x0, y0, x1, y1], "spans": [...]}``.
        Each line's spans are sorted by x0 and its bbox encloses all of
        them. An empty input yields an empty list.
    """
    # Nothing to merge (e.g. a page without text or formula detections)
    if not spans:
        return []

    # Sort by y0 so that spans are visited top to bottom
    spans.sort(key=lambda span: span['bbox'][1])

    lines = []
    current_line = [spans[0]]
    for span in spans[1:]:
        # A new line starts when the span is a displayed equation, when the
        # current line already holds one, or when the span does not overlap
        # the last span of the current line on the y axis.
        starts_new_line = (
            span['type'] == "displayed_equation"
            or any(s['type'] == "displayed_equation" for s in current_line)
            or not __is_overlaps_y_exceeds_threshold(span['bbox'], current_line[-1]['bbox'])
        )
        if starts_new_line:
            lines.append(current_line)
            current_line = [span]
        else:
            current_line.append(span)

    # Flush the last line
    lines.append(current_line)

    # Sort each line's spans by x0 and compute the enclosing bbox
    line_objects = []
    for line in lines:
        line.sort(key=lambda span: span['bbox'][0])
        line_bbox = [
            min(span['bbox'][0] for span in line),  # x0
            min(span['bbox'][1] for span in line),  # y0
            max(span['bbox'][2] for span in line),  # x1
            max(span['bbox'][3] for span in line),  # y1
        ]
        line_objects.append({
            "bbox": line_bbox,
            "spans": line,
        })

    return line_objects

magic_pdf/pdf_parse_by_ocr.py

+85
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
import json
2+
3+
from magic_pdf.libs.boxbase import get_minbox_if_overlap_by_ratio
4+
from magic_pdf.libs.ocr_dict_merge import merge_spans
5+
6+
7+
def read_json_file(file_path):
    """Load and return the JSON document stored at ``file_path``.

    The file is read as UTF-8 so that the result does not depend on the
    platform's locale encoding.

    Args:
        file_path: Path to the JSON file.

    Returns:
        The deserialized JSON content.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    return data
11+
12+
13+
def construct_page_component(page_id, text_blocks_preproc):
    """Bundle a page's pre-processed blocks together with its page index.

    Returns a dict with the blocks under ``'preproc_blocks'`` and the page id
    under ``'page_idx'``.
    """
    return dict(preproc_blocks=text_blocks_preproc, page_idx=page_id)
19+
20+
21+
# Allowed layout categories, mapped to (field holding the content, span type):
#   13: 'embedding' - embedded (inline) formula
#   14: 'isolated'  - stand-alone (display) formula
#   15: 'ocr_text'  - OCR-recognized text
_CATEGORY_SPAN_FIELDS = {
    13: ('latex', 'inline_equation'),
    14: ('latex', 'displayed_equation'),
    15: ('text', 'text'),
}


def _spans_from_layout_dets(layout_dets):
    """Convert the allowed layout detections of one page into span dicts."""
    spans = []
    for layout_det in layout_dets:
        span_fields = _CATEGORY_SPAN_FIELDS.get(layout_det['category_id'])
        if span_fields is None:
            # Not a formula/text detection: ignore it
            continue
        content_key, span_type = span_fields
        # 'poly' holds 8 numbers; positions 0/1 and 4/5 are used as the
        # (x0, y0) and (x1, y1) corners of the bbox.
        x0, y0, _, _, x1, y1, _, _ = layout_det['poly']
        spans.append({
            'bbox': [int(x0), int(y0), int(x1), int(y1)],
            'content': layout_det[content_key],
            'type': span_type,
        })
    return spans


def _drop_overlapped_spans(spans, ratio=0.8):
    """Return spans minus every span that is the smaller one of an overlapping pair."""
    dropped = set()
    for i, first in enumerate(spans):
        for j in range(i + 1, len(spans)):
            second = spans[j]
            min_bbox = get_minbox_if_overlap_by_ratio(first['bbox'], second['bbox'], ratio)
            if min_bbox is None:
                continue
            # A non-None result means the areas differ, so the two bboxes
            # differ as well and this comparison identifies the smaller span.
            dropped.add(i if min_bbox == first['bbox'] else j)
    return [span for idx, span in enumerate(spans) if idx not in dropped]


def parse_pdf_by_ocr(
    ocr_json_file_path,
    start_page_id=0,
    end_page_id=None,
):
    """Build the per-page info dict from an OCR result JSON file.

    For every page in the requested range, the allowed layout detections
    (categories 13, 14 and 15) are turned into spans, spans that are the
    smaller member of a strongly overlapping pair are removed, and the
    remaining spans are merged into lines. Blocks are not assembled yet:
    each line is wrapped in a block of its own that reuses the line's bbox.

    Args:
        ocr_json_file_path: Path to the OCR result JSON. The loaded document
            is indexed by page id, and each page must provide
            ``'layout_dets'``.
        start_page_id: Index of the first page to parse (inclusive).
        end_page_id: Index of the last page to parse (inclusive). ``None``
            means the last page of the document.

    Returns:
        A dict mapping ``"page_<page_id>"`` to the page component produced by
        ``construct_page_component``.
    """
    ocr_pdf_info = read_json_file(ocr_json_file_path)

    # Compare against None explicitly: 0 is a valid last page index and must
    # not be mistaken for "not given".
    if end_page_id is None:
        end_page_id = len(ocr_pdf_info) - 1

    pdf_info_dict = {}
    for page_id in range(start_page_id, end_page_id + 1):
        layout_dets = ocr_pdf_info[page_id]['layout_dets']

        # Build spans, then remove the overlapped ones
        spans = _drop_overlapped_spans(_spans_from_layout_dets(layout_dets))

        # Merge spans into lines; a page without usable spans has no lines
        lines = merge_spans(spans) if spans else []

        # No block merging yet: one block per line, sharing the line's bbox
        blocks = [{"bbox": line['bbox'], "lines": [line]} for line in lines]

        # Assemble the page entry
        pdf_info_dict[f"page_{page_id}"] = construct_page_component(page_id, blocks)

    return pdf_info_dict
85+

0 commit comments

Comments
 (0)