Skip to content

Commit fcea39d

Browse files
committed
增加ocr模式的layout解析功能
1 parent 00f3e32 commit fcea39d

File tree

4 files changed

+140
-15
lines changed

4 files changed

+140
-15
lines changed

demo/ocr_demo.py

+9-7
Original file line numberDiff line numberDiff line change
@@ -28,10 +28,12 @@ def read_json_file(file_path):
2828

2929

3030
if __name__ == '__main__':
    # Demo entry point: read one OCR result JSON, convert it to the
    # intermediate pdf_info_dict, render markdown, log it and save it.
    ocr_json_file_path = r"D:\project\20231108code-clean\ocr\new\demo_4\ocr_1(3).json"
    try:
        ocr_pdf_info = read_json_file(ocr_json_file_path)
        pdf_info_dict = parse_pdf_by_ocr(ocr_pdf_info)
        markdown_text = mk_nlp_markdown(pdf_info_dict)
        logger.info(markdown_text)
        save_markdown(markdown_text, ocr_json_file_path)
    except Exception as e:
        # Top-level boundary of the demo script: keep the run from crashing,
        # but record the full traceback instead of only the message so the
        # failing pipeline stage can be located.
        # NOTE(review): assumes `logger` exposes `.exception` (true for both
        # loguru and the stdlib logging.Logger) - confirm against the file's imports.
        logger.exception(e)

magic_pdf/pdf_parse_by_ocr.py

+8-8
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,12 @@
1-
from loguru import logger
1+
from magic_pdf.pre_proc.ocr_detect_layout import layout_detect
2+
from magic_pdf.pre_proc.ocr_dict_merge import merge_spans_to_line, remove_overlaps_min_spans
23

3-
from magic_pdf.libs.ocr_dict_merge import merge_spans_to_line, remove_overlaps_min_spans
44

5-
6-
def construct_page_component(page_id, blocks):
5+
def construct_page_component(page_id, blocks, layout_bboxes):
    """
    Bundle the per-page parsing results into a single dict.

    Args:
        page_id: Identifier of the page; stored under 'page_idx'.
        blocks: Pre-processed blocks of the page; stored under 'preproc_blocks'.
        layout_bboxes: Layout boxes of the page; stored under 'layout_bboxes'.

    Returns:
        dict: The three arguments keyed as described above (stored by reference, not copied).
    """
    return {
        'preproc_blocks': blocks,
        'page_idx': page_id,
        'layout_bboxes': layout_bboxes,
    }
1212

@@ -74,9 +74,6 @@ def parse_pdf_by_ocr(
7474
lines = merge_spans_to_line(spans)
7575
# logger.info(lines)
7676

77-
# 从ocr_page_info中获取layout信息
78-
79-
8077
# 目前不做block拼接,先做个结构,每个block中只有一个line,block的bbox就是line的bbox
8178
blocks = []
8279
for line in lines:
@@ -85,8 +82,11 @@ def parse_pdf_by_ocr(
8582
"lines": [line],
8683
})
8784

85+
# 从ocr_page_info中解析layout信息(按自然阅读方向排序,并修复重叠和交错的bad case)
86+
layout_bboxes = layout_detect(ocr_page_info['subfield_dets'])
87+
8888
# 构造pdf_info_dict
89-
page_info = construct_page_component(page_id, blocks)
89+
page_info = construct_page_component(page_id, blocks, layout_bboxes)
9090
pdf_info_dict[f"page_{page_id}"] = page_info
9191

9292
return pdf_info_dict
magic_pdf/pre_proc/ocr_detect_layout.py

+123
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,123 @@
1+
from magic_pdf.libs.boxbase import _is_part_overlap, _is_in
2+
3+
def get_center_point(bbox):
    """
    Compute the center point of a bounding box.

    Args:
        bbox (list): Box coordinates with four elements: top-left x, top-left y, bottom-right x, bottom-right y.

    Returns:
        list: The center point as two elements, the x coordinate followed by the y coordinate.
    """
    left, top, right, bottom = bbox[0], bbox[1], bbox[2], bbox[3]
    center_x = (left + right) / 2
    center_y = (top + bottom) / 2
    return [center_x, center_y]
12+
13+
14+
def get_area(bbox):
    """
    Compute the area of a bounding box.

    Args:
        bbox (list): Box coordinates with four elements: top-left x, top-left y, bottom-right x, bottom-right y.

    Returns:
        float: Width times height of the box. The value is not clamped, so a box whose
        corners are swapped on exactly one axis yields a negative number.
    """
    width = bbox[2] - bbox[0]
    height = bbox[3] - bbox[1]
    return width * height
23+
24+
25+
def _trim_larger_box(larger_box, smaller_box, low, high):
    """
    Shrink larger_box along one axis so that it stops at smaller_box.

    Args:
        larger_box (list): [x0, y0, x1, y1] of the box to shrink; modified in place.
        smaller_box (list): [x0, y0, x1, y1] of the box that is left untouched.
        low (int): Index of the axis' start coordinate (0 for x, 1 for y).
        high (int): Index of the axis' end coordinate (2 for x, 3 for y).
    """
    # Comparing the coordinate sums is the same as comparing the centers.
    larger_mid = larger_box[low] + larger_box[high]
    smaller_mid = smaller_box[low] + smaller_box[high]
    if larger_mid > smaller_mid:
        # The larger box lies after the smaller one on this axis: move its
        # start edge onto the smaller box's end edge. Skip the move when it
        # would leave the larger box with zero or negative extent.
        if smaller_box[high] < larger_box[high]:
            larger_box[low] = smaller_box[high]
    elif smaller_box[low] > larger_box[low]:
        # The larger box lies before the smaller one: move its end edge onto
        # the smaller box's start edge (same validity guard as above).
        larger_box[high] = smaller_box[low]


def adjust_layouts(layout_bboxes):
    """
    Resolve partial overlaps between layout boxes by shrinking the larger box of each overlapping pair.

    For every pair that `_is_part_overlap` reports as overlapping, the box with the larger
    area is trimmed. The trim direction is horizontal when the centers are further apart
    in x than in y, and vertical otherwise. The edge that faces the smaller box is the one
    that moves, so a box is never turned inside out; when no valid trim exists on the
    chosen axis the pair is left as it is.

    Args:
        layout_bboxes (list): Dicts with a 'layout_bbox' entry holding [x0, y0, x1, y1].
            The coordinate lists are modified in place.

    Returns:
        list: The same list object that was passed in. The order of the entries is not changed.
    """
    for i, first in enumerate(layout_bboxes):
        # Only pairs (i, j) with j > i are examined, so each pair is visited once.
        for second in layout_bboxes[i + 1:]:
            box_i = first["layout_bbox"]
            box_j = second["layout_bbox"]

            # Hand the coordinate lists to the overlap test, not the wrapping
            # dicts - the same way `_is_in` is called elsewhere in this module.
            if not _is_part_overlap(box_i, box_j):
                continue

            center_i = get_center_point(box_i)
            center_j = get_center_point(box_j)

            # Distance between the centers along each axis.
            dx = abs(center_i[0] - center_j[0])
            dy = abs(center_i[1] - center_j[1])

            # On equal areas the second box of the pair is treated as the larger one.
            if get_area(box_i) > get_area(box_j):
                larger_box, smaller_box = box_i, box_j
            else:
                larger_box, smaller_box = box_j, box_i

            if dx > dy:
                # Boxes sit side by side: trim along x.
                _trim_larger_box(larger_box, smaller_box, 0, 2)
            else:
                # Boxes sit one above the other: trim along y.
                _trim_larger_box(larger_box, smaller_box, 1, 3)

    return layout_bboxes
63+
64+
65+
66+
67+
def layout_detect(layout_info):
    """
    Turn raw sub-layout detections into layout boxes, drop nested boxes and adjust overlaps.

    Args:
        layout_info (list): Sub-layout entries. Each entry is a dict whose 'poly' value holds
            exactly eight numbers; the 1st and 2nd are used as x0, y0 and the 5th and 6th as x1, y1.

    Returns:
        list: Dicts with a single 'layout_bbox' entry ([x0, y0, x1, y1]). Boxes for which
        `_is_in` reports containment in any other box are left out, and the remaining boxes
        are passed through `adjust_layouts`, whose result is returned. This function itself
        does not sort the boxes.
    """
    # Step 1: convert every polygon into a {'layout_bbox': [...]} dict.
    candidates = []
    for sub_layout in layout_info:
        x0, y0, _, _, x1, y1, _, _ = sub_layout['poly']
        candidates.append({"layout_bbox": [x0, y0, x1, y1]})

    # Step 2: keep only boxes that are not contained in some other box.
    # NOTE(review): if `_is_in` treats two identical boxes as containing each
    # other, every copy of a duplicated box is dropped here - confirm.
    survivors = [
        candidate
        for idx, candidate in enumerate(candidates)
        if not any(
            _is_in(candidate["layout_bbox"], other["layout_bbox"])
            for other_idx, other in enumerate(candidates)
            if other_idx != idx
        )
    ]

    # Step 3: let adjust_layouts fix the remaining overlaps and hand back its result.
    return adjust_layouts(survivors)
122+
123+

0 commit comments

Comments
 (0)