Skip to content

Commit 26c2378

Browse files
committed
ocr模式下content type 抽象
1 parent b6f051d commit 26c2378

File tree

7 files changed

+64
-48
lines changed

7 files changed

+64
-48
lines changed

magic_pdf/dict2md/ocr_mkcontent.py

+7-4
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,6 @@
1+
from magic_pdf.libs.ocr_content_type import ContentType
2+
3+
14
def mk_nlp_markdown(pdf_info_dict: dict):
25
markdown = []
36

@@ -12,9 +15,9 @@ def mk_nlp_markdown(pdf_info_dict: dict):
1215
if not span.get('content'):
1316
continue
1417
content = span['content'].replace('$', '\$') # 转义$
15-
if span['type'] == 'inline_equation':
18+
if span['type'] == ContentType.InlineEquation:
1619
content = f"${content}$"
17-
elif span['type'] == 'displayed_equation':
20+
elif span['type'] == ContentType.InterlineEquation:
1821
content = f"$$\n{content}\n$$"
1922
line_text += content + ' '
2023
# 在行末添加两个空格以强制换行
@@ -41,9 +44,9 @@ def mk_mm_markdown(pdf_info_dict: dict):
4144
content = f"![]({span['image_path']})"
4245
else:
4346
content = span['content'].replace('$', '\$') # 转义$
44-
if span['type'] == 'inline_equation':
47+
if span['type'] == ContentType.InlineEquation:
4548
content = f"${content}$"
46-
elif span['type'] == 'displayed_equation':
49+
elif span['type'] == ContentType.InterlineEquation:
4750
content = f"$$\n{content}\n$$"
4851
line_text += content + ' '
4952
# 在行末添加两个空格以强制换行

magic_pdf/libs/draw_bbox.py

+10-8
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
11
from magic_pdf.libs.commons import fitz # PyMuPDF
2+
from magic_pdf.libs.ocr_content_type import ContentType
3+
24

35
def draw_bbox_without_number(i, bbox_list, page, rgb_config):
46
new_rgb = []
@@ -49,30 +51,30 @@ def draw_layout_bbox(pdf_info_dict, input_path, out_path):
4951
def draw_text_bbox(pdf_info_dict, input_path, out_path):
5052
text_list = []
5153
inline_equation_list = []
52-
displayed_equation_list = []
54+
interline_equation_list = []
5355
for page in pdf_info_dict.values():
5456
page_text_list = []
5557
page_inline_equation_list = []
56-
page_displayed_equation_list = []
58+
page_interline_equation_list = []
5759
for block in page['preproc_blocks']:
5860
for line in block['lines']:
5961
for span in line['spans']:
60-
if span['type'] == 'text':
62+
if span['type'] == ContentType.Text:
6163
page_text_list.append(span['bbox'])
62-
elif span['type'] == 'inline_equation':
64+
elif span['type'] == ContentType.InlineEquation:
6365
page_inline_equation_list.append(span['bbox'])
64-
elif span['type'] == 'displayed_equation':
65-
page_displayed_equation_list.append(span['bbox'])
66+
elif span['type'] == ContentType.InterlineEquation:
67+
page_interline_equation_list.append(span['bbox'])
6668
text_list.append(page_text_list)
6769
inline_equation_list.append(page_inline_equation_list)
68-
displayed_equation_list.append(page_displayed_equation_list)
70+
interline_equation_list.append(page_interline_equation_list)
6971

7072
doc = fitz.open(input_path)
7173
for i, page in enumerate(doc):
7274
# 获取当前页面的数据
7375
draw_bbox_without_number(i, text_list, page, [255, 0, 0])
7476
draw_bbox_without_number(i, inline_equation_list, page, [0, 255, 0])
75-
draw_bbox_without_number(i, displayed_equation_list, page, [0, 0, 255])
77+
draw_bbox_without_number(i, interline_equation_list, page, [0, 0, 255])
7678

7779
# Save the PDF
7880
doc.save(f"{out_path}/text.pdf")

magic_pdf/libs/ocr_content_type.py

+7
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
class ContentType:
2+
Image = "image"
3+
Table = "table"
4+
Text = "text"
5+
InlineEquation = "inline_equation"
6+
InterlineEquation = "interline_equation"
7+

magic_pdf/pdf_parse_by_ocr.py

+12-11
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
get_docx_model_output,
1515
)
1616
from magic_pdf.libs.coordinate_transform import get_scale_ratio
17+
from magic_pdf.libs.ocr_content_type import ContentType
1718
from magic_pdf.libs.safe_filename import sanitize_filename
1819
from magic_pdf.pre_proc.detect_footer_by_model import parse_footers
1920
from magic_pdf.pre_proc.detect_footnote import parse_footnotes_by_model
@@ -44,10 +45,10 @@ def construct_page_component(blocks, layout_bboxes, page_id, page_w, page_h, lay
4445
'tables': tables,
4546
'interline_equations': interline_equations,
4647
'inline_equations': inline_equations,
47-
'dropped_text_block': dropped_text_block,
48-
'dropped_image_block': dropped_image_block,
49-
'dropped_table_block': dropped_table_block,
50-
'dropped_bboxes': need_remove_spans_bboxes_dict,
48+
'droped_text_block': dropped_text_block,
49+
'droped_image_block': dropped_image_block,
50+
'droped_table_block': dropped_table_block,
51+
'droped_bboxes': need_remove_spans_bboxes_dict,
5152
}
5253
return return_dict
5354

@@ -164,7 +165,7 @@ def parse_pdf_by_ocr(
164165
# 1: 'image', # 图片
165166
# 7: 'table', # 表格
166167
# 13: 'inline_equation', # 行内公式
167-
# 14: 'displayed_equation', # 行间公式
168+
# 14: 'interline_equation', # 行间公式
168169
# 15: 'text', # ocr识别文本
169170
"""layout信息"""
170171
# 11: 'full column', # 单栏
@@ -173,20 +174,20 @@ def parse_pdf_by_ocr(
173174
"bbox": bbox,
174175
}
175176
if category_id == 1:
176-
span["type"] = "image"
177+
span["type"] = ContentType.Image
177178

178179
elif category_id == 7:
179-
span["type"] = "table"
180+
span["type"] = ContentType.Table
180181

181182
elif category_id == 13:
182183
span["content"] = layout_det["latex"]
183-
span["type"] = "inline_equation"
184+
span["type"] = ContentType.InlineEquation
184185
elif category_id == 14:
185186
span["content"] = layout_det["latex"]
186-
span["type"] = "displayed_equation"
187+
span["type"] = ContentType.InterlineEquation
187188
elif category_id == 15:
188189
span["content"] = layout_det["text"]
189-
span["type"] = "text"
190+
span["type"] = ContentType.Text
190191
# print(span)
191192
spans.append(span)
192193
else:
@@ -213,7 +214,7 @@ def parse_pdf_by_ocr(
213214
# bbox去除粘连
214215
spans = remove_overlap_between_bbox(spans)
215216

216-
# 对tpye=["displayed_equation", "image", "table"]进行额外处理,如果左边有字的话,将该span的bbox中y0调整至不高于文字的y0
217+
# 对tpye=["interline_equation", "image", "table"]进行额外处理,如果左边有字的话,将该span的bbox中y0调整至不高于文字的y0
217218
spans = adjust_bbox_for_standalone_block(spans)
218219

219220

magic_pdf/pre_proc/ocr_cut_image.py

+3-2
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
from magic_pdf.libs.commons import join_path
2+
from magic_pdf.libs.ocr_content_type import ContentType
23
from magic_pdf.libs.pdf_image_tools import cut_image
34

45

@@ -11,9 +12,9 @@ def img_save_path(type):
1112

1213
for span in spans:
1314
span_type = span['type']
14-
if span_type == 'image':
15+
if span_type == ContentType.Image:
1516
span['image_path'] = cut_image(span['bbox'], page_id, page, img_save_path('images'))
16-
elif span_type == 'table':
17+
elif span_type == ContentType.Table:
1718
span['image_path'] = cut_image(span['bbox'], page_id, page, img_save_path('tables'))
1819

1920
return spans

magic_pdf/pre_proc/ocr_dict_merge.py

+4-3
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
from magic_pdf.libs.boxbase import __is_overlaps_y_exceeds_threshold, get_minbox_if_overlap_by_ratio, \
44
calculate_overlap_area_in_bbox1_area_ratio
5+
from magic_pdf.libs.ocr_content_type import ContentType
56

67

78
# 将每一个line中的span从左到右排序
@@ -29,10 +30,10 @@ def merge_spans_to_line(spans):
2930
lines = []
3031
current_line = [spans[0]]
3132
for span in spans[1:]:
32-
# 如果当前的span类型为"displayed_equation" 或者 当前行中已经有"displayed_equation"
33+
# 如果当前的span类型为"interline_equation" 或者 当前行中已经有"interline_equation"
3334
# image和table类型,同上
34-
if span['type'] in ["displayed_equation", "image", "table"] or any(
35-
s['type'] in ["displayed_equation", "image", "table"] for s in current_line):
35+
if span['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table] or any(
36+
s['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table] for s in current_line):
3637
# 则开始新行
3738
lines.append(current_line)
3839
current_line = [span]

magic_pdf/pre_proc/ocr_span_list_modify.py

+21-20
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
from magic_pdf.libs.boxbase import calculate_overlap_area_in_bbox1_area_ratio, get_minbox_if_overlap_by_ratio, \
44
__is_overlaps_y_exceeds_threshold
5+
from magic_pdf.libs.ocr_content_type import ContentType
56

67

78
def remove_overlaps_min_spans(spans):
@@ -49,22 +50,22 @@ def remove_spans_by_bboxes_dict(spans, need_remove_spans_bboxes_dict):
4950
for span in need_remove_spans:
5051
spans.remove(span)
5152
span['tag'] = drop_tag
52-
if span['type'] in ['text', 'inline_equation', 'displayed_equation']:
53+
if span['type'] in [ContentType.Text, ContentType.InlineEquation, ContentType.InterlineEquation]:
5354
dropped_text_block.append(span)
54-
elif span['type'] == 'image':
55+
elif span['type'] == ContentType.Image:
5556
dropped_image_block.append(span)
56-
elif span['type'] == 'table':
57+
elif span['type'] == ContentType.Table:
5758
dropped_table_block.append(span)
5859

5960
return spans, dropped_text_block, dropped_image_block, dropped_table_block
6061

6162

6263
def adjust_bbox_for_standalone_block(spans):
63-
# 对tpye=["displayed_equation", "image", "table"]进行额外处理,如果左边有字的话,将该span的bbox中y0调整至不高于文字的y0
64+
# 对tpye=["interline_equation", "image", "table"]进行额外处理,如果左边有字的话,将该span的bbox中y0调整至不高于文字的y0
6465
for sb_span in spans:
65-
if sb_span['type'] in ["displayed_equation", "image", "table"]:
66+
if sb_span['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table]:
6667
for text_span in spans:
67-
if text_span['type'] in ['text', 'inline_equation']:
68+
if text_span['type'] in [ContentType.Text, ContentType.InlineEquation]:
6869
# 判断span2的纵向高度是否被span所覆盖
6970
if sb_span['bbox'][1] < text_span['bbox'][1] and sb_span['bbox'][3] > text_span['bbox'][3]:
7071
# 判断span2是否在span左边
@@ -81,7 +82,7 @@ def modify_y_axis(spans: list, displayed_list: list, text_inline_lines: list):
8182

8283
lines = []
8384
current_line = [spans[0]]
84-
if spans[0]["type"] in ["displayed_equation", "image", "table"]:
85+
if spans[0]["type"] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table]:
8586
displayed_list.append(spans[0])
8687

8788
line_first_y0 = spans[0]["bbox"][1]
@@ -91,16 +92,16 @@ def modify_y_axis(spans: list, displayed_list: list, text_inline_lines: list):
9192
for span in spans[1:]:
9293
# if span.get("content","") == "78.":
9394
# print("debug")
94-
# 如果当前的span类型为"displayed_equation" 或者 当前行中已经有"displayed_equation"
95+
# 如果当前的span类型为"interline_equation" 或者 当前行中已经有"interline_equation"
9596
# image和table类型,同上
96-
if span['type'] in ["displayed_equation", "image", "table"] or any(
97-
s['type'] in ["displayed_equation", "image", "table"] for s in current_line):
97+
if span['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table] or any(
98+
s['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table] for s in current_line):
9899
# 传入
99-
if span["type"] in ["displayed_equation", "image", "table"]:
100+
if span["type"] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table]:
100101
displayed_list.append(span)
101102
# 则开始新行
102103
lines.append(current_line)
103-
if len(current_line) > 1 or current_line[0]["type"] in ["text", "inline_equation"]:
104+
if len(current_line) > 1 or current_line[0]["type"] in [ContentType.Text, ContentType.InlineEquation]:
104105
text_inline_lines.append((current_line, (line_first_y0, line_first_y)))
105106
current_line = [span]
106107
line_first_y0 = span["bbox"][1]
@@ -125,7 +126,7 @@ def modify_y_axis(spans: list, displayed_list: list, text_inline_lines: list):
125126
# 添加最后一行
126127
if current_line:
127128
lines.append(current_line)
128-
if len(current_line) > 1 or current_line[0]["type"] in ["text", "inline_equation"]:
129+
if len(current_line) > 1 or current_line[0]["type"] in [ContentType.Text, ContentType.InlineEquation]:
129130
text_inline_lines.append((current_line, (line_first_y0, line_first_y)))
130131
for line in text_inline_lines:
131132
# 按照x0坐标排序
@@ -159,18 +160,18 @@ def modify_inline_equation(spans: list, displayed_list: list, text_inline_lines:
159160
span['bbox'], (0, y0, 0, y1)):
160161

161162
# 调整公式类型
162-
if span["type"] == "displayed_equation":
163+
if span["type"] == ContentType.InterlineEquation:
163164
# 最后一行是行间公式
164165
if j + 1 >= len(text_inline_lines):
165-
span["type"] = "inline_equation"
166+
span["type"] = ContentType.InlineEquation
166167
span["bbox"][1] = y0
167168
span["bbox"][3] = y1
168169
else:
169170
# 行间公式旁边有多行文字或者行间公式比文字高3倍则不转换
170171
y0_next, y1_next = text_inline_lines[j + 1][1]
171172
if not __is_overlaps_y_exceeds_threshold(span['bbox'], (0, y0_next, 0, y1_next)) and 3 * (
172173
y1 - y0) > span_y - span_y0:
173-
span["type"] = "inline_equation"
174+
span["type"] = ContentType.InlineEquation
174175
span["bbox"][1] = y0
175176
span["bbox"][3] = y1
176177
break
@@ -193,13 +194,13 @@ def get_qa_need_list(blocks):
193194
for block in blocks:
194195
for line in block["lines"]:
195196
for span in line["spans"]:
196-
if span["type"] == "image":
197+
if span["type"] == ContentType.Image:
197198
images.append(span)
198-
elif span["type"] == "table":
199+
elif span["type"] == ContentType.Table:
199200
tables.append(span)
200-
elif span["type"] == "inline_equation":
201+
elif span["type"] == ContentType.InlineEquation:
201202
inline_equations.append(span)
202-
elif span["type"] == "displayed_equation":
203+
elif span["type"] == ContentType.InterlineEquation:
203204
interline_equations.append(span)
204205
else:
205206
continue

0 commit comments

Comments
 (0)