2
2
3
3
from magic_pdf .libs .boxbase import calculate_overlap_area_in_bbox1_area_ratio , get_minbox_if_overlap_by_ratio , \
4
4
__is_overlaps_y_exceeds_threshold
5
+ from magic_pdf .libs .ocr_content_type import ContentType
5
6
6
7
7
8
def remove_overlaps_min_spans (spans ):
@@ -49,22 +50,22 @@ def remove_spans_by_bboxes_dict(spans, need_remove_spans_bboxes_dict):
49
50
for span in need_remove_spans :
50
51
spans .remove (span )
51
52
span ['tag' ] = drop_tag
52
- if span ['type' ] in ['text' , 'inline_equation' , 'displayed_equation' ]:
53
+ if span ['type' ] in [ContentType . Text , ContentType . InlineEquation , ContentType . InterlineEquation ]:
53
54
dropped_text_block .append (span )
54
- elif span ['type' ] == 'image' :
55
+ elif span ['type' ] == ContentType . Image :
55
56
dropped_image_block .append (span )
56
- elif span ['type' ] == 'table' :
57
+ elif span ['type' ] == ContentType . Table :
57
58
dropped_table_block .append (span )
58
59
59
60
return spans , dropped_text_block , dropped_image_block , dropped_table_block
60
61
61
62
62
63
def adjust_bbox_for_standalone_block (spans ):
63
- # 对tpye=["displayed_equation ", "image", "table"]进行额外处理,如果左边有字的话,将该span的bbox中y0调整至不高于文字的y0
64
+ # 对type=["interline_equation ", "image", "table"]进行额外处理,如果左边有字的话,将该span的bbox中y0调整至不高于文字的y0
64
65
for sb_span in spans :
65
- if sb_span ['type' ] in ["displayed_equation" , "image" , "table" ]:
66
+ if sb_span ['type' ] in [ContentType . InterlineEquation , ContentType . Image , ContentType . Table ]:
66
67
for text_span in spans :
67
- if text_span ['type' ] in ['text' , 'inline_equation' ]:
68
+ if text_span ['type' ] in [ContentType . Text , ContentType . InlineEquation ]:
68
69
# 判断span2的纵向高度是否被span所覆盖
69
70
if sb_span ['bbox' ][1 ] < text_span ['bbox' ][1 ] and sb_span ['bbox' ][3 ] > text_span ['bbox' ][3 ]:
70
71
# 判断span2是否在span左边
@@ -81,7 +82,7 @@ def modify_y_axis(spans: list, displayed_list: list, text_inline_lines: list):
81
82
82
83
lines = []
83
84
current_line = [spans [0 ]]
84
- if spans [0 ]["type" ] in ["displayed_equation" , "image" , "table" ]:
85
+ if spans [0 ]["type" ] in [ContentType . InterlineEquation , ContentType . Image , ContentType . Table ]:
85
86
displayed_list .append (spans [0 ])
86
87
87
88
line_first_y0 = spans [0 ]["bbox" ][1 ]
@@ -91,16 +92,16 @@ def modify_y_axis(spans: list, displayed_list: list, text_inline_lines: list):
91
92
for span in spans [1 :]:
92
93
# if span.get("content","") == "78.":
93
94
# print("debug")
94
- # 如果当前的span类型为"displayed_equation " 或者 当前行中已经有"displayed_equation "
95
+ # 如果当前的span类型为"interline_equation " 或者 当前行中已经有"interline_equation "
95
96
# image和table类型,同上
96
- if span ['type' ] in ["displayed_equation" , "image" , "table" ] or any (
97
- s ['type' ] in ["displayed_equation" , "image" , "table" ] for s in current_line ):
97
+ if span ['type' ] in [ContentType . InterlineEquation , ContentType . Image , ContentType . Table ] or any (
98
+ s ['type' ] in [ContentType . InterlineEquation , ContentType . Image , ContentType . Table ] for s in current_line ):
98
99
# 传入
99
- if span ["type" ] in ["displayed_equation" , "image" , "table" ]:
100
+ if span ["type" ] in [ContentType . InterlineEquation , ContentType . Image , ContentType . Table ]:
100
101
displayed_list .append (span )
101
102
# 则开始新行
102
103
lines .append (current_line )
103
- if len (current_line ) > 1 or current_line [0 ]["type" ] in ["text" , "inline_equation" ]:
104
+ if len (current_line ) > 1 or current_line [0 ]["type" ] in [ContentType . Text , ContentType . InlineEquation ]:
104
105
text_inline_lines .append ((current_line , (line_first_y0 , line_first_y )))
105
106
current_line = [span ]
106
107
line_first_y0 = span ["bbox" ][1 ]
@@ -125,7 +126,7 @@ def modify_y_axis(spans: list, displayed_list: list, text_inline_lines: list):
125
126
# 添加最后一行
126
127
if current_line :
127
128
lines .append (current_line )
128
- if len (current_line ) > 1 or current_line [0 ]["type" ] in ["text" , "inline_equation" ]:
129
+ if len (current_line ) > 1 or current_line [0 ]["type" ] in [ContentType . Text , ContentType . InlineEquation ]:
129
130
text_inline_lines .append ((current_line , (line_first_y0 , line_first_y )))
130
131
for line in text_inline_lines :
131
132
# 按照x0坐标排序
@@ -159,18 +160,18 @@ def modify_inline_equation(spans: list, displayed_list: list, text_inline_lines:
159
160
span ['bbox' ], (0 , y0 , 0 , y1 )):
160
161
161
162
# 调整公式类型
162
- if span ["type" ] == "displayed_equation" :
163
+ if span ["type" ] == ContentType . InterlineEquation :
163
164
# 最后一行是行间公式
164
165
if j + 1 >= len (text_inline_lines ):
165
- span ["type" ] = "inline_equation"
166
+ span ["type" ] = ContentType . InlineEquation
166
167
span ["bbox" ][1 ] = y0
167
168
span ["bbox" ][3 ] = y1
168
169
else :
169
170
# 行间公式旁边有多行文字或者行间公式比文字高3倍则不转换
170
171
y0_next , y1_next = text_inline_lines [j + 1 ][1 ]
171
172
if not __is_overlaps_y_exceeds_threshold (span ['bbox' ], (0 , y0_next , 0 , y1_next )) and 3 * (
172
173
y1 - y0 ) > span_y - span_y0 :
173
- span ["type" ] = "inline_equation"
174
+ span ["type" ] = ContentType . InlineEquation
174
175
span ["bbox" ][1 ] = y0
175
176
span ["bbox" ][3 ] = y1
176
177
break
@@ -193,13 +194,13 @@ def get_qa_need_list(blocks):
193
194
for block in blocks :
194
195
for line in block ["lines" ]:
195
196
for span in line ["spans" ]:
196
- if span ["type" ] == "image" :
197
+ if span ["type" ] == ContentType . Image :
197
198
images .append (span )
198
- elif span ["type" ] == "table" :
199
+ elif span ["type" ] == ContentType . Table :
199
200
tables .append (span )
200
- elif span ["type" ] == "inline_equation" :
201
+ elif span ["type" ] == ContentType . InlineEquation :
201
202
inline_equations .append (span )
202
- elif span ["type" ] == "displayed_equation" :
203
+ elif span ["type" ] == ContentType . InterlineEquation :
203
204
interline_equations .append (span )
204
205
else :
205
206
continue
0 commit comments