@@ -77,70 +77,73 @@ def adjust_bbox_for_standalone_block(spans):
77
77
78
78
def modify_y_axis(spans: list, displayed_list: list, text_inline_lines: list):
    """Group spans into visual lines and unify the vertical extent of each line.

    All three arguments are mutated in place; nothing is returned.

    Args:
        spans: Span dicts with a ``"type"`` key and a mutable ``"bbox"``
            sequence indexed as ``[x0, y0, x1, y1]``. Sorted in place by
            ``bbox[1]``. When empty, the function returns immediately and
            touches nothing.
        displayed_list: Receives every span whose type is interline
            equation, image or table.
        text_inline_lines: Receives one ``(line_spans, (y0, y1))`` tuple per
            detected text line. After grouping, every entry in this list
            (including entries that were already present when the function
            was called) has its spans sorted by ``bbox[0]`` and their
            ``bbox[1]`` / ``bbox[3]`` overwritten with the line's ``y0`` /
            ``y1``.
    """
    # Nothing to group; leave the output lists untouched.
    if not spans:
        return

    # Span types that always occupy a line of their own.
    standalone_types = (
        ContentType.InterlineEquation,
        ContentType.Image,
        ContentType.Table,
    )
    # Span types that make a single-span line count as a text line.
    inline_types = (ContentType.Text, ContentType.InlineEquation)

    spans.sort(key=lambda span: span['bbox'][1])

    first_span = spans[0]
    if first_span["type"] in standalone_types:
        displayed_list.append(first_span)

    current_line = [first_span]
    line_first_y0 = first_span["bbox"][1]
    line_first_y = first_span["bbox"][3]

    for span in spans[1:]:
        # A standalone span, or a span that follows one, always starts a new line.
        if span["type"] in standalone_types or any(
                s["type"] in standalone_types for s in current_line):
            if span["type"] in standalone_types:
                displayed_list.append(span)
            # Only record the finished line if it is a text line.
            if len(current_line) > 1 or current_line[0]["type"] in inline_types:
                text_inline_lines.append((current_line, (line_first_y0, line_first_y)))
            current_line = [span]
            line_first_y0 = span["bbox"][1]
            line_first_y = span["bbox"][3]
            continue

        # The span overlaps the last span of the current line on the y axis:
        # it belongs to the same line.
        if __is_overlaps_y_exceeds_threshold(span['bbox'], current_line[-1]['bbox']):
            # A text span redefines the vertical extent used for the whole line.
            # NOTE(review): compares against the literal "text" while the rest of
            # the function uses ContentType members; confirm they are equal.
            if span["type"] == "text":
                line_first_y0 = span["bbox"][1]
                line_first_y = span["bbox"][3]
            current_line.append(span)
        else:
            # No overlap: close the current line and start a new one.
            text_inline_lines.append((current_line, (line_first_y0, line_first_y)))
            current_line = [span]
            line_first_y0 = span["bbox"][1]
            line_first_y = span["bbox"][3]

    # Flush the last line (current_line is never empty at this point).
    if len(current_line) > 1 or current_line[0]["type"] in inline_types:
        text_inline_lines.append((current_line, (line_first_y0, line_first_y)))

    for line_spans, (line_y0, line_y1) in text_inline_lines:
        # Order the spans of each line left to right.
        line_spans.sort(key=lambda span: span['bbox'][0])
        # Give every span in the line the same vertical extent.
        for span in line_spans:
            span["bbox"][1] = line_y0
            span["bbox"][3] = line_y1
144
147
145
148
146
149
def modify_inline_equation (spans : list , displayed_list : list , text_inline_lines : list ):
0 commit comments