Skip to content

Commit a013564

Browse files
committed
修复 spans 为空列表时访问 spans[0] 导致的 IndexError: list index out of range (fix the IndexError raised by spans[0] when spans is an empty list)
1 parent f10b4a5 commit a013564

File tree

2 files changed

+89
-83
lines changed

2 files changed

+89
-83
lines changed

magic_pdf/pre_proc/ocr_dict_merge.py

+29-26
Original file line number | Diff line number | Diff line change
@@ -24,34 +24,37 @@ def line_sort_spans_by_left_to_right(lines):
2424
return line_objects
2525

2626
def merge_spans_to_line(spans):
27-
# 按照y0坐标排序
28-
spans.sort(key=lambda span: span['bbox'][1])
29-
30-
lines = []
31-
current_line = [spans[0]]
32-
for span in spans[1:]:
33-
# 如果当前的span类型为"interline_equation" 或者 当前行中已经有"interline_equation"
34-
# image和table类型,同上
35-
if span['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table] or any(
36-
s['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table] for s in current_line):
37-
# 则开始新行
38-
lines.append(current_line)
39-
current_line = [span]
40-
continue
41-
42-
# 如果当前的span与当前行的最后一个span在y轴上重叠,则添加到当前行
43-
if __is_overlaps_y_exceeds_threshold(span['bbox'], current_line[-1]['bbox']):
44-
current_line.append(span)
45-
else:
46-
# 否则,开始新行
27+
if len(spans) == 0:
28+
return []
29+
else:
30+
# 按照y0坐标排序
31+
spans.sort(key=lambda span: span['bbox'][1])
32+
33+
lines = []
34+
current_line = [spans[0]]
35+
for span in spans[1:]:
36+
# 如果当前的span类型为"interline_equation" 或者 当前行中已经有"interline_equation"
37+
# image和table类型,同上
38+
if span['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table] or any(
39+
s['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table] for s in current_line):
40+
# 则开始新行
41+
lines.append(current_line)
42+
current_line = [span]
43+
continue
44+
45+
# 如果当前的span与当前行的最后一个span在y轴上重叠,则添加到当前行
46+
if __is_overlaps_y_exceeds_threshold(span['bbox'], current_line[-1]['bbox']):
47+
current_line.append(span)
48+
else:
49+
# 否则,开始新行
50+
lines.append(current_line)
51+
current_line = [span]
52+
53+
# 添加最后一行
54+
if current_line:
4755
lines.append(current_line)
48-
current_line = [span]
4956

50-
# 添加最后一行
51-
if current_line:
52-
lines.append(current_line)
53-
54-
return lines
57+
return lines
5558

5659
def merge_spans_to_line_by_layout(spans, layout_bboxes):
5760
lines = []

magic_pdf/pre_proc/ocr_span_list_modify.py

+60-57
Original file line number | Diff line number | Diff line change
@@ -77,70 +77,73 @@ def adjust_bbox_for_standalone_block(spans):
7777

7878
def modify_y_axis(spans: list, displayed_list: list, text_inline_lines: list):
7979
# displayed_list = []
80+
# 如果spans为空,则不处理
81+
if len(spans) == 0:
82+
pass
83+
else:
84+
spans.sort(key=lambda span: span['bbox'][1])
85+
86+
lines = []
87+
current_line = [spans[0]]
88+
if spans[0]["type"] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table]:
89+
displayed_list.append(spans[0])
90+
91+
line_first_y0 = spans[0]["bbox"][1]
92+
line_first_y = spans[0]["bbox"][3]
93+
# 用于给行间公式搜索
94+
# text_inline_lines = []
95+
for span in spans[1:]:
96+
# if span.get("content","") == "78.":
97+
# print("debug")
98+
# 如果当前的span类型为"interline_equation" 或者 当前行中已经有"interline_equation"
99+
# image和table类型,同上
100+
if span['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table] or any(
101+
s['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table] for s in current_line):
102+
# 传入
103+
if span["type"] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table]:
104+
displayed_list.append(span)
105+
# 则开始新行
106+
lines.append(current_line)
107+
if len(current_line) > 1 or current_line[0]["type"] in [ContentType.Text, ContentType.InlineEquation]:
108+
text_inline_lines.append((current_line, (line_first_y0, line_first_y)))
109+
current_line = [span]
110+
line_first_y0 = span["bbox"][1]
111+
line_first_y = span["bbox"][3]
112+
continue
80113

81-
spans.sort(key=lambda span: span['bbox'][1])
82-
83-
lines = []
84-
current_line = [spans[0]]
85-
if spans[0]["type"] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table]:
86-
displayed_list.append(spans[0])
114+
# 如果当前的span与当前行的最后一个span在y轴上重叠,则添加到当前行
115+
if __is_overlaps_y_exceeds_threshold(span['bbox'], current_line[-1]['bbox']):
116+
if span["type"] == "text":
117+
line_first_y0 = span["bbox"][1]
118+
line_first_y = span["bbox"][3]
119+
current_line.append(span)
87120

88-
line_first_y0 = spans[0]["bbox"][1]
89-
line_first_y = spans[0]["bbox"][3]
90-
# 用于给行间公式搜索
91-
# text_inline_lines = []
92-
for span in spans[1:]:
93-
# if span.get("content","") == "78.":
94-
# print("debug")
95-
# 如果当前的span类型为"interline_equation" 或者 当前行中已经有"interline_equation"
96-
# image和table类型,同上
97-
if span['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table] or any(
98-
s['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table] for s in current_line):
99-
# 传入
100-
if span["type"] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table]:
101-
displayed_list.append(span)
102-
# 则开始新行
103-
lines.append(current_line)
104-
if len(current_line) > 1 or current_line[0]["type"] in [ContentType.Text, ContentType.InlineEquation]:
121+
else:
122+
# 否则,开始新行
123+
lines.append(current_line)
105124
text_inline_lines.append((current_line, (line_first_y0, line_first_y)))
106-
current_line = [span]
107-
line_first_y0 = span["bbox"][1]
108-
line_first_y = span["bbox"][3]
109-
continue
110-
111-
# 如果当前的span与当前行的最后一个span在y轴上重叠,则添加到当前行
112-
if __is_overlaps_y_exceeds_threshold(span['bbox'], current_line[-1]['bbox']):
113-
if span["type"] == "text":
125+
current_line = [span]
114126
line_first_y0 = span["bbox"][1]
115127
line_first_y = span["bbox"][3]
116-
current_line.append(span)
117128

118-
else:
119-
# 否则,开始新行
129+
# 添加最后一行
130+
if current_line:
120131
lines.append(current_line)
121-
text_inline_lines.append((current_line, (line_first_y0, line_first_y)))
122-
current_line = [span]
123-
line_first_y0 = span["bbox"][1]
124-
line_first_y = span["bbox"][3]
125-
126-
# 添加最后一行
127-
if current_line:
128-
lines.append(current_line)
129-
if len(current_line) > 1 or current_line[0]["type"] in [ContentType.Text, ContentType.InlineEquation]:
130-
text_inline_lines.append((current_line, (line_first_y0, line_first_y)))
131-
for line in text_inline_lines:
132-
# 按照x0坐标排序
133-
current_line = line[0]
134-
current_line.sort(key=lambda span: span['bbox'][0])
135-
136-
# 调整每一个文字行内bbox统一
137-
for line in text_inline_lines:
138-
current_line, (line_first_y0, line_first_y) = line
139-
for span in current_line:
140-
span["bbox"][1] = line_first_y0
141-
span["bbox"][3] = line_first_y
142-
143-
# return spans, displayed_list, text_inline_lines
132+
if len(current_line) > 1 or current_line[0]["type"] in [ContentType.Text, ContentType.InlineEquation]:
133+
text_inline_lines.append((current_line, (line_first_y0, line_first_y)))
134+
for line in text_inline_lines:
135+
# 按照x0坐标排序
136+
current_line = line[0]
137+
current_line.sort(key=lambda span: span['bbox'][0])
138+
139+
# 调整每一个文字行内bbox统一
140+
for line in text_inline_lines:
141+
current_line, (line_first_y0, line_first_y) = line
142+
for span in current_line:
143+
span["bbox"][1] = line_first_y0
144+
span["bbox"][3] = line_first_y
145+
146+
# return spans, displayed_list, text_inline_lines
144147

145148

146149
def modify_inline_equation(spans: list, displayed_list: list, text_inline_lines: list):

0 commit comments

Comments
 (0)