Skip to content

Commit 8a2736a

Browse files
committed
截图增加s3上传逻辑,移除宽或高为0的spans
1 parent 0b35b73 commit 8a2736a

File tree

3 files changed

+11
-11
lines changed

3 files changed

+11
-11
lines changed

demo/ocr_demo.py

+7 -7
Original file line number | Diff line number | Diff line change
@@ -4,7 +4,7 @@
44
from loguru import logger
55
from pathlib import Path
66

7-
from magic_pdf.dict2md.ocr_mkcontent import mk_nlp_markdown, mk_mm_markdown
7+
from magic_pdf.dict2md.ocr_mkcontent import ocr_mk_nlp_markdown, ocr_mk_mm_markdown
88
from magic_pdf.libs.commons import join_path
99
from magic_pdf.pdf_parse_by_ocr import parse_pdf_by_ocr
1010

@@ -30,12 +30,12 @@ def read_json_file(file_path):
3030

3131

3232
if __name__ == '__main__':
33-
# ocr_pdf_path = r"D:\project\20231108code-clean\ocr\new\双栏\s0043-1354(02)00581-x.pdf"
34-
# ocr_json_file_path = r"D:\project\20231108code-clean\ocr\new\双栏\s0043-1354(02)00581-x.json"
33+
ocr_pdf_path = r"D:\project\20231108code-clean\ocr\new\双栏\s0043-1354(02)00581-x.pdf"
34+
ocr_json_file_path = r"D:\project\20231108code-clean\ocr\new\双栏\s0043-1354(02)00581-x.json"
3535
# ocr_pdf_path = r"D:\project\20231108code-clean\ocr\new\双栏\j.1540-627x.2006.00176.x.pdf"
3636
# ocr_json_file_path = r"D:\project\20231108code-clean\ocr\new\双栏\j.1540-627x.2006.00176.x.json"
37-
ocr_pdf_path = r"D:\project\20231108code-clean\ocr\new\demo_4\ocr_demo\ocr_1_org.pdf"
38-
ocr_json_file_path = r"D:\project\20231108code-clean\ocr\new\demo_4\ocr_demo\ocr_1.json"
37+
# ocr_pdf_path = r"D:\project\20231108code-clean\ocr\new\demo_4\ocr_demo\ocr_1_org.pdf"
38+
# ocr_json_file_path = r"D:\project\20231108code-clean\ocr\new\demo_4\ocr_demo\ocr_1_fix.json"
3939
try:
4040
ocr_pdf_model_info = read_json_file(ocr_json_file_path)
4141
pth = Path(ocr_json_file_path)
@@ -56,8 +56,8 @@ def read_json_file(file_path):
5656
if not os.path.exists(parent_dir):
5757
os.makedirs(parent_dir)
5858

59-
# markdown_content = mk_nlp_markdown(pdf_info_dict)
60-
markdown_content = mk_mm_markdown(pdf_info_dict)
59+
# markdown_content = ocr_mk_nlp_markdown(pdf_info_dict)
60+
markdown_content = ocr_mk_mm_markdown(pdf_info_dict)
6161

6262
with open(text_content_save_path, "w", encoding="utf-8") as f:
6363
f.write(markdown_content)

magic_pdf/pdf_parse_by_ocr.py

+1 -1
Original file line number | Diff line number | Diff line change
@@ -208,7 +208,7 @@ def parse_pdf_by_ocr(
208208
spans, dropped_text_block, dropped_image_block, dropped_table_block = remove_spans_by_bboxes_dict(spans, need_remove_spans_bboxes_dict)
209209

210210
# 对image和table截图
211-
spans = cut_image_and_table(spans, page, page_id, book_name, save_path)
211+
spans = cut_image_and_table(spans, page, page_id, book_name, save_path, img_s3_client)
212212

213213
# 行内公式调整, 高度调整至与同行文字高度一致(优先左侧, 其次右侧)
214214
displayed_list = []

magic_pdf/pre_proc/ocr_cut_image.py

+3 -3
Original file line number | Diff line number | Diff line change
@@ -3,7 +3,7 @@
33
from magic_pdf.libs.pdf_image_tools import cut_image
44

55

6-
def cut_image_and_table(spans, page, page_id, book_name, save_path):
6+
def cut_image_and_table(spans, page, page_id, book_name, save_path, img_s3_client):
77
def s3_return_path(type):
88
return join_path(book_name, type)
99

@@ -13,8 +13,8 @@ def img_save_path(type):
1313
for span in spans:
1414
span_type = span['type']
1515
if span_type == ContentType.Image:
16-
span['image_path'] = cut_image(span['bbox'], page_id, page, img_save_path('images'))
16+
span['image_path'] = cut_image(span['bbox'], page_id, page, img_save_path('images'), s3_return_path=s3_return_path('images'), img_s3_client=img_s3_client)
1717
elif span_type == ContentType.Table:
18-
span['image_path'] = cut_image(span['bbox'], page_id, page, img_save_path('tables'))
18+
span['image_path'] = cut_image(span['bbox'], page_id, page, img_save_path('tables'), s3_return_path=s3_return_path('tables'), img_s3_client=img_s3_client)
1919

2020
return spans

0 commit comments

Comments
 (0)