Skip to content

Commit 4b8dbd7

Browse files
committed
ocr_pdf_intermediate_dict_to_markdown_with_para支持mm和nlp双模式
1 parent d6a5724 commit 4b8dbd7

File tree

1 file changed

+7
-3
lines changed

1 file changed

+7
-3
lines changed

magic_pdf/pipeline_ocr.py

+7 -3
Original file line number | Diff line number | Diff line change
@@ -41,7 +41,7 @@ def ocr_pdf_intermediate_dict_to_markdown(jso: dict, debug_mode=False) -> dict:
4141
return jso
4242

4343

44-
def ocr_pdf_intermediate_dict_to_markdown_with_para(jso: dict, debug_mode=False) -> dict:
44+
def ocr_pdf_intermediate_dict_to_markdown_with_para(jso: dict, mode, debug_mode=False) -> dict:
4545
if debug_mode:
4646
pass
4747
else: # 如果debug没开,则检测是否有needdrop字段
@@ -54,8 +54,12 @@ def ocr_pdf_intermediate_dict_to_markdown_with_para(jso: dict, debug_mode=False)
5454
pdf_intermediate_dict = jso["pdf_intermediate_dict"]
5555
# 将 pdf_intermediate_dict 解压
5656
pdf_intermediate_dict = JsonCompressor.decompress_json(pdf_intermediate_dict)
57-
# markdown_content = ocr_mk_mm_markdown_with_para(pdf_intermediate_dict)
58-
markdown_content = ocr_mk_nlp_markdown_with_para(pdf_intermediate_dict)
57+
58+
if mode == "mm":
59+
markdown_content = ocr_mk_mm_markdown_with_para(pdf_intermediate_dict)
60+
elif mode == "nlp":
61+
markdown_content = ocr_mk_nlp_markdown_with_para(pdf_intermediate_dict)
62+
5963
jso["content"] = markdown_content
6064
logger.info(
6165
f"book_name is:{get_data_source(jso)}/{jso['file_id']},markdown content length is {len(markdown_content)}",

0 commit comments

Comments
 (0)