diff --git a/magic_pdf/pipeline_txt.py b/magic_pdf/pipeline_txt.py
index 8d147c81..d5f23c1c 100644
--- a/magic_pdf/pipeline_txt.py
+++ b/magic_pdf/pipeline_txt.py
@@ -2,17 +2,19 @@
 文本型pdf转化为统一清洗格式
 """
 
-
+# TODO: move to the spark/ directory
 from loguru import logger
 
-from magic_pdf.dict2md.mkcontent import mk_universal_format
+from magic_pdf.dict2md.mkcontent import mk_mm_markdown, mk_universal_format
 from magic_pdf.libs.commons import join_path
 from magic_pdf.libs.json_compressor import JsonCompressor
 from magic_pdf.spark.base import exception_handler, get_data_source
 
 
 def txt_pdf_to_standard_format(jso: dict, debug_mode=False) -> dict:
-
+    """
+    Convert to the unified standard format.
+    """
     if debug_mode:
         pass
     else: # 如果debug没开,则检测是否有needdrop字段
@@ -35,3 +37,44 @@ def txt_pdf_to_standard_format(jso: dict, debug_mode=False) -> dict:
     except Exception as e:
         jso = exception_handler(jso, e)
     return jso
+
+
+def txt_pdf_to_mm_markdown_format(jso: dict, debug_mode=False) -> dict:
+    """
+    Convert a text-type PDF record into multimodal markdown.
+
+    Unless debug_mode is set, a record whose "need_drop" flag is truthy is
+    marked with jso["dropped"] = True and returned without conversion.
+    Otherwise the compressed jso["pdf_intermediate_dict"] is decompressed,
+    passed through mk_universal_format and mk_mm_markdown, and the result
+    is stored in jso["content_list"]; the "doc_layout_result",
+    "pdf_intermediate_dict" and "pdf_meta" fields are then emptied.
+    Any exception raised during conversion is handed to exception_handler,
+    whose return value replaces jso.
+
+    :param jso: record dict holding "file_id" and "pdf_intermediate_dict"
+    :param debug_mode: when true, the need_drop check is skipped
+    :return: the updated record dict
+    """
+    # Outside debug mode, honour the need_drop flag before doing any work.
+    if not debug_mode and jso.get("need_drop", False):
+        book_name = join_path(get_data_source(jso), jso["file_id"])
+        logger.info(f"book_name is:{book_name} need drop")
+        jso["dropped"] = True
+        return jso
+    try:
+        pdf_intermediate_dict = jso["pdf_intermediate_dict"]
+        # Decompress pdf_intermediate_dict.
+        pdf_intermediate_dict = JsonCompressor.decompress_json(pdf_intermediate_dict)
+        standard_format = mk_universal_format(pdf_intermediate_dict)
+        mm_content = mk_mm_markdown(standard_format)
+        jso["content_list"] = mm_content
+        # NOTE(review): the length logged is len(standard_format), while content_list holds mm_content - confirm this is intended.
+        logger.info(f"book_name is:{get_data_source(jso)}/{jso['file_id']},content_list length is {len(standard_format)}")
+        # Clear the fields that are no longer needed.
+        jso["doc_layout_result"] = ""
+        jso["pdf_intermediate_dict"] = ""
+        jso["pdf_meta"] = ""
+    except Exception as e:
+        jso = exception_handler(jso, e)
+    return jso