ocr模式更新spark pipeline

opendatalab · Mar 14, 2024 · 9bd6294 · 9bd6294
1 parent 8a52ada
commit 9bd6294
Show file tree

Hide file tree

Showing 2 changed files with 75 additions and 3 deletions.
diff --git a/magic_pdf/dict2md/ocr_mkcontent.py b/magic_pdf/dict2md/ocr_mkcontent.py
@@ -1,7 +1,7 @@
 from magic_pdf.libs.ocr_content_type import ContentType
 
 
-def mk_nlp_markdown(pdf_info_dict: dict):
+def ocr_mk_nlp_markdown(pdf_info_dict: dict):
     markdown = []
 
     for _, page_info in pdf_info_dict.items():
@@ -25,7 +25,7 @@ def mk_nlp_markdown(pdf_info_dict: dict):
     return '\n'.join(markdown)
 
 
-def mk_mm_markdown(pdf_info_dict: dict):
+def ocr_mk_mm_markdown(pdf_info_dict: dict):
 
     markdown = []
 

diff --git a/magic_pdf/pipeline.py b/magic_pdf/pipeline.py
@@ -3,6 +3,7 @@
 import time
 from urllib.parse import quote
 
+from magic_pdf.dict2md.ocr_mkcontent import ocr_mk_nlp_markdown, ocr_mk_mm_markdown
 from magic_pdf.libs.commons import read_file, join_path, parse_bucket_key, formatted_time
 from magic_pdf.libs.drop_reason import DropReason
 from magic_pdf.libs.json_compressor import JsonCompressor
@@ -13,6 +14,7 @@
 from loguru import logger
 
 from app.common.s3 import get_s3_config, get_s3_client
+from magic_pdf.pdf_parse_by_ocr import parse_pdf_by_ocr
 
 
 def exception_handler(jso: dict, e):
@@ -312,7 +314,77 @@ def parse_pdf(jso: dict, start_page_id=0, debug_mode=False) -> dict:
 
 
 def ocr_parse_pdf(jso: dict, start_page_id=0, debug_mode=False) -> dict:
-    pass
+    # 检测debug开关
+    if debug_mode:
+        pass
+    else:  # 如果debug没开，则检测是否有needdrop字段
+        if jso.get('need_drop', False):
+            return jso
+
+    s3_pdf_path = jso.get('file_location')
+    s3_config = get_s3_config(s3_pdf_path)
+    model_output_json_list = jso.get('doc_layout_result')
+    data_source = get_data_source(jso)
+    file_id = jso.get('file_id')
+    book_name = data_source + "/" + file_id
+    try:
+        save_path = "s3://mllm-raw-media/pdf2md_img/"
+        image_s3_config = get_s3_config(save_path)
+        start_time = time.time()  # 记录开始时间
+        # 先打印一下book_name和解析开始的时间
+        logger.info(f"book_name is:{book_name},start_time is:{formatted_time(start_time)}", file=sys.stderr)
+        pdf_info_dict = parse_pdf_by_ocr(
+            s3_pdf_path,
+            s3_config,
+            model_output_json_list,
+            save_path,
+            book_name,
+            pdf_model_profile=None,
+            image_s3_config=image_s3_config,
+            start_page_id=start_page_id,
+            debug_mode=debug_mode
+        )
+        if pdf_info_dict.get('need_drop', False):  # 如果返回的字典里有need_drop，则提取drop_reason并跳过本次解析
+            jso['need_drop'] = True
+            jso['drop_reason'] = pdf_info_dict["drop_reason"]
+        else:  # 正常返回，将 pdf_info_dict 压缩并存储
+            pdf_info_dict = JsonCompressor.compress_json(pdf_info_dict)
+            jso['pdf_intermediate_dict'] = pdf_info_dict
+        end_time = time.time()  # 记录完成时间
+        parse_time = int(end_time - start_time)  # 计算执行时间
+        # 解析完成后打印一下book_name和耗时
+        logger.info(f"book_name is:{book_name},end_time is:{formatted_time(end_time)},cost_time is:{parse_time}",
+                    file=sys.stderr)
+        jso['parse_time'] = parse_time
+    except Exception as e:
+        jso = exception_handler(jso, e)
+    return jso
+
+
+def ocr_pdf_intermediate_dict_to_markdown(jso: dict, debug_mode=False) -> dict:
+
+    if debug_mode:
+        pass
+    else:  # 如果debug没开，则检测是否有needdrop字段
+        if jso.get('need_drop', False):
+            book_name = join_path(get_data_source(jso), jso['file_id'])
+            logger.info(f"book_name is:{book_name} need drop", file=sys.stderr)
+            jso["dropped"] = True
+            return jso
+    try:
+        pdf_intermediate_dict = jso['pdf_intermediate_dict']
+        # 将 pdf_intermediate_dict 解压
+        pdf_intermediate_dict = JsonCompressor.decompress_json(pdf_intermediate_dict)
+        markdown_content = ocr_mk_mm_markdown(pdf_intermediate_dict)
+        jso["content"] = markdown_content
+        logger.info(f"book_name is:{get_data_source(jso)}/{jso['file_id']},markdown content length is {len(markdown_content)}", file=sys.stderr)
+        # 把无用的信息清空
+        jso["doc_layout_result"] = ""
+        jso["pdf_intermediate_dict"] = ""
+        jso["pdf_meta"] = ""
+    except Exception as e:
+        jso = exception_handler(jso, e)
+    return jso
 
 
 if __name__ == "__main__":