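data_type/bookid/data_source兼容处理 (compatibility handling for the data_type/bookid/data_source fields: fall back to file_type/original_file_id/file_source when the primary key is absent)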

opendatalab · Mar 14, 2024 · 8a52ada · 8a52ada
1 parent 26c2378
commit 8a52ada
Show file tree

Hide file tree

Showing 2 changed files with 34 additions and 9 deletions.
diff --git a/magic_pdf/libs/draw_bbox.py b/magic_pdf/libs/draw_bbox.py
@@ -36,7 +36,7 @@ def draw_layout_bbox(pdf_info_dict, input_path, out_path):
         for layout in page['layout_bboxes']:
             page_layout_list.append(layout['layout_bbox'])
         layout_bbox_list.append(page_layout_list)
-        for drop_tag, dropped_bboxes in page['dropped_bboxes'].items():
+        for drop_tag, dropped_bboxes in page['droped_bboxes'].items():
             for dropped_bbox in dropped_bboxes:
                 page_dropped_list.append(dropped_bbox)
         dropped_bbox_list.append(page_dropped_list)

diff --git a/magic_pdf/pipeline.py b/magic_pdf/pipeline.py
@@ -23,6 +23,27 @@ def exception_handler(jso: dict, e):
     return jso
 
 
+def get_data_type(jso: dict):
+    data_type = jso.get('data_type')
+    if data_type is None:
+        data_type = jso.get('file_type')
+    return data_type
+
+
+def get_bookid(jso: dict):
+    book_id = jso.get('bookid')
+    if book_id is None:
+        book_id = jso.get('original_file_id')
+    return book_id
+
+
+def get_data_source(jso: dict):
+    data_source = jso.get('data_source')
+    if data_source is None:
+        data_source = jso.get('file_source')
+    return data_source
+
+
 def meta_scan(jso: dict, doc_layout_check=True) -> dict:
     s3_pdf_path = jso.get('file_location')
     s3_config = get_s3_config(s3_pdf_path)
@@ -32,7 +53,7 @@ def meta_scan(jso: dict, doc_layout_check=True) -> dict:
             jso['drop_reason'] = DropReason.MISS_DOC_LAYOUT_RESULT
             return jso
     try:
-        data_source = jso.get('data_source')
+        data_source = get_data_source(jso)
         file_id = jso.get('file_id')
         book_name = data_source + "/" + file_id
 
@@ -78,7 +99,7 @@ def classify_by_type(jso: dict, debug_mode=False) -> dict:
     # 开始正式逻辑
     try:
         pdf_meta = jso.get('pdf_meta')
-        data_source = jso.get('data_source')
+        data_source = get_data_source(jso)
         file_id = jso.get('file_id')
         book_name = data_source + "/" + file_id
         total_page = pdf_meta["total_page"]
@@ -140,11 +161,11 @@ def save_tables_to_s3(jso: dict, debug_mode=False) -> dict:
         pass
     else:# 如果debug没开，则检测是否有needdrop字段
         if jso.get('need_drop', False):
-            logger.info(f"book_name is:{jso['data_source']}/{jso['file_id']} need drop", file=sys.stderr)
+            logger.info(f"book_name is:{get_data_source(jso)}/{jso['file_id']} need drop", file=sys.stderr)
             jso["dropped"] = True
             return jso
     try:
-        data_source = jso.get('data_source')
+        data_source = get_data_source(jso)
         file_id = jso.get('file_id')
         book_name = data_source + "/" + file_id
         title = jso.get('title')
@@ -195,7 +216,7 @@ def save_tables_to_s3(jso: dict, debug_mode=False) -> dict:
 
 def drop_needdrop_pdf(jso: dict) -> dict:
     if jso.get('need_drop', False):
-        logger.info(f"book_name is:{jso['data_source']}/{jso['file_id']} need drop", file=sys.stderr)
+        logger.info(f"book_name is:{get_data_source(jso)}/{jso['file_id']} need drop", file=sys.stderr)
         jso["dropped"] = True
     return jso
 
@@ -206,7 +227,7 @@ def pdf_intermediate_dict_to_markdown(jso: dict, debug_mode=False) -> dict:
         pass
     else:# 如果debug没开，则检测是否有needdrop字段
         if jso.get('need_drop', False):
-            book_name = join_path(jso['data_source'], jso['file_id'])
+            book_name = join_path(get_data_source(jso), jso['file_id'])
             logger.info(f"book_name is:{book_name} need drop", file=sys.stderr)
             jso["dropped"] = True
             return jso
@@ -216,7 +237,7 @@ def pdf_intermediate_dict_to_markdown(jso: dict, debug_mode=False) -> dict:
         pdf_intermediate_dict = JsonCompressor.decompress_json(pdf_intermediate_dict)
         markdown_content = mk_nlp_markdown(pdf_intermediate_dict)
         jso["content"] = markdown_content
-        logger.info(f"book_name is:{jso['data_source']}/{jso['file_id']},markdown content length is {len(markdown_content)}", file=sys.stderr)
+        logger.info(f"book_name is:{get_data_source(jso)}/{jso['file_id']},markdown content length is {len(markdown_content)}", file=sys.stderr)
         # 把无用的信息清空
         jso["doc_layout_result"] = ""
         jso["pdf_intermediate_dict"] = ""
@@ -237,7 +258,7 @@ def parse_pdf(jso: dict, start_page_id=0, debug_mode=False) -> dict:
     s3_pdf_path = jso.get('file_location')
     s3_config = get_s3_config(s3_pdf_path)
     model_output_json_list = jso.get('doc_layout_result')
-    data_source = jso.get('data_source')
+    data_source = get_data_source(jso)
     file_id = jso.get('file_id')
     book_name = data_source + "/" + file_id
 
@@ -290,5 +311,9 @@ def parse_pdf(jso: dict, start_page_id=0, debug_mode=False) -> dict:
     return jso
 
 
+def ocr_parse_pdf(jso: dict, start_page_id=0, debug_mode=False) -> dict:
+    pass
+
+
 if __name__ == "__main__":
     pass