Skip to content

Commit

Permalink
data_type/bookid/data_source兼容处理
Browse files Browse the repository at this point in the history
  • Loading branch information
myhloli committed Mar 14, 2024
1 parent 26c2378 commit 8a52ada
Show file tree
Hide file tree
Showing 2 changed files with 34 additions and 9 deletions.
2 changes: 1 addition & 1 deletion magic_pdf/libs/draw_bbox.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ def draw_layout_bbox(pdf_info_dict, input_path, out_path):
for layout in page['layout_bboxes']:
page_layout_list.append(layout['layout_bbox'])
layout_bbox_list.append(page_layout_list)
for drop_tag, dropped_bboxes in page['dropped_bboxes'].items():
for drop_tag, dropped_bboxes in page['droped_bboxes'].items():
for dropped_bbox in dropped_bboxes:
page_dropped_list.append(dropped_bbox)
dropped_bbox_list.append(page_dropped_list)
Expand Down
41 changes: 33 additions & 8 deletions magic_pdf/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,27 @@ def exception_handler(jso: dict, e):
return jso


def get_data_type(jso: dict):
    """Return the record's data type, accepting either supported key name.

    The ``'data_type'`` key is consulted first. When it is missing or holds
    ``None``, the value stored under ``'file_type'`` is used instead.

    Args:
        jso: Record dictionary to read from.

    Returns:
        The value under ``'data_type'`` when it is not ``None``; otherwise
        the value under ``'file_type'``, or ``None`` when neither is set.
    """
    data_type = jso.get('data_type')
    if data_type is None:
        # Only a missing/None value triggers the fallback; other falsy
        # values (e.g. an empty string) are returned as-is.
        data_type = jso.get('file_type')
    return data_type


def get_bookid(jso: dict):
    """Return the record's book id, accepting either supported key name.

    The ``'bookid'`` key is consulted first. When it is missing or holds
    ``None``, the value stored under ``'original_file_id'`` is used instead.

    Args:
        jso: Record dictionary to read from.

    Returns:
        The value under ``'bookid'`` when it is not ``None``; otherwise the
        value under ``'original_file_id'``, or ``None`` when neither is set.
    """
    book_id = jso.get('bookid')
    if book_id is None:
        # Only a missing/None value triggers the fallback; other falsy
        # values (e.g. 0 or an empty string) are returned as-is.
        book_id = jso.get('original_file_id')
    return book_id


def get_data_source(jso: dict):
    """Return the record's data source, accepting either supported key name.

    The ``'data_source'`` key is consulted first. When it is missing or
    holds ``None``, the value stored under ``'file_source'`` is used instead.

    Args:
        jso: Record dictionary to read from.

    Returns:
        The value under ``'data_source'`` when it is not ``None``; otherwise
        the value under ``'file_source'``, or ``None`` when neither is set.
    """
    data_source = jso.get('data_source')
    if data_source is None:
        # Only a missing/None value triggers the fallback; other falsy
        # values (e.g. an empty string) are returned as-is.
        data_source = jso.get('file_source')
    return data_source


def meta_scan(jso: dict, doc_layout_check=True) -> dict:
s3_pdf_path = jso.get('file_location')
s3_config = get_s3_config(s3_pdf_path)
Expand All @@ -32,7 +53,7 @@ def meta_scan(jso: dict, doc_layout_check=True) -> dict:
jso['drop_reason'] = DropReason.MISS_DOC_LAYOUT_RESULT
return jso
try:
data_source = jso.get('data_source')
data_source = get_data_source(jso)
file_id = jso.get('file_id')
book_name = data_source + "/" + file_id

Expand Down Expand Up @@ -78,7 +99,7 @@ def classify_by_type(jso: dict, debug_mode=False) -> dict:
# 开始正式逻辑
try:
pdf_meta = jso.get('pdf_meta')
data_source = jso.get('data_source')
data_source = get_data_source(jso)
file_id = jso.get('file_id')
book_name = data_source + "/" + file_id
total_page = pdf_meta["total_page"]
Expand Down Expand Up @@ -140,11 +161,11 @@ def save_tables_to_s3(jso: dict, debug_mode=False) -> dict:
pass
else:# 如果debug没开,则检测是否有needdrop字段
if jso.get('need_drop', False):
logger.info(f"book_name is:{jso['data_source']}/{jso['file_id']} need drop", file=sys.stderr)
logger.info(f"book_name is:{get_data_source(jso)}/{jso['file_id']} need drop", file=sys.stderr)
jso["dropped"] = True
return jso
try:
data_source = jso.get('data_source')
data_source = get_data_source(jso)
file_id = jso.get('file_id')
book_name = data_source + "/" + file_id
title = jso.get('title')
Expand Down Expand Up @@ -195,7 +216,7 @@ def save_tables_to_s3(jso: dict, debug_mode=False) -> dict:

def drop_needdrop_pdf(jso: dict) -> dict:
if jso.get('need_drop', False):
logger.info(f"book_name is:{jso['data_source']}/{jso['file_id']} need drop", file=sys.stderr)
logger.info(f"book_name is:{get_data_source(jso)}/{jso['file_id']} need drop", file=sys.stderr)
jso["dropped"] = True
return jso

Expand All @@ -206,7 +227,7 @@ def pdf_intermediate_dict_to_markdown(jso: dict, debug_mode=False) -> dict:
pass
else:# 如果debug没开,则检测是否有needdrop字段
if jso.get('need_drop', False):
book_name = join_path(jso['data_source'], jso['file_id'])
book_name = join_path(get_data_source(jso), jso['file_id'])
logger.info(f"book_name is:{book_name} need drop", file=sys.stderr)
jso["dropped"] = True
return jso
Expand All @@ -216,7 +237,7 @@ def pdf_intermediate_dict_to_markdown(jso: dict, debug_mode=False) -> dict:
pdf_intermediate_dict = JsonCompressor.decompress_json(pdf_intermediate_dict)
markdown_content = mk_nlp_markdown(pdf_intermediate_dict)
jso["content"] = markdown_content
logger.info(f"book_name is:{jso['data_source']}/{jso['file_id']},markdown content length is {len(markdown_content)}", file=sys.stderr)
logger.info(f"book_name is:{get_data_source(jso)}/{jso['file_id']},markdown content length is {len(markdown_content)}", file=sys.stderr)
# 把无用的信息清空
jso["doc_layout_result"] = ""
jso["pdf_intermediate_dict"] = ""
Expand All @@ -237,7 +258,7 @@ def parse_pdf(jso: dict, start_page_id=0, debug_mode=False) -> dict:
s3_pdf_path = jso.get('file_location')
s3_config = get_s3_config(s3_pdf_path)
model_output_json_list = jso.get('doc_layout_result')
data_source = jso.get('data_source')
data_source = get_data_source(jso)
file_id = jso.get('file_id')
book_name = data_source + "/" + file_id

Expand Down Expand Up @@ -290,5 +311,9 @@ def parse_pdf(jso: dict, start_page_id=0, debug_mode=False) -> dict:
return jso


def ocr_parse_pdf(jso: dict, start_page_id=0, debug_mode=False) -> dict:
    """Placeholder for an OCR-based PDF parsing step; not implemented yet.

    Args:
        jso: Record dictionary for the document (currently unused).
        start_page_id: Page index to start from (currently unused).
        debug_mode: Debug switch (currently unused).

    Returns:
        Currently ``None``: the body is an empty stub.
    """
    # NOTE(review): the signature is annotated ``-> dict`` but the stub
    # returns None; callers must not rely on a dict until this is implemented.
    pass


if __name__ == "__main__":
    # No script entry-point behaviour is defined for this module.
    pass

0 comments on commit 8a52ada

Please sign in to comment.