Skip to content

Commit

Permalink
book_name生成逻辑更新
Browse files Browse the repository at this point in the history
  • Loading branch information
myhloli committed Mar 15, 2024
1 parent 8486793 commit b1ac8d0
Showing 1 changed file with 5 additions and 5 deletions.
10 changes: 5 additions & 5 deletions magic_pdf/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ def meta_scan(jso: dict, doc_layout_check=True) -> dict:
try:
data_source = get_data_source(jso)
file_id = jso.get('file_id')
- book_name = data_source + "/" + file_id
+ book_name = f"{data_source}/{file_id}"

# 首页存在超量drawing问题
# special_pdf_list = ['zlib/zlib_21822650']
Expand Down Expand Up @@ -103,7 +103,7 @@ def classify_by_type(jso: dict, debug_mode=False) -> dict:
pdf_meta = jso.get('pdf_meta')
data_source = get_data_source(jso)
file_id = jso.get('file_id')
- book_name = data_source + "/" + file_id
+ book_name = f"{data_source}/{file_id}"
total_page = pdf_meta["total_page"]
page_width = pdf_meta["page_width_pts"]
page_height = pdf_meta["page_height_pts"]
Expand Down Expand Up @@ -169,7 +169,7 @@ def save_tables_to_s3(jso: dict, debug_mode=False) -> dict:
try:
data_source = get_data_source(jso)
file_id = jso.get('file_id')
- book_name = data_source + "/" + file_id
+ book_name = f"{data_source}/{file_id}"
title = jso.get('title')
url_encode_title = quote(title, safe='')
if data_source != 'scihub':
Expand Down Expand Up @@ -262,7 +262,7 @@ def parse_pdf(jso: dict, start_page_id=0, debug_mode=False) -> dict:
model_output_json_list = jso.get('doc_layout_result')
data_source = get_data_source(jso)
file_id = jso.get('file_id')
- book_name = data_source + "/" + file_id
+ book_name = f"{data_source}/{file_id}"

# 1.23.22已修复
# if debug_mode:
Expand Down Expand Up @@ -326,7 +326,7 @@ def ocr_parse_pdf(jso: dict, start_page_id=0, debug_mode=False) -> dict:
model_output_json_list = jso.get('doc_layout_result')
data_source = get_data_source(jso)
file_id = jso.get('file_id')
- book_name = data_source + "/" + file_id
+ book_name = f"{data_source}/{file_id}"
try:
save_path = "s3://mllm-raw-media/pdf2md_img/"
image_s3_config = get_s3_config(save_path)
Expand Down

0 comments on commit b1ac8d0

Please sign in to comment.