-
Notifications
You must be signed in to change notification settings - Fork 2.1k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
a343175
commit 7162deb
Showing
3 changed files
with
59 additions
and
14 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
""" | ||
文本型pdf转化为统一清洗格式 | ||
""" | ||
|
||
|
||
|
||
from loguru import logger | ||
from magic_pdf.dict2md.mkcontent import mk_universal_format | ||
from magic_pdf.libs.commons import join_path | ||
from magic_pdf.libs.json_compressor import JsonCompressor | ||
from magic_pdf.spark.base import exception_handler, get_data_source | ||
|
||
|
||
def txt_pdf_to_standard_format(jso: dict, debug_mode=False) -> dict:
    """Convert a text-based PDF record into the unified cleaned format.

    Unless ``debug_mode`` is on, a record whose ``need_drop`` flag is truthy
    is logged, marked with ``dropped = True`` and returned unconverted.
    Otherwise ``jso["pdf_intermediate_dict"]`` is decompressed, passed to
    ``mk_universal_format`` and the result is stored under ``content_list``;
    the fields that are no longer needed are then reset to empty strings.
    Any exception raised during conversion is handed to ``exception_handler``.

    Args:
        jso: Record dict; it is modified in place.
        debug_mode: When true, the ``need_drop`` check is skipped.

    Returns:
        The record, or whatever ``exception_handler`` returns on failure.
    """
    # The need_drop flag is only honoured when debugging is switched off.
    if not debug_mode and jso.get("need_drop", False):
        book_name = join_path(get_data_source(jso), jso["file_id"])
        logger.info(f"book_name is:{book_name} need drop")
        jso["dropped"] = True
        return jso

    try:
        # pdf_intermediate_dict is stored compressed; decompress it first.
        intermediate = JsonCompressor.decompress_json(jso["pdf_intermediate_dict"])
        content_list = mk_universal_format(intermediate)
        jso["content_list"] = content_list
        logger.info(
            f"book_name is:{get_data_source(jso)}/{jso['file_id']},content_list length is {len(content_list)}"
        )
        # Clear out the information that is no longer useful.
        for stale_key in ("doc_layout_result", "pdf_intermediate_dict", "pdf_meta"):
            jso[stale_key] = ""
    except Exception as e:
        jso = exception_handler(jso, e)
    return jso
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
|
||
|
||
from loguru import logger | ||
|
||
from magic_pdf.libs.drop_reason import DropReason | ||
|
||
|
||
def get_data_source(jso: dict):
    """Return the record's data source, falling back to ``file_source``.

    Looks up ``jso["data_source"]``; when that key is missing or its value
    is ``None``, the value of ``jso.get("file_source")`` is returned instead
    (which may itself be ``None``).
    """
    source = jso.get("data_source")
    if source is not None:
        return source
    return jso.get("file_source")
|
||
|
||
def exception_handler(jso: dict, e):
    """Log an exception and flag the record as one to be dropped.

    The exception is logged via ``logger.exception``; the record then gets
    ``need_drop = True``, ``drop_reason = DropReason.Exception`` and an
    ``exception`` field holding ``"ERROR: "`` followed by the exception text.

    Args:
        jso: Record dict; it is modified in place.
        e: The exception that was caught.

    Returns:
        The same ``jso`` object.
    """
    logger.exception(e)
    # Keys are written in the same order as three separate assignments would.
    jso.update(
        need_drop=True,
        drop_reason=DropReason.Exception,
        exception=f"ERROR: {e}",
    )
    return jso
|