|
4 | 4 | from loguru import logger
|
5 | 5 | from pathlib import Path
|
6 | 6 |
|
| 7 | +from app.common.s3 import get_s3_config |
7 | 8 | from demo.demo_test import get_json_from_local_or_s3
|
8 |
| -from magic_pdf.dict2md.ocr_mkcontent import ocr_mk_mm_markdown_with_para, ocr_mk_nlp_markdown, ocr_mk_mm_markdown, ocr_mk_mm_standard_format |
| 9 | +from magic_pdf.dict2md.ocr_mkcontent import ( |
| 10 | + ocr_mk_mm_markdown_with_para, |
| 11 | + ocr_mk_nlp_markdown, |
| 12 | + ocr_mk_mm_markdown, |
| 13 | + ocr_mk_mm_standard_format, |
| 14 | + ocr_mk_mm_markdown_with_para_and_pagination, |
| 15 | + make_standard_format_with_para |
| 16 | +) |
9 | 17 | from magic_pdf.libs.commons import join_path
|
10 | 18 | from magic_pdf.pdf_parse_by_ocr import parse_pdf_by_ocr
|
11 | 19 |
|
@@ -35,50 +43,59 @@ def ocr_local_parse(ocr_pdf_path, ocr_json_file_path):
|
35 | 43 | ocr_pdf_model_info = read_json_file(ocr_json_file_path)
|
36 | 44 | pth = Path(ocr_json_file_path)
|
37 | 45 | book_name = pth.name
|
38 |
| - save_tmp_path = os.path.join(os.path.dirname(__file__), "../..", "tmp", "unittest") |
39 |
| - save_path = join_path(save_tmp_path, "md") |
40 |
| - save_path_with_bookname = os.path.join(save_path, book_name) |
41 |
| - text_content_save_path = f"{save_path_with_bookname}/book.md" |
42 |
| - pdf_info_dict = parse_pdf_by_ocr( |
43 |
| - ocr_pdf_path, |
44 |
| - None, |
45 |
| - ocr_pdf_model_info, |
46 |
| - save_path, |
47 |
| - book_name, |
48 |
| - debug_mode=True) |
49 |
| - |
50 |
| - parent_dir = os.path.dirname(text_content_save_path) |
51 |
| - if not os.path.exists(parent_dir): |
52 |
| - os.makedirs(parent_dir) |
53 |
| - |
54 |
| - # markdown_content = mk_nlp_markdown(pdf_info_dict) |
55 |
| - markdown_content = ocr_mk_mm_markdown_with_para(pdf_info_dict) |
56 |
| - |
57 |
| - with open(text_content_save_path, "w", encoding="utf-8") as f: |
58 |
| - f.write(markdown_content) |
59 |
| - |
60 |
| - standard_format = ocr_mk_mm_standard_format(pdf_info_dict) |
61 |
| - standard_format_save_path = f"{save_path_with_bookname}/standard_format.txt" |
62 |
| - with open(standard_format_save_path, "w", encoding="utf-8") as f: |
63 |
| - f.write(str(standard_format)) |
64 |
| - |
65 |
| - # logger.info(markdown_content) |
66 |
| - # save_markdown(markdown_text, ocr_json_file_path) |
| 46 | + ocr_parse_core(book_name, ocr_pdf_path, ocr_pdf_model_info) |
67 | 47 | except Exception as e:
|
68 | 48 | logger.exception(e)
|
69 | 49 |
|
70 | 50 |
|
def ocr_online_parse(book_name, start_page_id=0, debug_mode=True):
    """Look up a book's OCR record and run the OCR parsing pipeline on it.

    The record is fetched with ``get_json_from_local_or_s3``. Its
    ``file_location`` entry is used as the PDF path (and to derive the S3
    config), and its ``doc_layout_result`` entry is passed on as the OCR
    model output. Any ``Exception`` raised along the way is logged with its
    traceback and swallowed, so callers get no error signal.

    Args:
        book_name: Identifier handed to the lookup helper and to
            ``ocr_parse_core``.
        start_page_id: Forwarded to ``ocr_parse_core``. It used to be
            dropped silently even though the core function declares it.
        debug_mode: Kept for backward compatibility; currently unused
            because ``ocr_parse_core`` has no matching parameter.
    """
    try:
        json_object = get_json_from_local_or_s3(book_name)
        # A missing "file_location" raises KeyError, which is logged below.
        s3_pdf_path = json_object["file_location"]
        s3_config = get_s3_config(s3_pdf_path)
        # .get() keeps the original leniency: a missing layout result is
        # passed through as None.
        ocr_pdf_model_info = json_object.get("doc_layout_result")
        ocr_parse_core(
            book_name,
            s3_pdf_path,
            ocr_pdf_model_info,
            start_page_id=start_page_id,
            s3_config=s3_config,
        )
    except Exception as e:
        logger.exception(e)
| 61 | + |
| 62 | + |
def ocr_parse_core(book_name, ocr_pdf_path, ocr_pdf_model_info, start_page_id=0, s3_config=None):
    """Run the OCR parser on one PDF and write its markdown and standard-format output.

    Two files are written into ``os.path.join(save_path, book_name)``, where
    ``save_path`` is the ``md`` folder under ``../../tmp/unittest`` relative
    to this file:

    * ``book.md`` - output of ``ocr_mk_mm_markdown_with_para``.
    * ``standard_format.txt`` - output of ``make_standard_format_with_para``
      serialized as JSON text (non-ASCII characters kept as-is).

    Args:
        book_name: Output sub-directory name; also passed to the parser.
        ocr_pdf_path: PDF location handed to ``parse_pdf_by_ocr``.
        ocr_pdf_model_info: OCR model output handed to ``parse_pdf_by_ocr``.
        start_page_id: Currently unused by this function.
            NOTE(review): presumably meant to be forwarded to
            ``parse_pdf_by_ocr`` - confirm against its signature.
        s3_config: Passed to ``parse_pdf_by_ocr`` as its second positional
            argument; ``None`` for local files.
    """
    save_tmp_path = os.path.join(os.path.dirname(__file__), "../..", "tmp", "unittest")
    save_path = join_path(save_tmp_path, "md")
    book_dir = os.path.join(save_path, book_name)

    pdf_info_dict = parse_pdf_by_ocr(
        ocr_pdf_path,
        s3_config,
        ocr_pdf_model_info,
        save_path,
        book_name,
        debug_mode=True,
    )

    # exist_ok avoids the check-then-create race of the exists()/makedirs pair.
    os.makedirs(book_dir, exist_ok=True)

    markdown_content = ocr_mk_mm_markdown_with_para(pdf_info_dict)
    with open(os.path.join(book_dir, "book.md"), "w", encoding="utf-8") as f:
        f.write(markdown_content)

    standard_format = make_standard_format_with_para(pdf_info_dict)
    with open(os.path.join(book_dir, "standard_format.txt"), "w", encoding="utf-8") as f:
        # Dump standard_format as JSON text and save it.
        f.write(json.dumps(standard_format, ensure_ascii=False))
| 92 | + |
74 | 93 |
|
if __name__ == '__main__':
    # Online mode: parse the book identified by this name.
    book_name = "科数网/edu_00011318"

    # Inputs for local mode; only needed if the ocr_local_parse call below
    # is re-enabled.
    pdf_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/j.1540-627x.2006.00176.x.pdf"
    json_file_path = r"/home/cxu/workspace/Magic-PDF/ocr_demo/j.1540-627x.2006.00176.x.json"
    # ocr_local_parse(pdf_path, json_file_path)

    ocr_online_parse(book_name)
|
0 commit comments