|
1 |
| -import os |
2 |
| -import json |
3 |
| -import copy |
4 |
| - |
5 |
| -from loguru import logger |
6 |
| - |
7 |
| -from magic_pdf.libs.draw_bbox import draw_layout_bbox, draw_span_bbox |
8 |
| -from magic_pdf.pipe.UNIPipe import UNIPipe |
9 |
| -from magic_pdf.pipe.OCRPipe import OCRPipe |
10 |
| -from magic_pdf.pipe.TXTPipe import TXTPipe |
11 |
| -from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter |
12 |
| - |
13 |
| - |
14 |
| -# TODO: device type selection (?) |
15 |
| - |
16 |
def json_md_dump(
    pipe,
    md_writer,
    pdf_name,
    content_list,
    md_content,
    orig_model_list,
):
    """Write every parsing artifact of one PDF through ``md_writer``.

    Four files are written, in this order, all prefixed with ``pdf_name``:

    * ``<pdf_name>_model.json``        - ``orig_model_list`` (model output)
    * ``<pdf_name>_middle.json``       - ``pipe.pdf_mid_data`` (intermediate data)
    * ``<pdf_name>_content_list.json`` - ``content_list`` (text content list)
    * ``<pdf_name>.md``                - ``md_content`` written as-is

    The JSON payloads are serialized with ``ensure_ascii=False`` and a
    four-space indent.

    :param pipe: object whose ``pdf_mid_data`` attribute is JSON-serializable
    :param md_writer: writer exposing ``write(content=..., path=...)``
    :param pdf_name: file-name prefix shared by all four outputs
    :param content_list: JSON-serializable content list
    :param md_content: Markdown text
    :param orig_model_list: JSON-serializable model output
    """
    def json_artifacts():
        # A generator keeps each payload lookup lazy, so every artifact is
        # fetched and serialized right before its own write.
        yield '_model.json', orig_model_list
        yield '_middle.json', pipe.pdf_mid_data
        yield '_content_list.json', content_list

    for suffix, payload in json_artifacts():
        md_writer.write(
            content=json.dumps(payload, ensure_ascii=False, indent=4),
            path=f"{pdf_name}{suffix}",
        )

    # The Markdown result is already text and is written unchanged.
    md_writer.write(content=md_content, path=f"{pdf_name}.md")
48 |
| - |
49 |
| - |
50 |
# Visualization
def draw_visualization_bbox(pdf_info, pdf_bytes, local_md_dir, pdf_file_name):
    """Run both bounding-box drawing helpers on a parsed PDF.

    ``draw_layout_bbox`` is called first and ``draw_span_bbox`` second; each
    receives the same four arguments, unchanged.

    :param pdf_info: forwarded as the first argument of both helpers
    :param pdf_bytes: forwarded as the second argument of both helpers
    :param local_md_dir: forwarded as the third argument of both helpers
    :param pdf_file_name: forwarded as the fourth argument of both helpers
    """
    # Layout boxes (including the ordering result) first, then span boxes.
    for render in (draw_layout_bbox, draw_span_bbox):
        render(pdf_info, pdf_bytes, local_md_dir, pdf_file_name)
56 |
| - |
57 |
| - |
58 |
def pdf_parse_main(
        pdf_path: str,
        parse_method: str = 'auto',
        model_json_path: str = None,
        is_json_md_dump: bool = True,
        is_draw_visualization_bbox: bool = True,
        output_dir: str = None
):
    """Convert a PDF into JSON and Markdown results.

    Results go to a folder named after the PDF file, located under
    ``output_dir`` when it is given and next to the PDF otherwise. Any
    ``Exception`` raised while processing is logged with
    ``logger.exception`` and not re-raised.

    :param pdf_path: path to the .pdf file, relative or absolute
    :param parse_method: one of ``auto``, ``ocr`` or ``txt`` (default
        ``auto``); try ``ocr`` if the result is poor
    :param model_json_path: existing model data file for this PDF; when empty
        the built-in model is used. The PDF and the model JSON must match
    :param is_json_md_dump: whether to write the results to files (default
        True): three .json files for the different stages plus one .md file
    :param is_draw_visualization_bbox: whether to call
        ``draw_visualization_bbox`` on the parsed result (default True)
    :param output_dir: directory for the results; a folder named after the
        PDF file is used inside it
    :raises SystemExit: with status 1 when ``parse_method`` is unknown
    """
    try:
        # splitext removes only the final extension, so "report.v2.pdf" maps
        # to "report.v2" instead of being cut at the first dot.
        pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]
        base_dir = output_dir if output_dir else os.path.dirname(pdf_path)
        output_path = os.path.join(base_dir, pdf_name)
        output_image_path = os.path.join(output_path, 'images')

        # Only the last path component is handed on, so image references in
        # the .md and content_list.json stay relative.
        image_path_parent = os.path.basename(output_image_path)

        # Read the PDF as raw bytes; the context manager closes the handle.
        with open(pdf_path, "rb") as pdf_file:
            pdf_bytes = pdf_file.read()

        orig_model_list = []
        if model_json_path:
            # Pre-computed model output for this PDF (the original comment
            # describes it as a list).
            with open(model_json_path, "r", encoding="utf-8") as model_file:
                model_json = json.load(model_file)
            orig_model_list = copy.deepcopy(model_json)
        else:
            model_json = []

        image_writer = DiskReaderWriter(output_image_path)
        md_writer = DiskReaderWriter(output_path)

        # Pick the pipeline that matches the requested parse method.
        if parse_method == "auto":
            jso_useful_key = {"_pdf_type": "", "model_list": model_json}
            pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer)
        elif parse_method == "txt":
            pipe = TXTPipe(pdf_bytes, model_json, image_writer)
        elif parse_method == "ocr":
            pipe = OCRPipe(pdf_bytes, model_json, image_writer)
        else:
            logger.error("unknown parse method, only auto, ocr, txt allowed")
            # Same effect as the site-provided exit(1), without depending on
            # the ``site`` module having injected that helper.
            raise SystemExit(1)

        # Classification step.
        pipe.pipe_classify()

        # No model data was supplied: run the built-in model and keep a copy
        # of its output for the model.json dump.
        if not model_json:
            pipe.pipe_analyze()
            orig_model_list = copy.deepcopy(pipe.model_list)

        # Parsing step.
        pipe.pipe_parse()

        # Build the content list and the Markdown text.
        content_list = pipe.pipe_mk_uni_format(image_path_parent, drop_mode="none")
        md_content = pipe.pipe_mk_markdown(image_path_parent, drop_mode="none")

        if is_json_md_dump:
            json_md_dump(pipe, md_writer, pdf_name, content_list, md_content, orig_model_list)

        if is_draw_visualization_bbox:
            draw_visualization_bbox(pipe.pdf_mid_data['pdf_info'], pdf_bytes, output_path, pdf_name)

    except Exception as e:
        logger.exception(e)
141 |
| - |
142 |
| - |
143 |
# Manual test entry point: parses one demo PDF with the default options.
if __name__ == '__main__':
    pdf_parse_main(r"D:\project\20240617magicpdf\Magic-PDF\demo\demo1.pdf")
| 1 | +import copy |
| 2 | +import json |
| 3 | +import os |
| 4 | + |
| 5 | +from loguru import logger |
| 6 | + |
| 7 | +from magic_pdf.data.data_reader_writer import FileBasedDataWriter |
| 8 | +from magic_pdf.libs.draw_bbox import draw_layout_bbox, draw_span_bbox |
| 9 | +from magic_pdf.pipe.OCRPipe import OCRPipe |
| 10 | +from magic_pdf.pipe.TXTPipe import TXTPipe |
| 11 | +from magic_pdf.pipe.UNIPipe import UNIPipe |
| 12 | + |
| 13 | +# TODO: device type selection (?) |
| 14 | + |
| 15 | + |
def json_md_dump(
    pipe,
    md_writer,
    pdf_name,
    content_list,
    md_content,
    orig_model_list,
):
    """Write every parsing artifact of one PDF through ``md_writer``.

    Four files are written, in this order, all prefixed with ``pdf_name``:

    * ``<pdf_name>_model.json``        - ``orig_model_list`` (model output)
    * ``<pdf_name>_middle.json``       - ``pipe.pdf_mid_data`` (intermediate data)
    * ``<pdf_name>_content_list.json`` - ``content_list`` (text content list)
    * ``<pdf_name>.md``                - ``md_content`` written as-is

    The JSON payloads are serialized with ``ensure_ascii=False`` and a
    four-space indent.

    :param pipe: object whose ``pdf_mid_data`` attribute is JSON-serializable
    :param md_writer: writer exposing ``write_string(path, data)``
    :param pdf_name: file-name prefix shared by all four outputs
    :param content_list: JSON-serializable content list
    :param md_content: Markdown text
    :param orig_model_list: JSON-serializable model output
    """
    def json_artifacts():
        # A generator keeps each payload lookup lazy, so every artifact is
        # fetched and serialized right before its own write.
        yield '_model.json', orig_model_list
        yield '_middle.json', pipe.pdf_mid_data
        yield '_content_list.json', content_list

    for suffix, payload in json_artifacts():
        md_writer.write_string(
            f'{pdf_name}{suffix}',
            json.dumps(payload, ensure_ascii=False, indent=4),
        )

    # The Markdown result is already text and is written unchanged.
    md_writer.write_string(f'{pdf_name}.md', md_content)
| 48 | + |
| 49 | + |
# Visualization
def draw_visualization_bbox(pdf_info, pdf_bytes, local_md_dir, pdf_file_name):
    """Run both bounding-box drawing helpers on a parsed PDF.

    ``draw_layout_bbox`` is called first and ``draw_span_bbox`` second; each
    receives the same four arguments, unchanged.

    :param pdf_info: forwarded as the first argument of both helpers
    :param pdf_bytes: forwarded as the second argument of both helpers
    :param local_md_dir: forwarded as the third argument of both helpers
    :param pdf_file_name: forwarded as the fourth argument of both helpers
    """
    # Layout boxes (including the ordering result) first, then span boxes.
    for render in (draw_layout_bbox, draw_span_bbox):
        render(pdf_info, pdf_bytes, local_md_dir, pdf_file_name)
| 56 | + |
| 57 | + |
def pdf_parse_main(
    pdf_path: str,
    parse_method: str = 'auto',
    model_json_path: str = None,
    is_json_md_dump: bool = True,
    is_draw_visualization_bbox: bool = True,
    output_dir: str = None
):
    """Convert a PDF into JSON and Markdown results.

    Results go to a folder named after the PDF file, located under
    ``output_dir`` when it is given and next to the PDF otherwise. Any
    ``Exception`` raised while processing is logged with
    ``logger.exception`` and not re-raised.

    :param pdf_path: path to the .pdf file, relative or absolute
    :param parse_method: one of ``auto``, ``ocr`` or ``txt`` (default
        ``auto``); try ``ocr`` if the result is poor
    :param model_json_path: existing model data file for this PDF; when empty
        the built-in model is used. The PDF and the model JSON must match
    :param is_json_md_dump: whether to write the results to files (default
        True): three .json files for the different stages plus one .md file
    :param is_draw_visualization_bbox: whether to draw the visualization
        bounding boxes (default True): layout boxes and span boxes
    :param output_dir: directory for the results; a folder named after the
        PDF file is used inside it
    :raises SystemExit: with status 1 when ``parse_method`` is unknown
    """
    try:
        # splitext removes only the final extension, so "report.v2.pdf" maps
        # to "report.v2" instead of being cut at the first dot.
        pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]
        base_dir = output_dir if output_dir else os.path.dirname(pdf_path)
        output_path = os.path.join(base_dir, pdf_name)
        output_image_path = os.path.join(output_path, 'images')

        # Only the last path component is handed on, so image references in
        # the .md and content_list.json stay relative.
        image_path_parent = os.path.basename(output_image_path)

        # Read the PDF as raw bytes; the context manager closes the handle.
        with open(pdf_path, 'rb') as pdf_file:
            pdf_bytes = pdf_file.read()

        orig_model_list = []
        if model_json_path:
            # Pre-computed model output for this PDF (the original comment
            # describes it as a list).
            with open(model_json_path, 'r', encoding='utf-8') as model_file:
                model_json = json.load(model_file)
            orig_model_list = copy.deepcopy(model_json)
        else:
            model_json = []

        image_writer = FileBasedDataWriter(output_image_path)
        md_writer = FileBasedDataWriter(output_path)

        # Pick the pipeline that matches the requested parse method.
        if parse_method == 'auto':
            jso_useful_key = {'_pdf_type': '', 'model_list': model_json}
            pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer)
        elif parse_method == 'txt':
            pipe = TXTPipe(pdf_bytes, model_json, image_writer)
        elif parse_method == 'ocr':
            pipe = OCRPipe(pdf_bytes, model_json, image_writer)
        else:
            logger.error('unknown parse method, only auto, ocr, txt allowed')
            # Same effect as the site-provided exit(1), without depending on
            # the ``site`` module having injected that helper.
            raise SystemExit(1)

        # Classification step.
        pipe.pipe_classify()

        # No model data was supplied: run the built-in model and keep a copy
        # of its output for the model.json dump.
        if not model_json:
            pipe.pipe_analyze()
            orig_model_list = copy.deepcopy(pipe.model_list)

        # Parsing step.
        pipe.pipe_parse()

        # Build the content list and the Markdown text.
        content_list = pipe.pipe_mk_uni_format(image_path_parent, drop_mode='none')
        md_content = pipe.pipe_mk_markdown(image_path_parent, drop_mode='none')

        if is_json_md_dump:
            json_md_dump(pipe, md_writer, pdf_name, content_list, md_content, orig_model_list)

        if is_draw_visualization_bbox:
            draw_visualization_bbox(pipe.pdf_mid_data['pdf_info'], pdf_bytes, output_path, pdf_name)

    except Exception as e:
        logger.exception(e)
| 138 | + |
| 139 | + |
# Manual test entry point: parses the demo PDFs that sit next to this script.
if __name__ == '__main__':
    script_dir = os.path.dirname(os.path.abspath(__file__))
    for stem in ('demo1', 'demo2', 'small_ocr'):
        pdf_parse_main(os.path.join(script_dir, f'{stem}.pdf'))
0 commit comments