|
| 1 | +import json |
| 2 | + |
| 3 | +from magic_pdf.libs.boxbase import get_minbox_if_overlap_by_ratio |
| 4 | +from magic_pdf.libs.ocr_dict_merge import merge_spans |
| 5 | + |
| 6 | + |
def read_json_file(file_path):
    """Load and return the JSON document stored at ``file_path``.

    The file is decoded as UTF-8 explicitly instead of relying on the
    platform's locale encoding, so non-ASCII content (e.g. OCR text) is
    read the same way on every system.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The deserialized JSON value.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)
| 11 | + |
| 12 | + |
def construct_page_component(page_id, text_blocks_preproc):
    """Bundle one page's pre-processed blocks with the page index.

    Args:
        page_id: Index of the page; stored under ``'page_idx'``.
        text_blocks_preproc: Blocks built for the page; stored under
            ``'preproc_blocks'`` without being copied.

    Returns:
        A dict with the keys ``'preproc_blocks'`` and ``'page_idx'``.
    """
    return {'preproc_blocks': text_blocks_preproc, 'page_idx': page_id}
| 19 | + |
| 20 | + |
def parse_pdf_by_ocr(
    ocr_json_file_path,
    start_page_id=0,
    end_page_id=None,
):
    """Build per-page block structures from an OCR result JSON file.

    For every page in ``[start_page_id, end_page_id]`` the function collects
    the layout detections of categories 13, 14 and 15 as spans, drops spans
    reported as overlapping by ``get_minbox_if_overlap_by_ratio``, merges the
    remaining spans into lines with ``merge_spans`` and wraps each line in
    its own block.

    Args:
        ocr_json_file_path: Path of the OCR result JSON file. The loaded
            document is indexed by page id, and each page entry must provide
            a ``'layout_dets'`` list whose items carry ``'category_id'``,
            an 8-value ``'poly'`` and ``'latex'`` or ``'text'``.
        start_page_id: Index of the first page to process (inclusive).
        end_page_id: Index of the last page to process (inclusive).
            ``None`` means "up to the last page of the document".

    Returns:
        A dict mapping ``"page_<page_id>"`` to the page component built by
        ``construct_page_component`` (keys ``'preproc_blocks'`` and
        ``'page_idx'``).

    Raises:
        IndexError: If a requested page index is outside the loaded document
            (when the document is a list).
        KeyError: If a page entry or detection lacks one of the expected keys.
    """
    # category_id -> (span type, key of the detection that holds the content)
    # 13: 'embedding' (inline formula)
    # 14: 'isolated'  (single-line formula)
    # 15: 'ocr_text'  (text recognized by OCR)
    span_spec_by_category = {
        13: ('inline_equation', 'latex'),
        14: ('displayed_equation', 'latex'),
        15: ('text', 'text'),
    }

    ocr_pdf_info = read_json_file(ocr_json_file_path)
    pdf_info_dict = {}
    # Test against None explicitly: a plain truthiness check would treat
    # end_page_id=0 as "not given" and process every page instead of page 0.
    if end_page_id is None:
        end_page_id = len(ocr_pdf_info) - 1

    for page_id in range(start_page_id, end_page_id + 1):
        ocr_page_info = ocr_pdf_info[page_id]
        layout_dets = ocr_page_info['layout_dets']
        spans = []
        for layout_det in layout_dets:
            span_spec = span_spec_by_category.get(layout_det['category_id'])
            if span_spec is None:
                # Other categories are not turned into spans.
                continue
            span_type, content_key = span_spec
            # 'poly' holds eight values; the 1st/2nd and 5th/6th are used as
            # the two bbox corners, the remaining four are ignored.
            x0, y0, _, _, x1, y1, _, _ = layout_det['poly']
            span = {
                'bbox': [int(x0), int(y0), int(x1), int(y1)],
            }
            span['content'] = layout_det[content_key]
            span['type'] = span_type
            spans.append(span)

        # Remove overlapping spans. Iterate over snapshots because `spans`
        # is mutated inside the loops.
        # NOTE(review): presumably the helper returns the smaller of the two
        # bboxes when they overlap by more than the given ratio, else None --
        # confirm against magic_pdf.libs.boxbase. Also note that a span that
        # was already removed is still visited as `span1` and can therefore
        # still cause another span to be removed.
        for span1 in spans.copy():
            for span2 in spans.copy():
                if span1 != span2:
                    overlap_box = get_minbox_if_overlap_by_ratio(span1['bbox'], span2['bbox'], 0.8)
                    if overlap_box is not None:
                        # Drop the first remaining span whose bbox equals the returned box.
                        bbox_to_remove = next((span for span in spans if span['bbox'] == overlap_box), None)
                        if bbox_to_remove is not None:
                            spans.remove(bbox_to_remove)

        # Merge the spans into lines.
        lines = merge_spans(spans)

        # Blocks are not stitched together yet; for now every block holds
        # exactly one line and reuses that line's bbox.
        blocks = [
            {
                "bbox": line['bbox'],
                "lines": [line],
            }
            for line in lines
        ]

        # Assemble the page entry of pdf_info_dict.
        page_info = construct_page_component(page_id, blocks)
        pdf_info_dict[f"page_{page_id}"] = page_info

    return pdf_info_dict
| 85 | + |
0 commit comments