|
class DropReason:
    """String codes naming the reason a PDF was dropped.

    Every attribute is a plain ``str`` constant. Some of the original
    attribute names contain typos (``BLCOK``, ``lOAD``) or shadow a builtin
    (``Exception``); correctly spelled names are provided alongside them and
    the legacy names are kept as aliases bound to the same string so that
    existing callers keep working.
    """

    # Text blocks overlap horizontally, so the text order cannot be
    # determined accurately.
    TEXT_BLOCK_HOR_OVERLAP = 'text_block_horizontal_overlap'
    TEXT_BLCOK_HOR_OVERLAP = TEXT_BLOCK_HOR_OVERLAP  # legacy misspelled alias

    # Blocks that must be kept overlap horizontally.
    USEFUL_BLOCK_HOR_OVERLAP = 'useful_block_horizontal_overlap'

    # Complicated layout; not supported for now.
    COMPLICATED_LAYOUT = 'complicated_layout'

    # Layouts with more than 2 columns are currently not supported.
    TOO_MANY_LAYOUT_COLUMNS = 'too_many_layout_columns'

    # The PDF contains colored blocks, which change the reading order; PDFs
    # with text blocks on a colored background are currently not supported.
    COLOR_BACKGROUND_TEXT_BOX = 'color_background_text_box'

    # Contains special images; the computation is too heavy, so it is dropped.
    HIGH_COMPUTATIONAL_LOAD_BY_IMGS = 'high_computational_load_by_imgs'
    HIGH_COMPUTATIONAL_lOAD_BY_IMGS = HIGH_COMPUTATIONAL_LOAD_BY_IMGS  # legacy alias

    # Special SVG graphics; the computation is too heavy, so it is dropped.
    HIGH_COMPUTATIONAL_LOAD_BY_SVGS = 'high_computational_load_by_svgs'
    HIGH_COMPUTATIONAL_lOAD_BY_SVGS = HIGH_COMPUTATIONAL_LOAD_BY_SVGS  # legacy alias

    # The computation exceeds capacity; the current method consumes too much.
    HIGH_COMPUTATIONAL_LOAD_BY_TOTAL_PAGES = 'high_computational_load_by_total_pages'
    HIGH_COMPUTATIONAL_lOAD_BY_TOTAL_PAGES = (
        HIGH_COMPUTATIONAL_LOAD_BY_TOTAL_PAGES  # legacy alias
    )

    # Layout analysis failed.
    MISS_DOC_LAYOUT_RESULT = 'missing doc_layout_result'

    # An exception occurred during parsing.
    EXCEPTION = '_exception'
    # Legacy name; it shadows the builtin ``Exception`` inside this class
    # body, so prefer ``EXCEPTION`` in new code.
    Exception = EXCEPTION

    # The PDF is encrypted.
    ENCRYPTED = 'encrypted'

    # The total page count of the PDF is 0.
    EMPTY_PDF = 'total_page=0'

    # Not a text-based PDF; it cannot be parsed directly.
    NOT_IS_TEXT_PDF = 'not_is_text_pdf'

    # Paragraphs cannot be split clearly.
    DENSE_SINGLE_LINE_BLOCK = 'dense_single_line_block'

    # Title detection failed.
    TITLE_DETECTION_FAILED = 'title_detection_failed'

    # Failed to analyze title levels (e.g. level 1, 2, 3 headings).
    TITLE_LEVEL_FAILED = 'title_level_failed'

    # Paragraph recognition failed.
    PARA_SPLIT_FAILED = 'para_split_failed'

    # Paragraph merging failed.
    PARA_MERGE_FAILED = 'para_merge_failed'

    # Unsupported language.
    NOT_ALLOW_LANGUAGE = 'not_allow_language'

    # NOTE(review): the original gives no explanation for this reason code.
    SPECIAL_PDF = 'special_pdf'

    # Text columns cannot be determined precisely.
    PSEUDO_SINGLE_COLUMN = 'pseudo_single_column'

    # The page layout cannot be analyzed.
    CAN_NOT_DETECT_PAGE_LAYOUT = 'can_not_detect_page_layout'

    # Scaling resulted in a negative bbox area.
    NEGATIVE_BBOX_AREA = 'negative_bbox_area'

    # Overlapping blocks cannot be separated.
    OVERLAP_BLOCKS_CAN_NOT_SEPARATION = 'overlap_blocks_can_t_separation'
0 commit comments