Skip to content

Commit 158e556

Browse files
authored
Merge pull request #1063 from opendatalab/release-0.10.0
Release 0.10.0
2 parents 038f48d + 30be501 commit 158e556

File tree

110 files changed

+25715
-2268
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

110 files changed

+25715
-2268
lines changed

.github/workflows/cli.yml

-7
Original file line numberDiff line numberDiff line change
@@ -3,13 +3,6 @@
33

44
name: mineru
55
on:
6-
push:
7-
branches:
8-
- "master"
9-
- "dev"
10-
paths-ignore:
11-
- "cmds/**"
12-
- "**.md"
136
pull_request:
147
branches:
158
- "master"

.github/workflows/daily.yml

+1
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ jobs:
2020
source activate mineru
2121
conda env list
2222
pip show coverage
23+
git checkout "dev"
2324
# cd $GITHUB_WORKSPACE && sh tests/retry_env.sh
2425
cd $GITHUB_WORKSPACE && python tests/clean_coverage.py
2526
cd $GITHUB_WORKSPACE && coverage run -m pytest tests/unittest/ --cov=magic_pdf/ --cov-report html --cov-report term-missing

.github/workflows/huigui.yml

-1
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@ on:
1010
paths-ignore:
1111
- "cmds/**"
1212
- "**.md"
13-
workflow_dispatch:
1413
jobs:
1514
cli-test:
1615
if: github.repository == 'opendatalab/MinerU'

README.md

+3
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,9 @@
4242
</div>
4343

4444
# Changelog
45+
- 2024/11/22 0.10.0 released. Introducing hybrid OCR text extraction capabilities:
46+
- Significantly improved parsing performance in complex text distribution scenarios such as dense formulas, irregular span regions, and text represented by images.
47+
- Combines the dual advantages of accurate content extraction and faster speed in text mode, and more precise span/line region recognition in OCR mode.
4548
- 2024/11/15 0.9.3 released. Integrated [RapidTable](https://github.com/RapidAI/RapidTable) for table recognition, improving single-table parsing speed by more than 10 times, with higher accuracy and lower GPU memory usage.
4649
- 2024/11/06 0.9.2 released. Integrated the [StructTable-InternVL2-1B](https://huggingface.co/U4R/StructTable-InternVL2-1B) model for table recognition functionality.
4750
- 2024/10/31 0.9.0 released. This is a major new version with extensive code refactoring, addressing numerous issues, improving performance, reducing hardware requirements, and enhancing usability:

README_zh-CN.md

+3
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,9 @@
4242
</div>
4343

4444
# 更新记录
45+
- 2024/11/22 0.10.0发布,通过引入混合OCR文本提取能力,
46+
- 在公式密集、span区域不规范、部分文本使用图像表现等复杂文本分布场景下获得解析效果的显著提升
47+
- 同时具备文本模式内容提取准确、速度更快与OCR模式span/line区域识别更准的双重优势
4548
- 2024/11/15 0.9.3发布,为表格识别功能接入了[RapidTable](https://github.com/RapidAI/RapidTable),单表解析速度提升10倍以上,准确率更高,显存占用更低
4649
- 2024/11/06 0.9.2发布,为表格识别功能接入了[StructTable-InternVL2-1B](https://huggingface.co/U4R/StructTable-InternVL2-1B)模型
4750
- 2024/10/31 0.9.0发布,这是我们进行了大量代码重构的全新版本,解决了众多问题,提升了性能,降低了硬件需求,并提供了更丰富的易用性:

magic_pdf/config/constants.py

+53
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
"""span维度自定义字段."""
2+
# whether the span was merged across pages
3+
CROSS_PAGE = 'cross_page'
4+
5+
"""
6+
block维度自定义字段
7+
"""
8+
# whether the lines in the block have been deleted
9+
LINES_DELETED = 'lines_deleted'
10+
11+
# table recognition max time default value
12+
TABLE_MAX_TIME_VALUE = 400
13+
14+
# pp_table_result_max_length
15+
TABLE_MAX_LEN = 480
16+
17+
# table master structure dict
18+
TABLE_MASTER_DICT = 'table_master_structure_dict.txt'
19+
20+
# table master dir
21+
TABLE_MASTER_DIR = 'table_structure_tablemaster_infer/'
22+
23+
# pp detect model dir
24+
DETECT_MODEL_DIR = 'ch_PP-OCRv4_det_infer'
25+
26+
# pp rec model dir
27+
REC_MODEL_DIR = 'ch_PP-OCRv4_rec_infer'
28+
29+
# pp rec char dict path
30+
REC_CHAR_DICT = 'ppocr_keys_v1.txt'
31+
32+
# pp rec copy rec directory
33+
PP_REC_DIRECTORY = '.paddleocr/whl/rec/ch/ch_PP-OCRv4_rec_infer'
34+
35+
# pp rec copy det directory
36+
PP_DET_DIRECTORY = '.paddleocr/whl/det/ch/ch_PP-OCRv4_det_infer'
37+
38+
39+
class MODEL_NAME:
40+
# pp table structure algorithm
41+
TABLE_MASTER = 'tablemaster'
42+
# struct eqtable
43+
STRUCT_EQTABLE = 'struct_eqtable'
44+
45+
DocLayout_YOLO = 'doclayout_yolo'
46+
47+
LAYOUTLMv3 = 'layoutlmv3'
48+
49+
YOLO_V8_MFD = 'yolo_v8_mfd'
50+
51+
UniMerNet_v2_Small = 'unimernet_small'
52+
53+
RAPID_TABLE = 'rapid_table'

magic_pdf/config/drop_reason.py

+35
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
class DropReason:
2+
TEXT_BLCOK_HOR_OVERLAP = 'text_block_horizontal_overlap'  # text blocks overlap horizontally, so the text order cannot be determined accurately (NOTE(review): identifier is spelled "BLCOK" while the value says "block")
3+
USEFUL_BLOCK_HOR_OVERLAP = (
4+
'useful_block_horizontal_overlap'  # blocks that must be kept overlap horizontally
5+
)
6+
COMPLICATED_LAYOUT = 'complicated_layout'  # complicated layout, not supported for now
7+
TOO_MANY_LAYOUT_COLUMNS = 'too_many_layout_columns'  # layouts with more than 2 columns are not currently supported
8+
COLOR_BACKGROUND_TEXT_BOX = 'color_background_text_box'  # PDF contains colored blocks, which change the reading order; PDFs with text blocks on a colored background are not currently supported
9+
HIGH_COMPUTATIONAL_lOAD_BY_IMGS = (
10+
'high_computational_load_by_imgs'  # contains special images; too computationally expensive, so dropped (NOTE(review): identifier has a lowercase "l" in "lOAD")
11+
)
12+
HIGH_COMPUTATIONAL_lOAD_BY_SVGS = (
13+
'high_computational_load_by_svgs'  # special SVG graphics; too computationally expensive, so dropped
14+
)
15+
HIGH_COMPUTATIONAL_lOAD_BY_TOTAL_PAGES = 'high_computational_load_by_total_pages'  # computational load exceeds capacity; too costly with the current method
16+
MISS_DOC_LAYOUT_RESULT = 'missing doc_layout_result'  # layout analysis failed
17+
Exception = '_exception'  # an exception occurred during parsing (NOTE(review): attribute name matches the builtin Exception)
18+
ENCRYPTED = 'encrypted'  # PDF is encrypted
19+
EMPTY_PDF = 'total_page=0'  # PDF has zero pages in total
20+
NOT_IS_TEXT_PDF = 'not_is_text_pdf'  # not a text-based PDF; cannot be parsed directly
21+
DENSE_SINGLE_LINE_BLOCK = 'dense_single_line_block'  # cannot be cleanly split into paragraphs
22+
TITLE_DETECTION_FAILED = 'title_detection_failed'  # title detection failed
23+
TITLE_LEVEL_FAILED = (
24+
'title_level_failed'  # failed to analyze title levels (e.g. level-1, level-2, level-3 headings)
25+
)
26+
PARA_SPLIT_FAILED = 'para_split_failed'  # paragraph recognition failed
27+
PARA_MERGE_FAILED = 'para_merge_failed'  # paragraph merging failed
28+
NOT_ALLOW_LANGUAGE = 'not_allow_language'  # unsupported language
29+
SPECIAL_PDF = 'special_pdf'
30+
PSEUDO_SINGLE_COLUMN = 'pseudo_single_column'  # cannot precisely determine the text columns
31+
CAN_NOT_DETECT_PAGE_LAYOUT = 'can_not_detect_page_layout'  # cannot analyze the page layout
32+
NEGATIVE_BBOX_AREA = 'negative_bbox_area'  # scaling produced a negative bbox area
33+
OVERLAP_BLOCKS_CAN_NOT_SEPARATION = (
34+
'overlap_blocks_can_t_separation'  # overlapping blocks cannot be separated
35+
)

magic_pdf/config/drop_tag.py

+19
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
2+
COLOR_BG_HEADER_TXT_BLOCK = 'color_background_header_txt_block'
3+
PAGE_NO = 'page-no'  # page number
4+
CONTENT_IN_FOOT_OR_HEADER = 'in-foot-header-area'  # text inside the header/footer area
5+
VERTICAL_TEXT = 'vertical-text'  # vertical text
6+
ROTATE_TEXT = 'rotate-text'  # rotated text
7+
EMPTY_SIDE_BLOCK = 'empty-side-block'  # blank block on the edge with no content at all
8+
ON_IMAGE_TEXT = 'on-image-text'  # text located on an image
9+
ON_TABLE_TEXT = 'on-table-text'  # text located on a table
10+
11+
12+
class DropTag:
13+
PAGE_NUMBER = 'page_no'
14+
HEADER = 'header'
15+
FOOTER = 'footer'
16+
FOOTNOTE = 'footnote'
17+
NOT_IN_LAYOUT = 'not_in_layout'
18+
SPAN_OVERLAP = 'span_overlap'
19+
BLOCK_OVERLAP = 'block_overlap'
+11
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
class MakeMode:
2+
MM_MD = 'mm_markdown'
3+
NLP_MD = 'nlp_markdown'
4+
STANDARD_FORMAT = 'standard_format'
5+
6+
7+
class DropMode:
8+
WHOLE_PDF = 'whole_pdf'
9+
SINGLE_PAGE = 'single_page'
10+
NONE = 'none'
11+
NONE_WITH_REASON = 'none_with_reason'
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,10 @@
11
from enum import Enum
22

3+
34
class ModelBlockTypeEnum(Enum):
45
TITLE = 0
56
PLAIN_TEXT = 1
67
ABANDON = 2
78
ISOLATE_FORMULA = 8
89
EMBEDDING = 13
9-
ISOLATED = 14
10+
ISOLATED = 14

magic_pdf/data/read_api.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ def read_jsonl(
3535
jsonl_d = [
3636
json.loads(line) for line in jsonl_bits.decode().split('\n') if line.strip()
3737
]
38-
for d in jsonl_d[:5]:
38+
for d in jsonl_d:
3939
pdf_path = d.get('file_location', '') or d.get('path', '')
4040
if len(pdf_path) == 0:
4141
raise EmptyData('pdf file location is empty')

0 commit comments

Comments
 (0)