Skip to content

Commit 3a42ebb

Browse files
authored
Merge pull request #838 from opendatalab/release-0.9.0
Release 0.9.0
2 parents 765c6d7 + 1402479 commit 3a42ebb

File tree

591 files changed

+98911
-951
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

591 files changed

+98911
-951
lines changed

.github/workflows/cla.yml

+1-1
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ jobs:
2929
path-to-document: 'https://github.com/opendatalab/MinerU/blob/master/MinerU_CLA.md' # e.g. a CLA or a DCO document
3030
# branch should not be protected
3131
branch: 'master'
32-
allowlist: myhloli,dt-yy,Focusshang,renpengli01,icecraft,drunkpig,wangbinDL,qiangqiang199,GDDGCZ518,papayalove,conghui,quyuan
32+
allowlist: myhloli,dt-yy,Focusshang,renpengli01,icecraft,drunkpig,wangbinDL,qiangqiang199,GDDGCZ518,papayalove,conghui,quyuan,LollipopsAndWine
3333

3434
# the following are optional inputs - if an optional input is not given, its default value will be used
3535
#remote-organization-name: enter the remote organization name where the signatures should be stored (Default is storing the signatures in the same repository)

.github/workflows/cli.yml

+9-11
Original file line numberDiff line numberDiff line change
@@ -10,20 +10,18 @@ on:
1010
paths-ignore:
1111
- "cmds/**"
1212
- "**.md"
13-
- "**.yml"
1413
pull_request:
1514
branches:
1615
- "master"
1716
- "dev"
1817
paths-ignore:
1918
- "cmds/**"
2019
- "**.md"
21-
- "**.yml"
2220
workflow_dispatch:
2321
jobs:
2422
cli-test:
2523
runs-on: pdf
26-
timeout-minutes: 120
24+
timeout-minutes: 240
2725
strategy:
2826
fail-fast: true
2927

@@ -33,16 +31,16 @@ jobs:
3331
with:
3432
fetch-depth: 2
3533

36-
- name: install
34+
- name: install&test
3735
run: |
38-
echo $GITHUB_WORKSPACE && sh tests/retry_env.sh
39-
- name: unit test
40-
run: |
41-
cd $GITHUB_WORKSPACE && export PYTHONPATH=. && coverage run -m pytest tests/test_unit.py --cov=magic_pdf/ --cov-report term-missing --cov-report html
36+
source activate mineru
37+
conda env list
38+
pip show coverage
39+
# cd $GITHUB_WORKSPACE && sh tests/retry_env.sh
40+
cd $GITHUB_WORKSPACE && python tests/clean_coverage.py
41+
cd $GITHUB_WORKSPACE && coverage run -m pytest tests/unittest/ --cov=magic_pdf/ --cov-report html --cov-report term-missing
4242
cd $GITHUB_WORKSPACE && python tests/get_coverage.py
43-
- name: cli test
44-
run: |
45-
cd $GITHUB_WORKSPACE && pytest -s -v tests/test_cli/test_cli_sdk.py
43+
cd $GITHUB_WORKSPACE && pytest -m P0 -s -v tests/test_cli/test_cli_sdk.py
4644
4745
notify_to_feishu:
4846
if: ${{ always() && !cancelled() && contains(needs.*.result, 'failure') && (github.ref_name == 'master') }}

.github/workflows/daily.yml

+55
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
# Scheduled run of the unit tests (with coverage) and the CLI/SDK tests on the
# `pdf` runner. If the test job fails on `master`, a follow-up job posts a
# failure notice to the webhook stored in `secrets.WEBHOOK_URL`.
# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python

name: mineru
on:
  schedule:
    # Every day at 22:00. GitHub Actions evaluates schedule cron expressions in UTC.
    - cron: '0 22 * * *'
jobs:
  cli-test:
    runs-on: pdf
    timeout-minutes: 240
    strategy:
      fail-fast: true

    steps:
      # Checks out the repository with the two most recent commits.
      - name: PDF cli
        uses: actions/checkout@v3
        with:
          fetch-depth: 2

      - name: install&test
        run: |
          # The `mineru` conda environment is expected to already exist on the runner;
          # the environment bootstrap script below is intentionally left disabled.
          source activate mineru
          conda env list
          pip show coverage
          # cd "$GITHUB_WORKSPACE" && sh tests/retry_env.sh
          cd "$GITHUB_WORKSPACE" && python tests/clean_coverage.py
          cd "$GITHUB_WORKSPACE" && coverage run -m pytest tests/unittest/ --cov=magic_pdf/ --cov-report html --cov-report term-missing
          cd "$GITHUB_WORKSPACE" && python tests/get_coverage.py
          cd "$GITHUB_WORKSPACE" && pytest -s -v tests/test_cli/test_cli_sdk.py

  notify_to_feishu:
    # Runs even when `cli-test` failed, but only for non-cancelled runs on `master`.
    if: ${{ always() && !cancelled() && contains(needs.*.result, 'failure') && (github.ref_name == 'master') }}
    needs: cli-test
    runs-on: pdf
    steps:
      - name: get_actor
        run: |
          # Map the triggering actor to the person to mention; default is "dt-yy".
          metion_list="dt-yy"
          echo "$GITHUB_ACTOR"
          if [[ $GITHUB_ACTOR == "drunkpig" ]]; then
            metion_list="xuchao"
          elif [[ $GITHUB_ACTOR == "myhloli" ]]; then
            metion_list="zhaoxiaomeng"
          elif [[ $GITHUB_ACTOR == "icecraft" ]]; then
            metion_list="xurui1"
          fi
          echo "$metion_list"
          # Values written to GITHUB_ENV only become visible in later steps, so
          # METIONS is not read back through the `env` context in this step.
          echo "METIONS=$metion_list" >> "$GITHUB_ENV"

      - name: notify
        # Secrets are handed to the script through environment variables rather
        # than being interpolated into the script text, and are never echoed.
        env:
          USER_ID: ${{ secrets.USER_ID }}
          WEBHOOK_URL: ${{ secrets.WEBHOOK_URL }}
        run: |
          curl -X POST -H "Content-Type: application/json" -d '{"msg_type":"post","content":{"post":{"zh_cn":{"title":"'"${GITHUB_REPOSITORY}"' GitHubAction Failed","content":[[{"tag":"text","text":""},{"tag":"a","text":"Please click here for details ","href":"https://github.com/'"${GITHUB_REPOSITORY}"'/actions/runs/'"${GITHUB_RUN_ID}"'"},{"tag":"at","user_id":"'"${USER_ID}"'"}]]}}}}' "${WEBHOOK_URL}"

.github/workflows/huigui.yml

+61
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
# Runs the unit tests (with coverage) and the CLI/SDK tests on the `pdf` runner
# for pushes to `master` and `dev` (ignoring `cmds/**` and Markdown-only changes)
# and on manual dispatch. If the test job fails on `master`, a follow-up job
# posts a failure notice to the webhook stored in `secrets.WEBHOOK_URL`.
# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python

name: mineru
on:
  push:
    branches:
      - "master"
      - "dev"
    paths-ignore:
      - "cmds/**"
      - "**.md"
  workflow_dispatch:
jobs:
  cli-test:
    runs-on: pdf
    timeout-minutes: 240
    strategy:
      fail-fast: true

    steps:
      # Checks out the repository with the two most recent commits.
      - name: PDF cli
        uses: actions/checkout@v3
        with:
          fetch-depth: 2

      - name: install&test
        run: |
          # The `mineru` conda environment is expected to already exist on the runner;
          # the environment bootstrap script below is intentionally left disabled.
          source activate mineru
          conda env list
          pip show coverage
          # cd "$GITHUB_WORKSPACE" && sh tests/retry_env.sh
          cd "$GITHUB_WORKSPACE" && python tests/clean_coverage.py
          cd "$GITHUB_WORKSPACE" && coverage run -m pytest tests/unittest/ --cov=magic_pdf/ --cov-report html --cov-report term-missing
          cd "$GITHUB_WORKSPACE" && python tests/get_coverage.py
          cd "$GITHUB_WORKSPACE" && pytest -s -v tests/test_cli/test_cli_sdk.py

  notify_to_feishu:
    # Runs even when `cli-test` failed, but only for non-cancelled runs on `master`.
    if: ${{ always() && !cancelled() && contains(needs.*.result, 'failure') && (github.ref_name == 'master') }}
    needs: cli-test
    runs-on: pdf
    steps:
      - name: get_actor
        run: |
          # Map the triggering actor to the person to mention; default is "dt-yy".
          metion_list="dt-yy"
          echo "$GITHUB_ACTOR"
          if [[ $GITHUB_ACTOR == "drunkpig" ]]; then
            metion_list="xuchao"
          elif [[ $GITHUB_ACTOR == "myhloli" ]]; then
            metion_list="zhaoxiaomeng"
          elif [[ $GITHUB_ACTOR == "icecraft" ]]; then
            metion_list="xurui1"
          fi
          echo "$metion_list"
          # Values written to GITHUB_ENV only become visible in later steps, so
          # METIONS is not read back through the `env` context in this step.
          echo "METIONS=$metion_list" >> "$GITHUB_ENV"

      - name: notify
        # Secrets are handed to the script through environment variables rather
        # than being interpolated into the script text, and are never echoed.
        env:
          USER_ID: ${{ secrets.USER_ID }}
          WEBHOOK_URL: ${{ secrets.WEBHOOK_URL }}
        run: |
          curl -X POST -H "Content-Type: application/json" -d '{"msg_type":"post","content":{"post":{"zh_cn":{"title":"'"${GITHUB_REPOSITORY}"' GitHubAction Failed","content":[[{"tag":"text","text":""},{"tag":"a","text":"Please click here for details ","href":"https://github.com/'"${GITHUB_REPOSITORY}"'/actions/runs/'"${GITHUB_RUN_ID}"'"},{"tag":"at","user_id":"'"${USER_ID}"'"}]]}}}}' "${WEBHOOK_URL}"

.github/workflows/update_base.yml

-22
This file was deleted.

.gitignore

+50-39
Original file line numberDiff line numberDiff line change
@@ -1,39 +1,50 @@
1-
*.tar
2-
*.tar.gz
3-
venv*/
4-
envs/
5-
slurm_logs/
6-
7-
sync1.sh
8-
data_preprocess_pj1
9-
data-preparation1
10-
__pycache__
11-
*.log
12-
*.pyc
13-
.vscode
14-
debug/
15-
*.ipynb
16-
.idea
17-
18-
# vscode history
19-
.history
20-
21-
.DS_Store
22-
.env
23-
24-
bad_words/
25-
bak/
26-
27-
app/tests/*
28-
temp/
29-
tmp/
30-
tmp
31-
.vscode
32-
.vscode/
33-
ocr_demo
34-
35-
/app/common/__init__.py
36-
/magic_pdf/config/__init__.py
37-
source.dev.env
38-
39-
tmp
1+
*.tar
2+
*.tar.gz
3+
*.zip
4+
venv*/
5+
envs/
6+
slurm_logs/
7+
8+
sync1.sh
9+
data_preprocess_pj1
10+
data-preparation1
11+
__pycache__
12+
*.log
13+
*.pyc
14+
.vscode
15+
debug/
16+
*.ipynb
17+
.idea
18+
19+
# vscode history
20+
.history
21+
22+
.DS_Store
23+
.env
24+
25+
bad_words/
26+
bak/
27+
28+
app/tests/*
29+
temp/
30+
tmp/
31+
tmp
32+
.vscode
33+
.vscode/
34+
ocr_demo
35+
.coveragerc
36+
/app/common/__init__.py
37+
/magic_pdf/config/__init__.py
38+
source.dev.env
39+
40+
tmp
41+
42+
projects/web/node_modules
43+
projects/web/dist
44+
45+
projects/web_demo/web_demo/static/
46+
cli_debug/
47+
debug_utils/
48+
49+
# sphinx docs
50+
_build/

.pre-commit-config.yaml

+3-2
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ repos:
33
rev: 5.0.4
44
hooks:
55
- id: flake8
6-
args: ["--max-line-length=120", "--ignore=E131,E125,W503,W504,E203"]
6+
args: ["--max-line-length=150", "--ignore=E131,E125,W503,W504,E203"]
77
- repo: https://github.com/PyCQA/isort
88
rev: 5.11.5
99
hooks:
@@ -12,11 +12,12 @@ repos:
1212
rev: v0.32.0
1313
hooks:
1414
- id: yapf
15-
args: ["--style={based_on_style: google, column_limit: 120, indent_width: 4}"]
15+
args: ["--style={based_on_style: google, column_limit: 150, indent_width: 4}"]
1616
- repo: https://github.com/codespell-project/codespell
1717
rev: v2.2.1
1818
hooks:
1919
- id: codespell
20+
args: ['--skip', '*.json']
2021
- repo: https://github.com/pre-commit/pre-commit-hooks
2122
rev: v4.3.0
2223
hooks:

.readthedocs.yaml

+16
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
# Read the Docs build configuration (configuration file format version 2).
version: 2

# Sphinx project to build: the docs rooted at docs/zh_cn.
sphinx:
  configuration: docs/zh_cn/conf.py

# Build image and toolchain. The Python version is quoted so that it stays the
# string "3.10" instead of being parsed as the float 3.1.
build:
  os: ubuntu-22.04
  tools:
    python: '3.10'

# Python requirements installed before the docs are built.
python:
  install:
    - requirements: docs/zh_cn/requirements.txt

# Additional output format requested besides the default build.
formats: [epub]

Dockerfile

+1-1
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ RUN python3 -m venv /opt/mineru_venv
3131
RUN /bin/bash -c "source /opt/mineru_venv/bin/activate && \
3232
pip3 install --upgrade pip && \
3333
wget https://gitee.com/myhloli/MinerU/raw/master/requirements-docker.txt && \
34-
pip3 install -r requirements-docker.txt --extra-index-url https://wheels.myhloli.com -i https://pypi.tuna.tsinghua.edu.cn/simple && \
34+
pip3 install -r requirements-docker.txt --extra-index-url https://wheels.myhloli.com -i https://mirrors.aliyun.com/pypi/simple && \
3535
pip3 install paddlepaddle-gpu==3.0.0b1 -i https://www.paddlepaddle.org.cn/packages/stable/cu118/"
3636

3737
# Copy the configuration file template and install magic-pdf latest

LICENSE.md

+1
Original file line numberDiff line numberDiff line change
@@ -659,3 +659,4 @@ specific requirements.
659659
if any, to sign a "copyright disclaimer" for the program, if necessary.
660660
For more information on this, and how to apply and follow the GNU AGPL, see
661661
<https://www.gnu.org/licenses/>.
662+

0 commit comments

Comments
 (0)