Skip to content

Commit e5856f0

Browse files
staarPeterStaar-IBMdolfim-ibmgithub-actions[bot]rmdg88
authored
feat: add an experimental v2 parser to improve performance (#29)
--------- Signed-off-by: Peter Staar <[email protected]> Signed-off-by: Michele Dolfi <[email protected]> Signed-off-by: rmdg88 <[email protected]> Signed-off-by: Christoph Auer <[email protected]> Co-authored-by: Peter Staar <[email protected]> Co-authored-by: Peter W. J. Staar <[email protected]> Co-authored-by: Michele Dolfi <[email protected]> Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com> Co-authored-by: Rui Dias Gomes <[email protected]> Co-authored-by: Christoph Auer <[email protected]> Co-authored-by: rmdg88 <[email protected]> Co-authored-by: Michele Dolfi <[email protected]>
1 parent 179b784 commit e5856f0

File tree

705 files changed

+3582536
-706
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

705 files changed

+3582536
-706
lines changed

.github/scripts/build_rhel.sh

+28-10
Original file line numberDiff line numberDiff line change
@@ -3,31 +3,49 @@
33
set -e # trigger failure on error - do not remove!
44
set -x # display command on output
55

6-
# Build the sdist
6+
# Build the Python package with Poetry
77
poetry build -f sdist
88

9-
# Compile the wheel from sdist in centos stream
9+
USE_SYSTEM_DEPS="ON"
10+
11+
docker build --progress=plain \
12+
--build-arg USE_SYSTEM_DEPS="$USE_SYSTEM_DEPS" \
13+
-f - . <<EOF
14+
# syntax=docker/dockerfile:1
1015
11-
docker build -f - . <<EOF
1216
FROM quay.io/centos/centos:stream9
# Declare the build argument supplied via --build-arg; without this declaration
# the value is not visible to the RUN steps of this stage (pip install below).
ARG USE_SYSTEM_DEPS
17+
1318
RUN dnf config-manager --set-enabled crb
14-
# RUN dnf copr -y enable cheimes/deepsearch-glm rhel-9-x86_64
19+
20+
RUN dnf copr -y enable cheimes/deepsearch-glm rhel-9-x86_64
21+
1522
RUN dnf install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm \
1623
&& dnf clean all
24+
1725
RUN dnf install -y --nodocs \
18-
gcc gcc-c++ git make cmake pkgconfig glibc-devel \
26+
autoconf automake binutils cmake gcc gcc-c++ git glibc-devel glibc-headers glibc-static kernel-devel libtool libstdc++-devel make ninja-build pkgconfig zlib-devel \
1927
python3.11 python3.11-pip python3.11-devel \
20-
libjpeg-turbo-devel libpng-devel qpdf-devel json-devel utf8cpp-devel zlib-devel \
21-
loguru-devel \
28+
libjpeg-turbo-devel libpng-devel qpdf-devel json-devel utf8cpp-devel \
2229
&& dnf clean all
23-
30+
31+
# # RUN dnf install -y --nodocs loguru-devel
32+
33+
# TEMPORARY loguru install method
34+
# https://koji.fedoraproject.org/koji/buildinfo?buildID=2563067
35+
RUN curl -O https://kojipkgs.fedoraproject.org//packages/loguru/2.2.0%5E20230406git4adaa18/5.el9/x86_64/loguru-2.2.0%5E20230406git4adaa18-5.el9.x86_64.rpm
36+
RUN dnf install -y loguru-2.2.0%5E20230406git4adaa18-5.el9.x86_64.rpm
37+
RUN curl -O https://kojipkgs.fedoraproject.org//packages/loguru/2.2.0%5E20230406git4adaa18/5.el9/x86_64/loguru-devel-2.2.0%5E20230406git4adaa18-5.el9.x86_64.rpm
38+
RUN dnf install -y loguru-devel-2.2.0%5E20230406git4adaa18-5.el9.x86_64.rpm
39+
2440
RUN mkdir /src
41+
2542
COPY ./dist/*.tar.gz /src/
2643
27-
RUN USE_SYSTEM_DEPS=ON pip3.11 install /src/docling_parse*.tar.gz \
28-
&& python3.11 -c 'from docling_parse.docling_parse import pdf_parser'
44+
RUN USE_SYSTEM_DEPS=\$USE_SYSTEM_DEPS pip3.11 install /src/docling_parse*.tar.gz \
45+
&& python3.11 -c 'from docling_parse.docling_parse import pdf_parser, pdf_parser_v2'
2946
3047
COPY ./tests /src/tests
48+
3149
RUN cd /src \
3250
&& pip3.11 install pytest \
3351
&& pytest

CMakeLists.txt

+51-27
Original file line numberDiff line numberDiff line change
@@ -100,16 +100,16 @@ if(NOT USE_SYSTEM_DEPS)
100100
endif()
101101
endif()
102102

103-
104103
# include dependencies
104+
include(cmake/extlib_cxxopts.cmake)
105+
include(cmake/extlib_loguru.cmake)
105106
include(cmake/extlib_json.cmake)
106107
include(cmake/extlib_utf8.git.cmake)
107108
include(cmake/extlib_jpeg.cmake)
108109
include(cmake/extlib_qpdf_v11.cmake)
109110

110111
# aggregate the targets created by the dependencies
111-
set(DEPENDENCIES qpdf jpeg utf8 json)
112-
112+
set(DEPENDENCIES qpdf jpeg utf8 json loguru cxxopts)
113113

114114
# ************************
115115
# *** libraries ***
@@ -136,21 +136,53 @@ include_directories(${SUBDIRS})
136136
# *** Executables ***
137137
# **********************
138138

139-
add_executable(parse.exe "${TOPLEVEL_PREFIX_PATH}/app/parse.cpp")
140-
set_property(TARGET parse.exe PROPERTY CXX_STANDARD 20)
141-
add_dependencies(parse.exe ${DEPENDENCIES})
142-
target_include_directories(parse.exe INTERFACE ${DEPENDENCIES})
143-
target_link_libraries(parse.exe ${DEPENDENCIES} ${LIB_LINK})
139+
string(REPLACE ";" ";" SUBDIRS_SEMICOLON "${SUBDIRS}")
140+
message(STATUS "subdirs: ${SUBDIRS_SEMICOLON}")
141+
142+
string(REPLACE ";" ";" DEPENDENCIES_SEMICOLON "${DEPENDENCIES}")
143+
message(STATUS "cmake dependencies: ${DEPENDENCIES_SEMICOLON}")
144+
145+
string(REPLACE ";" ";" LIBLINK_SEMICOLON "${LIB_LINK}")
146+
message(STATUS "cmake lib-link: ${LIBLINK_SEMICOLON}")
147+
148+
add_executable(parse_v1.exe "${TOPLEVEL_PREFIX_PATH}/app/parse_v1.cpp")
149+
add_executable(parse_v2.exe "${TOPLEVEL_PREFIX_PATH}/app/parse_v2.cpp")
150+
add_executable(parse_v2_fonts.exe "${TOPLEVEL_PREFIX_PATH}/app/parse_v2_fonts.cpp")
151+
152+
set_property(TARGET parse_v1.exe PROPERTY CXX_STANDARD 20)
153+
set_property(TARGET parse_v2.exe PROPERTY CXX_STANDARD 20)
154+
set_property(TARGET parse_v2_fonts.exe PROPERTY CXX_STANDARD 20)
155+
156+
add_dependencies(parse_v1.exe ${DEPENDENCIES})
157+
add_dependencies(parse_v2.exe ${DEPENDENCIES})
158+
add_dependencies(parse_v2_fonts.exe ${DEPENDENCIES})
159+
160+
target_include_directories(parse_v1.exe INTERFACE ${DEPENDENCIES})
161+
target_include_directories(parse_v2.exe INTERFACE ${DEPENDENCIES})
162+
target_include_directories(parse_v2_fonts.exe INTERFACE ${DEPENDENCIES})
163+
164+
target_link_libraries(parse_v1.exe ${DEPENDENCIES} ${LIB_LINK})
165+
target_link_libraries(parse_v2.exe ${DEPENDENCIES} ${LIB_LINK})
166+
target_link_libraries(parse_v2_fonts.exe ${DEPENDENCIES} ${LIB_LINK})
144167

145168
# **********************
146169
# *** Libraries ***
147170
# **********************
148171

149-
add_library(libparse STATIC "${TOPLEVEL_PREFIX_PATH}/app/parse.cpp")
150-
add_dependencies(libparse ${DEPENDENCIES})
151-
target_include_directories(libparse INTERFACE ${DEPENDENCIES})
152-
set_target_properties(libparse PROPERTIES POSITION_INDEPENDENT_CODE ON)
153-
target_link_libraries(libparse ${DEPENDENCIES} ${LIB_LINK})
172+
add_library(parse_v1 STATIC "${TOPLEVEL_PREFIX_PATH}/app/parse_v1.cpp")
173+
# Static library for the v2 parser; built from its own source file (parse_v2.cpp).
add_library(parse_v2 STATIC "${TOPLEVEL_PREFIX_PATH}/app/parse_v2.cpp")
174+
175+
add_dependencies(parse_v1 ${DEPENDENCIES})
176+
add_dependencies(parse_v2 ${DEPENDENCIES})
177+
178+
target_include_directories(parse_v1 INTERFACE ${DEPENDENCIES})
179+
target_include_directories(parse_v2 INTERFACE ${DEPENDENCIES})
180+
181+
set_target_properties(parse_v1 PROPERTIES POSITION_INDEPENDENT_CODE ON)
182+
set_target_properties(parse_v2 PROPERTIES POSITION_INDEPENDENT_CODE ON)
183+
184+
target_link_libraries(parse_v1 ${DEPENDENCIES} ${LIB_LINK})
185+
target_link_libraries(parse_v2 ${DEPENDENCIES} ${LIB_LINK})
154186

155187
# ***************************
156188
# *** Python-binding ***
@@ -161,25 +193,17 @@ target_link_libraries(libparse ${DEPENDENCIES} ${LIB_LINK})
161193
find_package(pybind11 CONFIG REQUIRED)
162194

163195
pybind11_add_module(docling_parse "${TOPLEVEL_PREFIX_PATH}/app/pybind_parse.cpp")
164-
add_dependencies(docling_parse libparse)
196+
197+
add_dependencies(docling_parse parse_v1 parse_v2)
198+
165199
target_include_directories(docling_parse INTERFACE ${DEPENDENCIES})
200+
166201
target_compile_definitions(docling_parse PRIVATE VERSION_INFO=${CMAKE_PROJECT_VERSION})
167-
target_link_libraries(docling_parse PRIVATE libparse)
202+
203+
target_link_libraries(docling_parse PRIVATE parse_v1 parse_v2)
168204

169205
# *****************
170206
# *** Install ***
171207
# *****************
172208

173209
install(TARGETS docling_parse DESTINATION "${TOPLEVEL_PREFIX_PATH}/docling_parse")
174-
175-
# *****************
176-
# *** Testing ***
177-
# *****************
178-
179-
#function(do_test target arg result)
180-
# add_test(NAME Comp${arg} COMMAND ${target} ${arg})
181-
# set_tests_properties(Comp${arg} PROPERTIES PASS_REGULAR_EXPRESSION ${result})
182-
# endfunction()
183-
184-
# do a bunch of result based tests
185-
# do_test(Tutorial 4 "4 is 2")

README.md

+70-49
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,32 @@
77
[![Platforms](https://img.shields.io/badge/platform-macos%20|%20linux%20|%20windows-blue)](https://github.com/DS4SD/docling-parse/)
88
[![License MIT](https://img.shields.io/github/license/DS4SD/docling-parse)](https://opensource.org/licenses/MIT)
99

10-
Simple package to extract text with coordinates from programmatic PDFs.
11-
This package is part of the [Docling](https://github.com/DS4SD/docling) conversion.
12-
10+
Simple package to extract text, paths and bitmap images with coordinates from programmatic PDFs.
11+
This package is used in the [Docling](https://github.com/DS4SD/docling) PDF conversion.
12+
13+
<table>
14+
<tr>
15+
<th>Version</th>
16+
<th>Original</th>
17+
<th>Word-level</th>
18+
<th>Snippet-level</th>
19+
<th>Performance</th>
20+
</tr>
21+
<tr>
22+
<th>V1</th>
23+
<td rowspan="2"><img src="./docs/example_visualisations/2305.14962v1.pdf_page=0.png" alt="screenshot" width="100"/></td>
24+
<td>Not Supported</td>
25+
<td><img src="./docs/example_visualisations/2305.14962v1.pdf_page=0.v1.png" alt="v1 snippet" width="100"/></td>
26+
<td>~0.250 sec/page</td>
27+
</tr>
28+
<tr>
29+
<th>V2</th>
30+
<!-- The "Original" column image spans from the previous row -->
31+
<td><img src="./docs/example_visualisations/2305.14962v1.pdf_page=0.v2.original.png" alt="v2 word" width="100"/></td>
32+
<td><img src="./docs/example_visualisations/2305.14962v1.pdf_page=0.v2.sanitized.png" alt="v2 snippet" width="100"/></td>
33+
<td>~0.050 sec/page <br><br>[~5-10X faster than v1]</td>
34+
</tr>
35+
</table>
1336

1437
## Quick start
1538

@@ -19,13 +42,13 @@ Install the package from Pypi
1942
pip install docling-parse
2043
```
2144

22-
Convert a PDF
45+
Convert a PDF (see [visualise.py](docling_parse/visualise.py) for a more detailed example)
2346

2447
```python
25-
from docling_parse.docling_parse import pdf_parser
48+
from docling_parse.docling_parse import pdf_parser_v2
2649

2750
# Do this only once to load fonts (avoid initialising it many times)
28-
parser = pdf_parser()
51+
parser = pdf_parser_v2()
2952

3053
# parser.set_loglevel(1) # 1=error, 2=warning, 3=success, 4=info
3154

@@ -64,39 +87,7 @@ for page in range(0, num_pages):
6487
# parsed page is the first one!
6588
json_page = json_doc["pages"][0]
6689

67-
page_dimensions = [json_page["dimensions"]["width"], json_page["dimensions"]["height"]]
68-
69-
# find text cells
70-
cells=[]
71-
for cell_id,cell in enumerate(json_page["cells"]):
72-
cells.append([page,
73-
cell_id,
74-
cell["content"]["rnormalized"], # text
75-
cell["box"]["device"][0], # x0 (lower left x)
76-
cell["box"]["device"][1], # y0 (lower left y)
77-
cell["box"]["device"][2], # x1 (upper right x)
78-
cell["box"]["device"][3], # y1 (upper right y)
79-
])
80-
81-
# find bitmap images
82-
images=[]
83-
for image_id,image in enumerate(json_page["images"]):
84-
images.append([page,
85-
image_id,
86-
image["box"][0], # x0 (lower left x)
87-
image["box"][1], # y0 (lower left y)
88-
image["box"][2], # x1 (upper right x)
89-
image["box"][3], # y1 (upper right y)
90-
])
91-
92-
# find paths
93-
paths=[]
94-
for path_id,path in enumerate(json_page["paths"]):
95-
paths.append([page,
96-
path_id,
97-
path["x-values"], # array of x values
98-
path["y-values"], # array of y values
99-
])
90+
# <Insert your own code>
10091

10192
# Unload the (QPDF) document and buffers
10293
parser.unload_document(doc_key)
@@ -128,10 +119,38 @@ To build the parse, simply run the following command in the root folder,
128119
rm -rf build; cmake -B ./build; cd build; make
129120
```
130121

131-
You can run the parser from your build folder with
122+
You can run the parser from your build folder. Example from parse_v1,
123+
124+
```sh
125+
% ./parse_v1.exe -h
126+
A program to process PDF files or configuration files
127+
Usage:
128+
PDFProcessor [OPTION...]
129+
130+
-i, --input arg Input PDF file
131+
-c, --config arg Config file
132+
--create-config arg Create config file
133+
-o, --output arg Output file
134+
-l, --loglevel arg loglevel [error;warning;success;info]
135+
-h, --help Print usage
136+
```
137+
138+
Example from parse_v2,
132139

133140
```sh
134-
./parse.exe <input-file> <optional-logging:true>
141+
% ./parse_v2.exe -h
142+
program to process PDF files or configuration files
143+
Usage:
144+
PDFProcessor [OPTION...]
145+
146+
-i, --input arg Input PDF file
147+
-c, --config arg Config file
148+
--create-config arg Create config file
149+
-p, --page arg Pages to process (default: -1 for all) (default:
150+
-1)
151+
-o, --output arg Output file
152+
-l, --loglevel arg loglevel [error;warning;success;info]
153+
-h, --help Print usage
135154
```
136155

137156
If you don't have an input file, then a template input file will be printed on the terminal.
@@ -148,7 +167,7 @@ poetry build
148167
To test the package, run,
149168

150169
```
151-
poetry run pytest ./tests/test_parse.py
170+
poetry run pytest ./tests -v -s
152171
```
153172

154173

@@ -162,13 +181,15 @@ Please read [Contributing to Docling Parse](https://github.com/DS4SD/docling-par
162181
If you use Docling in your projects, please consider citing the following:
163182

164183
```bib
165-
@software{Docling,
166-
author = {Deep Search Team},
167-
month = {7},
168-
title = {{Docling}},
169-
url = {https://github.com/DS4SD/docling},
170-
version = {main},
171-
year = {2024}
184+
@techreport{Docling,
185+
author = {Deep Search Team},
186+
month = {8},
187+
title = {Docling Technical Report},
188+
url = {https://arxiv.org/abs/2408.09869},
189+
eprint = {2408.09869},
190+
doi = {10.48550/arXiv.2408.09869},
191+
version = {1.0.0},
192+
year = {2024}
172193
}
173194
```
174195

0 commit comments

Comments
 (0)