-
Notifications
You must be signed in to change notification settings - Fork 30
Expand file tree
/
Copy pathextract_text.py
More file actions
70 lines (58 loc) · 2.2 KB
/
extract_text.py
File metadata and controls
70 lines (58 loc) · 2.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
# SPDX-License-Identifier: Apache-2.0
"""Simple demonstration of :class:`hwpx.tools.text_extractor.TextExtractor`."""
from __future__ import annotations
import argparse
import sys
from pathlib import Path
# ROOT is the directory one level above the folder that contains this script.
# Its ``src`` sub-directory is put at the front of the import path (once) so
# the ``hwpx`` import below can be resolved from there.
ROOT = Path(__file__).resolve().parents[1]
_SRC_DIR = str(ROOT / "src")
if _SRC_DIR not in sys.path:
    sys.path.insert(0, _SRC_DIR)
from hwpx.tools.text_extractor import TextExtractor
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(description="Print the paragraphs contained in an HWPX file.")
parser.add_argument("document", type=Path, help="Path to the .hwpx document to inspect")
parser.add_argument(
"--include-nested",
action="store_true",
help="Include paragraphs stored inside tables, shapes and other objects.",
)
parser.add_argument(
"--limit",
type=int,
default=10,
help="Maximum number of non-empty paragraphs to display (0 for no limit).",
)
parser.add_argument(
"--object-behavior",
choices=("skip", "placeholder", "nested"),
default="skip",
help="How to treat embedded objects encountered while reading a paragraph.",
)
parser.add_argument(
"--placeholder",
default="[object]",
help="Placeholder text to use when --object-behavior=placeholder.",
)
return parser.parse_args()
def main() -> None:
    """Print the non-empty paragraphs of the requested HWPX document.

    Options are read through :func:`parse_args`. The document is opened with
    :class:`TextExtractor`, and every paragraph whose stripped text is
    non-empty is written to standard output as
    ``[<section index>:<paragraph index>] <text>``. Output stops once
    ``--limit`` lines have been printed; a limit of 0 (or any non-positive
    value) disables the cap.

    Raises:
        SystemExit: If the document path does not exist.
    """
    args = parse_args()
    document = args.document
    if not document.exists():
        raise SystemExit(f"Document not found: {document}")

    # The placeholder string is only forwarded in "placeholder" mode; the
    # other modes receive ``None``.
    placeholder = args.placeholder if args.object_behavior == "placeholder" else None

    # Only a positive limit caps the output. A bare truthiness test would let
    # a negative value satisfy ``printed >= limit`` right after the first
    # line and silently truncate the output to a single paragraph.
    limit = args.limit if args.limit > 0 else None

    with TextExtractor(document) as extractor:
        printed = 0
        for paragraph in extractor.iter_document_paragraphs(include_nested=args.include_nested):
            text = paragraph.text(
                object_behavior=args.object_behavior,
                object_placeholder=placeholder,
            ).strip()
            if not text:
                # Empty paragraphs are skipped and do not count toward the limit.
                continue
            printed += 1
            location = f"{paragraph.section.index}:{paragraph.index}"
            print(f"[{location}] {text}")
            if limit is not None and printed >= limit:
                break
# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()