-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathhelpers.py
53 lines (38 loc) · 1.62 KB
/
helpers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
from typing import Dict
from ebooklib import epub
from bs4 import BeautifulSoup, Tag
_XHTML_MEDIA_TYPE = 'application/xhtml+xml'


def _paragraphize_node(node):
    """Return the non-empty, whitespace-normalized text of each direct Tag child of *node*."""
    paragraphs = []
    for child in node.children:
        if not isinstance(child, Tag):
            # Bare strings / comments between elements are not paragraphs.
            continue
        text = ' '.join(child.get_text().split())
        if text:
            paragraphs.append(text)
    return paragraphs


def _parse_item(item):
    """Parse one EPUB item's UTF-8 content into a BeautifulSoup tree using html5lib."""
    return BeautifulSoup(item.content.decode("utf-8"), 'html5lib')


def parse_epub_content(epub_path: str) -> Dict:
    """Read an EPUB file and split it into named sections plus an appendix.

    The first item of the book is treated as the main document: every Tag
    child of its ``<div class="body">`` becomes one section, named after the
    first ``<h2>`` found inside it (empty string when there is none). All
    remaining XHTML items are flattened into a single trailing section named
    ``'Appendix'``.

    Args:
        epub_path: Path of the EPUB file, passed to ``epub.read_epub``.

    Returns:
        A dict with three keys:
            ``article_name``: the book's ``title`` attribute.
            ``sections``: list of ``{'name': str, 'paragraphs': list[str]}``
                dicts; the last entry is always the ``'Appendix'`` section.
            ``content``: concatenated text of every XHTML item in the book.

    If the book has no items, or the first item has no ``<div class="body">``,
    no main sections are produced and only the ``'Appendix'`` section is
    returned.
    """
    book = epub.read_epub(epub_path)
    items = book.items

    # Parse each XHTML document exactly once and keep its position, so the
    # same tree serves the full-text dump, the main sections and the appendix.
    parsed_docs = [
        (index, _parse_item(item))
        for index, item in enumerate(items)
        if item.media_type == _XHTML_MEDIA_TYPE
    ]

    content = ''.join(soup.get_text() for _, soup in parsed_docs)

    sections = []
    if items:
        if parsed_docs and parsed_docs[0][0] == 0:
            main_content = parsed_docs[0][1]
        else:
            # NOTE(review): the first item is used as the main document even
            # when it is not XHTML -- confirm this is intended by the caller.
            main_content = _parse_item(items[0])

        body = main_content.find('div', {'class': 'body'})
        if body is not None:
            for child in body.children:
                if not isinstance(child, Tag):
                    continue
                heading = child.find('h2')
                sections.append(dict(
                    name=heading.get_text() if heading is not None else '',
                    paragraphs=_paragraphize_node(child),
                ))

    appendix_paragraphs = []
    for index, soup in parsed_docs:
        if index == 0:
            # The first item is the main document, not appendix material.
            continue
        appendix_body = soup.find('body')
        if appendix_body is None:
            continue
        # '[Back]' entries are dropped -- presumably navigation back-links.
        appendix_paragraphs.extend(
            text for text in _paragraphize_node(appendix_body) if text != '[Back]'
        )
    sections.append(dict(name='Appendix', paragraphs=appendix_paragraphs))

    return dict(
        article_name=book.title,
        sections=sections,
        content=content,
    )