Skip to content

Commit adc735b

Browse files
author
TerraceCN
committed
Init
0 parents  commit adc735b

9 files changed

+222
-0
lines changed

.gitignore

+4
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
/__pycache__/
2+
*.mobi
3+
*.txt
4+
*.epub

kindlegen

27.3 MB
Binary file not shown.

main.py

+40
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
# -*- coding: utf-8 -*-
2+
import os
3+
import sys
4+
from time import time
5+
6+
from pyepub import EPUB
7+
8+
def main(argv=None):
    """Convert the ePub named on the command line to ``.mobi`` and ``.txt``.

    The output files are written next to the input file, sharing its base
    name.  Besides converting, the function prints every metadata entry of
    the book and the time each conversion took.

    Args:
        argv: Argument list without the program name.  Defaults to
            ``sys.argv[1:]``.  Only the first entry (the ePub path) is used.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if not args:
        # Fail with a usage hint instead of a bare IndexError.
        sys.exit("usage: main.py <file.epub>")
    filename = args[0]
    base = os.path.splitext(filename)[0]

    # Load the ePub file.
    epub = EPUB(filename)

    # Attributes exposed by an EPUB object:
    #   epub.sha1       SHA-1 of the ePub file (str)
    #   epub.nav        table of contents (list)
    #   epub.nav_point  chapters (dict)
    #   epub.items      files inside the ePub (dict)
    #   epub.metadata   ePub metadata (dict)
    # A resource stored in the ePub can be fetched with
    # ``epub.get_file(item_id)``, e.g. ``epub.get_file(epub["cover"])`` when
    # the book declares a cover.  The lookup is not performed here because it
    # would abort the conversion for books without cover metadata.

    # Walk the metadata (a single entry is also reachable as epub[name]).
    for name, data in epub.metadata.items():
        print("%s:" % name.capitalize(), data)

    # Convert to mobi.  The conversion runs before the output file is opened
    # so that a failed conversion does not leave an empty file behind.
    mobi_path = base + ".mobi"
    t = time()
    mobi = epub.convert_to_mobi()
    with open(mobi_path, "wb") as file:
        file.write(mobi)
    t1 = time() - t
    print("'mobi' file has been saved to '%s'[%.2fs]" % (mobi_path, t1))

    # Convert to txt.
    txt_path = base + ".txt"
    t = time()
    txt = epub.convert_to_txt().encode("utf-8")
    with open(txt_path, "wb") as file:
        file.write(txt)
    t2 = time() - t
    print("'txt' file has been saved to '%s'[%.2fs]" % (txt_path, t2))


if __name__ == "__main__":
    main()
40+

pyepub/__init__.py

+2
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
# -*- coding: utf-8 -*-
2+
from .epub import EPUB
153 Bytes
Binary file not shown.
4.39 KB
Binary file not shown.
1.15 KB
Binary file not shown.

pyepub/epub.py

+137
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,137 @@
1+
# -*- coding: utf-8 -*-
2+
import os
3+
import shutil
4+
from zipfile import ZipFile
5+
from subprocess import Popen, PIPE
6+
from hashlib import sha1
7+
8+
from bs4 import BeautifulSoup as BS
9+
10+
from .html import HTML
11+
12+
13+
class EPUB:
    """Reader for an ePub archive with mobi and plain-text conversion.

    The archive is expected to keep its table of contents in
    ``OEBPS/toc.ncx`` and its package document in ``OEBPS/content.opf``.

    Attributes:
        filename: Path of the ePub file given to the constructor.
        sha1: Hex SHA-1 digest of the ePub file.
        nav: List of navigation points in document order.
        nav_point: Navigation points keyed by their ``id``.
        items: Manifest items keyed by their ``id``.
        metadata: Metadata entries; also reachable through ``epub[name]``.
    """

    # Directory inside the archive that holds the book content.
    _ROOT = "OEBPS"

    def __init__(self, filename):
        """Open *filename* and load its mimetype, digest, NCX and OPF.

        Raises:
            TypeError: If the archive is not an ePub file.
            FileNotFoundError: If the NCX or OPF document is missing.
        """
        self.filename = filename
        self._file = ZipFile(filename, "r")
        self.sha1 = None
        self.nav = []
        self.nav_point = {}
        self.items = {}
        self.metadata = {}
        try:
            self.check_mimetype()
            self._sha1()
            self.read_ncx()
            self.read_opf()
        except Exception:
            # Do not leak the open archive when the book turns out invalid.
            self._file.close()
            raise

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        return False

    def __getitem__(self, name):
        """Return the metadata entry called *name*.

        Raises:
            KeyError: If the book has no such metadata entry.
        """
        try:
            return self.metadata[name]
        except KeyError:
            # Raised explicitly: an ``assert`` would surface as
            # AssertionError and disappear entirely under ``python -O``.
            raise KeyError("'%s' has no metadata named '%s'" %
                           (self.filename, name)) from None

    def close(self):
        """Close the underlying zip archive."""
        self._file.close()

    def _member(self, href):
        """Return the archive member name for *href* relative to OEBPS.

        Zip member names always use forward slashes, so the name is built
        with "/" rather than with the platform path separator.
        """
        return "%s/%s" % (self._ROOT, href)

    def _sha1(self):
        """Return the hex SHA-1 of the ePub file, computing it only once."""
        if self.sha1:
            return self.sha1
        digest = sha1()
        with open(self.filename, "rb") as file:
            # Hash in chunks so large books are not held in memory at once.
            for chunk in iter(lambda: file.read(65536), b""):
                digest.update(chunk)
        self.sha1 = digest.hexdigest()
        return self.sha1

    def check_mimetype(self):
        """Verify that the archive declares the ePub mimetype.

        Raises:
            TypeError: If the ``mimetype`` member is missing or does not
                contain ``application/epub+zip``.
        """
        message = "'%s' is not a ePub file" % self.filename
        try:
            mimetype = self._file.read("mimetype")
        except KeyError:
            raise TypeError(message) from None
        # Surrounding whitespace (e.g. a trailing newline) is tolerated.
        if mimetype.strip() != b"application/epub+zip":
            raise TypeError(message)

    def read_xml(self, filename, decoder="lxml"):
        """Parse the archive member *filename* with BeautifulSoup.

        Args:
            filename: Member name inside the archive.
            decoder: BeautifulSoup parser name.

        Raises:
            FileNotFoundError: If the archive has no such member.
        """
        try:
            xml = self._file.read(filename)
        except KeyError:
            raise FileNotFoundError(
                "'%s' has no file named '%s'" %
                (self.filename, filename)) from None
        return BS(xml, decoder)

    def read_ncx(self):
        """Fill ``nav`` and ``nav_point`` from ``OEBPS/toc.ncx``."""
        ncx = self.read_xml("OEBPS/toc.ncx")
        for nav in ncx.find_all("navpoint"):
            # ``playOrder`` is an attribute of <navPoint>.  The HTML-mode
            # parser lower-cases attribute names, so "playorder" is tried
            # first and the camel-case spelling is kept as a fallback.
            raw_order = nav.get("playorder") or nav.get("playOrder")
            try:
                play_order = int(raw_order)
            except (TypeError, ValueError):
                # Missing or malformed: fall back to the running position.
                play_order = len(self.nav) + 1
            point = {
                "id": nav["id"],
                "title": nav.navlabel.text.strip(),
                "content": self._member(nav.content["src"]),
                "play_order": play_order}
            self.nav_point[nav["id"]] = point
            self.nav.append(point)

    def read_opf(self):
        """Fill ``metadata`` and ``items`` from ``OEBPS/content.opf``."""
        opf = self.read_xml("OEBPS/content.opf")
        for data in opf.metadata.contents:
            if data.name is None:
                # Plain text between the metadata elements.
                continue
            if data.name == "meta":
                key = data.get("name")
                if key is None:
                    # <meta> without a ``name`` attribute cannot be keyed.
                    continue
                self.metadata[key] = data.get("content", "")
            elif data.name.startswith("dc:"):
                self.metadata[data.name[3:]] = data.text
            else:
                self.metadata[data.name] = data.text
        for item in opf.find_all("item"):
            self.items[item["id"]] = {
                "id": item["id"],
                "href": self._member(item["href"]),
                "media-type": item["media-type"]}

    def get_file(self, name):
        """Return the raw bytes of the manifest item whose id is *name*.

        Raises:
            FileNotFoundError: If the manifest has no item with that id.
        """
        try:
            item = self.items[name]
        except KeyError:
            raise FileNotFoundError(
                "'%s' has no file named '%s'" %
                (self.filename, name)) from None
        return self._file.read(item["href"])

    def tmp(self):
        """Return a scratch directory named after the book's SHA-1.

        The directory lives under the system temporary directory and is
        created when it does not exist yet.
        """
        import tempfile

        _path = os.path.join(tempfile.gettempdir(), self._sha1())
        os.makedirs(_path, exist_ok=True)
        return _path

    def fix_opf(self):
        """Return the OPF document as text with duplicate item ids removed.

        Only the first ``<item>`` carrying a given id is kept.
        """
        opf = self.read_xml("OEBPS/content.opf")
        seen = set()
        for item in opf.find_all("item"):
            if item["id"] in seen:
                item.extract()
            else:
                seen.add(item["id"])
        return str(opf)

    def convert_to_mobi(self, kindlegen=None):
        """Convert the book with kindlegen and return the mobi bytes.

        Args:
            kindlegen: Path of the kindlegen executable.  Defaults to a file
                called ``kindlegen`` in the current working directory.

        Raises:
            FileNotFoundError: If kindlegen does not exist, or if it
                produced no ``content.mobi`` output.
        """
        if not kindlegen:
            kindlegen = os.path.join(os.getcwd(), "kindlegen")
        if not os.path.exists(kindlegen):
            raise FileNotFoundError("Kindlegen not found")
        # The child process runs inside the scratch directory, so a relative
        # executable path has to be resolved before changing directory.
        kindlegen = os.path.abspath(kindlegen)
        _tmp = self.tmp()
        try:
            self._file.extractall(_tmp)
            opf_path = os.path.join(_tmp, "OEBPS", "content.opf")
            with open(opf_path, "wb") as file:
                file.write(self.fix_opf().encode("utf-8"))
            # An argument list (no shell) keeps paths with spaces or shell
            # metacharacters intact.
            ps = Popen([kindlegen, "-dont_append_source", "OEBPS/content.opf"],
                       cwd=_tmp,
                       stdout=PIPE)
            # communicate() drains the pipe; wait() alone can block forever
            # once the child fills the pipe buffer.
            ps.communicate()
            mobi_path = os.path.join(_tmp, "OEBPS", "content.mobi")
            with open(mobi_path, "rb") as file:
                return file.read()
        finally:
            # Remove the scratch directory even when the conversion failed.
            shutil.rmtree(_tmp, ignore_errors=True)

    def convert_to_txt(self):
        """Return the whole book as one plain-text string.

        The text starts with the book title; every non-empty chapter is
        wrapped in a heading line and an end-of-chapter marker.

        Raises:
            KeyError: If the book has no ``title`` metadata.
        """
        parts = [self["title"]]
        for nav in self.nav:
            # A navigation target may carry a "#fragment"; the archive
            # member is the part in front of it.
            member = nav["content"].split("#", 1)[0]
            text = HTML(self._file.read(member)).purify().strip()
            if not text:
                continue
            parts.append("\n\n\n>>> %s <<<\n\n\n" % nav["title"])
            parts.append(text)
            parts.append("\n\n\n>>> 本章结束 <<<\n\n\n")
        parts.append(">>>>> The End <<<<<")
        return "".join(parts)
137+

pyepub/html.py

+39
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
# -*- coding: utf-8 -*-
2+
from bs4 import BeautifulSoup as BS
3+
4+
5+
class HTML:
    """Reduce an (X)HTML document to plain text.

    Attributes:
        bs: The parsed document.
        txt: Unused scratch string kept for compatibility.
        inline: Tag names whose text is emitted as-is.
        block: Tag names whose text is followed by a line break.
        ignore: Tag names that produce no text at all.
    """

    # Class names of string-like nodes that hold markup rather than prose.
    # Matched by name so this class needs no extra parser imports.
    _NON_TEXT_NODES = ("Comment", "Declaration", "Doctype",
                       "ProcessingInstruction")

    def __init__(self, html):
        """Parse *html* (markup as str or bytes) with the lxml parser."""
        self.bs = BS(html, "lxml")
        self.txt = ""
        self.inline = ["a", "b", "em", "i", "span", "strong"]
        self.block = ["div", "p", "h1", "h2", "h3", "h4", "h5", "h6"]
        self.ignore = ["img"]

    def _text(self, node):
        """Return the text contributed by a bare string node."""
        if type(node).__name__ in self._NON_TEXT_NODES:
            return ""
        text = str(node)
        if "\n" in text and not text.strip():
            # Whitespace-only runs with a line break are source indentation
            # between elements, not content.
            return ""
        return text

    def plain(self, element):
        """Return the plain text of *element* and its known descendants.

        Children whose tag is not listed in ``inline``, ``block`` or
        ``ignore`` (and is not ``br``) contribute nothing.
        """
        name = element.name
        # Shortcut for an element with exactly one child: take its text
        # wholesale.
        if len(element.contents) == 1:
            if name == "br":
                return "\n"
            if name in self.ignore:
                return ""
            if name in self.inline:
                return element.text
            if name in self.block:
                return element.text + "\n"
        pieces = []
        for child in element.children:
            tag = child.name
            if tag is None:
                # Bare text between child elements is part of the content.
                pieces.append(self._text(child))
            elif tag == "br":
                # A line break adds a newline; it must not end the walk and
                # discard the text gathered so far.
                pieces.append("\n")
            elif tag in self.ignore:
                continue
            elif tag in self.inline:
                pieces.append(self.plain(child))
            elif tag in self.block:
                pieces.append(self.plain(child) + "\n")
        return "".join(pieces)

    def purify(self):
        """Return the plain text of the document body ("" without a body)."""
        body = self.bs.body
        if body is None:
            return ""
        return self.plain(body)
39+

0 commit comments

Comments
 (0)