1
+ # -*- coding: utf-8 -*-
2
import os
import posixpath
import shutil
import tempfile
from hashlib import sha1
from subprocess import PIPE, Popen
from zipfile import ZipFile

from bs4 import BeautifulSoup as BS

from .html import HTML
11
+
12
+
13
class EPUB:
    """Reader for an ePub archive with MOBI and plain-text export.

    Constructing an instance opens the archive, validates its ``mimetype``
    entry, hashes the file and parses ``OEBPS/toc.ncx`` and
    ``OEBPS/content.opf``. Both locations are hard-coded;
    ``META-INF/container.xml`` is not consulted, so archives that keep their
    package files elsewhere are rejected with ``FileNotFoundError``.

    The instance can be used as a context manager so the underlying zip
    handle is released deterministically.
    """

    # Block size used when hashing the archive.
    _HASH_CHUNK = 1 << 16

    def __init__(self, filename):
        """Open *filename* and load its navigation, manifest and metadata.

        Raises:
            TypeError: the archive has no valid ``mimetype`` entry.
            FileNotFoundError: ``OEBPS/toc.ncx`` or ``OEBPS/content.opf``
                is missing from the archive.
        """
        self.filename = filename
        self._file = ZipFile(filename, "r")
        self.sha1 = None        # hex digest of the archive, set by _sha1()
        self.nav = []           # navigation points in document order
        self.nav_point = {}     # navigation points keyed by their id
        self.items = {}         # manifest items keyed by their id
        self.metadata = {}      # package metadata, see read_opf()
        try:
            self.check_mimetype()
            self._sha1()
            self.read_ncx()
            self.read_opf()
        except Exception:
            # Do not leak the open zip handle when the archive is rejected.
            self._file.close()
            raise

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        return False

    def close(self):
        """Close the underlying zip archive."""
        self._file.close()

    def __getitem__(self, name):
        """Return the metadata value stored under *name*.

        Raises:
            KeyError: no metadata entry called *name* exists.
        """
        try:
            return self.metadata[name]
        except KeyError:
            raise KeyError(
                "'%s' has no metadata named '%s'" % (self.filename,
                                                     name)) from None

    def _sha1(self):
        """Return the SHA-1 hex digest of the archive, computing it once."""
        if self.sha1:
            return self.sha1
        digest = sha1()
        with open(self.filename, "rb") as file:
            # Hash block by block so large books are never fully in memory.
            for chunk in iter(lambda: file.read(self._HASH_CHUNK), b""):
                digest.update(chunk)
        self.sha1 = digest.hexdigest()
        return self.sha1

    def check_mimetype(self):
        """Verify that the archive declares the ePub mimetype.

        Raises:
            TypeError: the ``mimetype`` entry is missing or its content is
                not exactly ``application/epub+zip``.
        """
        try:
            mimetype = self._file.read("mimetype")
        except KeyError:
            raise TypeError(
                "'%s' is not a ePub file" % self.filename) from None
        if mimetype != b"application/epub+zip":
            raise TypeError("'%s' is not a ePub file" % self.filename)

    def read_xml(self, filename, decoder="lxml"):
        """Parse the archive member *filename* with BeautifulSoup.

        Args:
            filename: member name inside the archive.
            decoder: parser name handed to BeautifulSoup.

        Raises:
            FileNotFoundError: the archive has no such member.
        """
        try:
            xml = self._file.read(filename)
        except KeyError:
            raise FileNotFoundError(
                "'%s' has no file named '%s'" % (self.filename,
                                                 filename)) from None
        return BS(xml, decoder)

    @staticmethod
    def _play_order(nav, default):
        """Return the play order declared on *nav*, or *default*.

        The value is read from the element's attributes; both the lowercased
        and the original spelling are tried because the spelling that
        survives depends on the parser in use. A missing or non-numeric
        value yields *default*.
        """
        raw = nav.get("playorder") or nav.get("playOrder")
        try:
            return int(raw)
        except (TypeError, ValueError):
            return default

    def read_ncx(self):
        """Load the navigation points of ``OEBPS/toc.ncx``.

        Every ``navpoint`` element is appended to ``self.nav`` and indexed by
        id in ``self.nav_point``.
        """
        ncx = self.read_xml("OEBPS/toc.ncx")
        for nav in ncx.find_all("navpoint"):
            point = {
                "id": nav["id"],
                "title": nav.navlabel.text.strip(),
                # Zip member names always use "/", whatever the host OS.
                "content": posixpath.join("OEBPS", nav.content["src"]),
                "play_order": self._play_order(nav, len(self.nav) + 1),
            }
            self.nav_point[point["id"]] = point
            self.nav.append(point)

    def read_opf(self):
        """Load metadata and manifest items from ``OEBPS/content.opf``.

        ``<meta name=... content=...>`` elements are stored under their
        ``name``; elements whose tag starts with ``dc:`` are stored without
        that prefix; any other element is stored under its tag name.
        """
        opf = self.read_xml("OEBPS/content.opf")
        metadata = opf.metadata
        for data in (metadata.contents if metadata is not None else ()):
            tag = data.name
            if tag is None:
                # Plain text between elements has no tag name.
                continue
            if tag == "meta":
                key = data.get("name")
                if key is None:
                    # <meta> without a name attribute (for instance the
                    # property= form) has no key to be stored under.
                    continue
                self.metadata[key] = data.get("content", "")
            elif tag.startswith("dc:"):
                self.metadata[tag[3:]] = data.text
            else:
                self.metadata[tag] = data.text
        for item in opf.find_all("item"):
            self.items[item["id"]] = {
                "id": item["id"],
                "href": posixpath.join("OEBPS", item["href"]),
                "media-type": item["media-type"],
            }

    def get_file(self, name):
        """Return the raw bytes of the manifest item whose id is *name*.

        Raises:
            FileNotFoundError: the manifest has no item with that id.
        """
        if name not in self.items:
            raise FileNotFoundError(
                "'%s' has no file named '%s'" % (self.filename, name))
        return self._file.read(self.items[name]["href"])

    def tmp(self):
        """Return a scratch directory named after the archive's SHA-1.

        The directory lives under the system temporary directory and is
        created on demand.
        """
        path = os.path.join(tempfile.gettempdir(), self._sha1())
        # exist_ok closes the gap between an existence check and the mkdir.
        os.makedirs(path, exist_ok=True)
        return path

    def fix_opf(self):
        """Return ``OEBPS/content.opf`` as text with duplicate items removed.

        Only the first ``item`` element carrying a given id is kept.
        """
        opf = self.read_xml("OEBPS/content.opf")
        seen = set()
        for item in opf.find_all("item"):
            if item["id"] in seen:
                item.extract()
            else:
                seen.add(item["id"])
        return str(opf)

    def convert_to_mobi(self, kindlegen=None):
        """Convert the book with kindlegen and return the MOBI bytes.

        Args:
            kindlegen: path to the kindlegen executable; defaults to
                ``kindlegen`` in the current working directory.

        Raises:
            FileNotFoundError: the executable does not exist, or kindlegen
                did not produce ``OEBPS/content.mobi``.
        """
        if not kindlegen:
            kindlegen = os.path.join(os.getcwd(), "kindlegen")
        if not os.path.exists(kindlegen):
            raise FileNotFoundError("Kindlegen not found")
        # The child runs inside the scratch directory, so a relative path
        # must be anchored to the caller's directory first.
        kindlegen = os.path.abspath(kindlegen)
        workdir = self.tmp()
        try:
            self._file.extractall(workdir)
            opf_path = os.path.join(workdir, "OEBPS", "content.opf")
            with open(opf_path, "wb") as file:
                file.write(self.fix_opf().encode("utf-8"))
            # An argument list avoids the shell, so paths with spaces or
            # shell metacharacters are passed through untouched.
            process = Popen(
                [kindlegen, "-dont_append_source", "OEBPS/content.opf"],
                cwd=workdir,
                stdout=PIPE)
            # communicate() drains the pipe; wait() alone can block forever
            # once the child has filled the pipe buffer.
            process.communicate()
            mobi_path = os.path.join(workdir, "OEBPS", "content.mobi")
            with open(mobi_path, "rb") as file:
                return file.read()
        finally:
            # Best-effort cleanup that also runs when the conversion fails.
            shutil.rmtree(workdir, ignore_errors=True)

    def convert_to_txt(self):
        """Return the book as one plain-text string.

        The text starts with the ``title`` metadata, followed by one section
        per navigation point whose document yields non-empty text, and ends
        with a closing marker. Text is obtained through the project's
        ``HTML(...).purify()`` helper.

        Raises:
            KeyError: the book has no ``title`` metadata, or a navigation
                point refers to a member missing from the archive.
        """
        chunks = [self["title"]]
        emitted = set()
        for nav in self.nav:
            # A "#fragment" addresses a position inside a document; it is
            # not part of the archive member name.
            member, _, fragment = nav["content"].partition("#")
            if fragment and member in emitted:
                # Anchor into a document whose text is already included.
                continue
            text = HTML(self._file.read(member)).purify().strip()
            emitted.add(member)
            if not text:
                continue
            chunks.append("\n \n \n >>> %s <<<\n \n \n " % nav["title"])
            chunks.append(text)
            chunks.append("\n \n \n >>> 本章结束 <<<\n \n \n ")
        chunks.append(">>>>> The End <<<<<")
        return "".join(chunks)
137
+
0 commit comments