1
- import os
2
- import pathlib
3
1
import shutil
4
2
import tempfile
5
3
import zipfile
6
4
from multiprocessing .dummy import Pool
5
+ from pathlib import Path
7
6
from pprint import pprint as pp
8
7
9
8
import apsw
10
9
import backoff
11
10
from kiwixstorage import KiwixStorage
12
- from path import Path
13
11
14
- from gutenberg2zim .constants import TMP_FOLDER , logger
12
+ from gutenberg2zim .constants import TMP_FOLDER_PATH , logger
15
13
from gutenberg2zim .database import Book , BookFormat
16
14
from gutenberg2zim .export import fname_for , get_list_of_filtered_books
17
15
from gutenberg2zim .s3 import download_from_cache
36
34
# return False
37
35
38
36
39
def handle_zipped_epub(zippath: Path, book: Book, dst_dir: Path) -> bool:
    """Extract a zipped (HTML) book archive into *dst_dir*.

    Members are validated against unsafe paths, extracted to a temporary
    directory under TMP_FOLDER_PATH, then moved into *dst_dir* with
    book-id based names (HTML files become ``{book.id}.html`` or
    ``{book.id}_{fname}`` when the archive holds several HTML files).

    Returns False when the archive contains unsafe member paths or is
    not a valid ZIP file; True once all files are in place and both the
    ZIP and the temporary directory have been removed.
    """

    def clfn(fn):
        # strip any directory component, keep only the file name
        return Path(fn).name

    def is_safe(fname):
        # a member is safe only when it is a bare file name or sits
        # directly under an images/ folder — rejects absolute paths and
        # any other directory traversal before extractall() runs.
        name = ensure_unicode(clfn(fname))
        # FIX: compare the raw member path to its own basename; the
        # previous `Path(fname).name == name` was vacuously true.
        if fname == name:
            return True
        return fname == f"images/{name}"

    zipped_files = []
    # create temp directory to extract to
    tmpd = tempfile.mkdtemp(dir=TMP_FOLDER_PATH)
    try:
        with zipfile.ZipFile(zippath, "r") as zf:
            # check that there is no insecure data (absolute names)
            if sum([1 for n in zf.namelist() if not is_safe(ensure_unicode(n))]):
                shutil.rmtree(tmpd, ignore_errors=True)
                return False
            # zipped_files = [clfn(fn) for fn in zf.namelist()]
            zipped_files = zf.namelist()
            # NOTE(review): extraction step reconstructed from context —
            # confirm against repository history.
            zf.extractall(tmpd)
    except zipfile.BadZipfile:
        # file is not a zip file when it should be.
        # don't process it anymore as we don't know what to do.
        # could this be due to an incorrect/incomplete download?
        return False

    # is there multiple HTML files in ZIP ? (rare)
    mhtml = (
        sum([1 for f in zipped_files if f.endswith((".html", ".htm"))]) > 1
    )
    # move all extracted files to proper locations
    for zipped_file in zipped_files:
        # FIX: test the *extracted* path under tmpd; resolving the bare
        # member name checked against the process CWD, which is wrong.
        src = (Path(tmpd) / zipped_file).resolve()
        # skip folders (and members that did not extract to a file)
        if not src.is_file():
            continue

        fname = Path(zipped_file).name

        if fname.endswith(".html") or fname.endswith(".htm"):
            if mhtml:
                # the "-h." file is the canonical HTML rendition
                if fname.startswith(f"{book.id}-h."):
                    dst = dst_dir / f"{book.id}.html"
                else:
                    dst = dst_dir / f"{book.id}_{fname}"
            else:
                dst = dst_dir / f"{book.id}.html"
        else:
            dst = dst_dir / f"{book.id}_{fname}"
        dst = dst.resolve()
        try:
            src.rename(dst)
        except Exception as e:
            import traceback

            # NOTE(review): error reporting reconstructed from context;
            # surface the failure loudly, then re-raise.
            print(e)
            print("".join(traceback.format_exc()))
            raise

    # delete temp directory and zipfile
    zippath.unlink(missing_ok=True)
    shutil.rmtree(tmpd, ignore_errors=True)
    return True
106
105
107
106
108
107
def download_book (
109
108
book : Book ,
110
- download_cache : str ,
109
+ download_cache : Path ,
111
110
formats : list [str ],
112
111
* ,
113
112
force : bool ,
@@ -124,13 +123,15 @@ def download_book(
124
123
if "html" not in formats :
125
124
formats .append ("html" )
126
125
127
- book_dir = pathlib .Path (download_cache ).joinpath (str (book .id ))
128
- optimized_dir = book_dir .joinpath ("optimized" )
129
- unoptimized_dir = book_dir .joinpath ("unoptimized" )
126
+ book_dir = download_cache / str (book .id )
127
+ optimized_dir = book_dir / "optimized"
128
+ unoptimized_dir = book_dir / "unoptimized"
129
+
130
130
unsuccessful_formats = []
131
131
for book_format in formats :
132
- unoptimized_fpath = unoptimized_dir .joinpath (fname_for (book , book_format ))
133
- optimized_fpath = optimized_dir .joinpath (archive_name_for (book , book_format ))
132
+ unoptimized_fpath = unoptimized_dir / fname_for (book , book_format )
133
+ unoptimized_fpath = unoptimized_dir / fname_for (book , book_format )
134
+ optimized_fpath = optimized_dir / archive_name_for (book , book_format )
134
135
135
136
# check if already downloaded
136
137
if (unoptimized_fpath .exists () or optimized_fpath .exists ()) and not force :
@@ -141,12 +142,10 @@ def download_book(
141
142
if book_format == "html" :
142
143
for fpath in book_dir .iterdir ():
143
144
if fpath .is_file () and fpath .suffix not in [".pdf" , ".epub" ]:
144
- fpath .unlink ()
145
+ fpath .unlink (missing_ok = True )
145
146
else :
146
- if unoptimized_fpath .exists ():
147
- unoptimized_fpath .unlink ()
148
- if optimized_fpath .exists ():
149
- optimized_fpath .unlink ()
147
+ unoptimized_fpath .unlink (missing_ok = True )
148
+ optimized_fpath .unlink (missing_ok = True )
150
149
# delete dirs which are empty
151
150
for dir_name in [optimized_dir , unoptimized_dir ]:
152
151
if not dir_name .exists ():
@@ -233,7 +232,7 @@ def download_book(
233
232
234
233
# HTML files are *sometime* available as ZIP files
235
234
if url .endswith (".zip" ):
236
- zpath = unoptimized_dir . joinpath ( f"{ fname_for (book , book_format )} .zip" )
235
+ zpath = unoptimized_dir / f"{ fname_for (book , book_format )} .zip"
237
236
238
237
etag = get_etag_from_url (url )
239
238
if s3_storage :
@@ -254,7 +253,11 @@ def download_book(
254
253
book .html_etag = etag # type: ignore
255
254
book .save ()
256
255
# extract zipfile
257
- handle_zipped_epub (zippath = zpath , book = book , dst_dir = unoptimized_dir )
256
+ handle_zipped_epub (
257
+ zippath = zpath ,
258
+ book = book ,
259
+ dst_dir = unoptimized_dir ,
260
+ )
258
261
else :
259
262
if (
260
263
url .endswith (".htm" )
@@ -329,10 +332,9 @@ def download_cover(book, book_dir, s3_storage, optimizer_version):
329
332
etag = get_etag_from_url (url )
330
333
downloaded_from_cache = False
331
334
cover = f"{ book .id } _cover_image.jpg"
332
- if (
333
- book_dir .joinpath ("optimized" ).joinpath (cover ).exists ()
334
- or book_dir .joinpath ("unoptimized" ).joinpath (cover ).exists ()
335
- ):
335
+ if (book_dir / "optimized" / cover ).exists () or (
336
+ book_dir / "unoptimized" / cover
337
+ ).exists ():
336
338
logger .debug (f"Cover already exists for book #{ book .id } " )
337
339
return
338
340
if s3_storage :
@@ -343,25 +345,25 @@ def download_cover(book, book_dir, s3_storage, optimizer_version):
343
345
book = book ,
344
346
etag = etag ,
345
347
book_format = "cover" ,
346
- dest_dir = book_dir . joinpath ( "optimized" ) ,
348
+ dest_dir = book_dir / "optimized" ,
347
349
s3_storage = s3_storage ,
348
350
optimizer_version = optimizer_version ,
349
351
)
350
352
if not downloaded_from_cache :
351
353
logger .debug (f"Downloading { url } " )
352
- if download_file (url , book_dir . joinpath ( "unoptimized" ). joinpath ( cover ) ):
354
+ if download_file (url , book_dir / "unoptimized" / cover ):
353
355
book .cover_etag = etag
354
356
book .save ()
355
357
else :
356
358
logger .debug (f"No Book Cover found for Book #{ book .id } " )
357
359
358
360
359
361
def download_all_books (
360
- download_cache : str ,
362
+ download_cache : Path ,
361
363
concurrency : int ,
362
364
languages : list [str ],
363
365
formats : list [str ],
364
- only_books : list [str ],
366
+ only_books : list [int ],
365
367
* ,
366
368
force : bool ,
367
369
s3_storage : KiwixStorage | None ,
@@ -372,7 +374,7 @@ def download_all_books(
372
374
)
373
375
374
376
# ensure dir exist
375
- Path ( download_cache ). mkdir_p ( )
377
+ download_cache . mkdir ( parents = True , exist_ok = True )
376
378
377
379
def backoff_busy_error_hdlr (details ):
378
380
logger .warning (
0 commit comments