
Commit a8703b9

Replace usage of os.path and path.py with pathlib
1 parent a30026e commit a8703b9


11 files changed (+266 -247 lines)


Diff for: CHANGELOG.md

+2-1
@@ -15,6 +15,7 @@ as of 2.0.0.
 ### Changed
 
 - Insert as few rsync URLs as possible in DB when a book selection is made (#220)
+- Replace usage of os.path and path.py with pathlib.Path (#195)
 
 ### Fixed
 
@@ -102,7 +103,7 @@ as of 2.0.0.
 ## [1.1.6]
 
 - removed duplicate dependencies
-- Added tag _category:gutenberg which was missing
+- Added tag \_category:gutenberg which was missing
 - docker-only release with updated zimwriterfs (2.1.0-1)
 
 ## [1.1.5]

Diff for: src/gutenberg2zim/constants.py

+1-1
@@ -21,4 +21,4 @@
 logger = getLogger(NAME, level=logging.INFO)
 
 TMP_FOLDER = "tmp"
-TMP_FOLDER_PATH = pathlib.Path(TMP_FOLDER)
+TMP_FOLDER_PATH = pathlib.Path(TMP_FOLDER).resolve()
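For context on this change: `pathlib.Path.resolve()` makes the temporary-folder path absolute, anchored at the current working directory, so the value no longer depends on where it is later used. A minimal illustrative sketch, independent of the scraper's code:

import pathlib

TMP_FOLDER = "tmp"

relative_path = pathlib.Path(TMP_FOLDER)            # before: relative "tmp"
absolute_path = pathlib.Path(TMP_FOLDER).resolve()  # after: e.g. /home/user/project/tmp

print(relative_path.is_absolute())  # False
print(absolute_path.is_absolute())  # True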

Diff for: src/gutenberg2zim/database.py

+1-1
@@ -211,7 +211,7 @@ def load_fixtures(model):
         logger.debug(f"[fixtures] Created {f}")
 
 
-def setup_database(*, wipe=False):
+def setup_database(*, wipe: bool = False) -> None:
     logger.info("Setting up the database")
 
     for model in (License, Author, Book, BookFormat, Url):
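The bare `*` in the new signature keeps `wipe` keyword-only, and the annotations document the intent. A hedged sketch of the calling convention (the body below is illustrative, not the project's actual implementation):

def setup_database(*, wipe: bool = False) -> None:
    # Illustrative only: a real implementation would (re)create the tables here.
    if wipe:
        print("dropping existing tables")
    print("creating tables")


setup_database()           # OK: uses the default wipe=False
setup_database(wipe=True)  # OK: keyword argument
# setup_database(True)     # TypeError: wipe is keyword-only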

Diff for: src/gutenberg2zim/download.py

+48-46
@@ -1,17 +1,15 @@
-import os
-import pathlib
 import shutil
 import tempfile
 import zipfile
 from multiprocessing.dummy import Pool
+from pathlib import Path
 from pprint import pprint as pp
 
 import apsw
 import backoff
 from kiwixstorage import KiwixStorage
-from path import Path
 
-from gutenberg2zim.constants import TMP_FOLDER, logger
+from gutenberg2zim.constants import TMP_FOLDER_PATH, logger
 from gutenberg2zim.database import Book, BookFormat
 from gutenberg2zim.export import fname_for, get_list_of_filtered_books
 from gutenberg2zim.s3 import download_from_cache
@@ -36,24 +34,24 @@
 #     return False
 
 
-def handle_zipped_epub(zippath, book, dst_dir: pathlib.Path):
+def handle_zipped_epub(zippath: Path, book: Book, dst_dir: Path) -> bool:
     def clfn(fn):
-        return os.path.join(*os.path.split(fn)[1:])
+        return Path(fn).name
 
     def is_safe(fname):
-        fname = ensure_unicode(clfn(fname))
-        if Path(fname).basename() == fname:
+        name = ensure_unicode(clfn(fname))
+        if Path(fname).name == name:
             return True
-        return fname == os.path.join("images", Path(fname).splitpath()[-1])
+        return fname == f"images/{Path(fname).name}"
 
     zipped_files = []
     # create temp directory to extract to
-    tmpd = tempfile.mkdtemp(dir=TMP_FOLDER)
+    tmpd = tempfile.mkdtemp(dir=TMP_FOLDER_PATH)
     try:
         with zipfile.ZipFile(zippath, "r") as zf:
             # check that there is no insecure data (absolute names)
             if sum([1 for n in zf.namelist() if not is_safe(ensure_unicode(n))]):
-                Path(tmpd).rmtree_p()
+                shutil.rmtree(tmpd, ignore_errors=True)
                 return False
             # zipped_files = [clfn(fn) for fn in zf.namelist()]
             zipped_files = zf.namelist()
@@ -64,7 +62,7 @@ def is_safe(fname):
         # file is not a zip file when it should be.
         # don't process it anymore as we don't know what to do.
         # could this be due to an incorrect/incomplete download?
-        return
+        return False
 
     # is there multiple HTML files in ZIP ? (rare)
     mhtml = (
@@ -73,25 +71,26 @@ def is_safe(fname):
     # move all extracted files to proper locations
     for zipped_file in zipped_files:
         # skip folders
-        if not Path(zipped_file).ext:
+        if not Path(zipped_file).resolve().is_file():
             continue
 
-        src = os.path.join(tmpd, zipped_file)
-        if os.path.exists(src):
-            fname = Path(zipped_file).basename()
+        src = (Path(tmpd) / zipped_file).resolve()
+        if src.exists():
+            fname = Path(zipped_file).name
 
             if fname.endswith(".html") or fname.endswith(".htm"):
                 if mhtml:
                     if fname.startswith(f"{book.id}-h."):
-                        dst = dst_dir.joinpath(f"{book.id}.html")
+                        dst = dst_dir / f"{book.id}.html"
                     else:
-                        dst = dst_dir.joinpath(f"{book.id}_{fname}")
+                        dst = dst_dir / f"{book.id}_{fname}"
                 else:
-                    dst = dst_dir.joinpath(f"{book.id}.html")
+                    dst = dst_dir / f"{book.id}.html"
             else:
-                dst = dst_dir.joinpath(f"{book.id}_{fname}")
+                dst = dst_dir / f"{book.id}_{fname}"
+            dst = dst.resolve()
             try:
-                Path(src).move(str(dst))
+                src.rename(dst)
             except Exception as e:
                 import traceback
 
@@ -100,14 +99,14 @@ def is_safe(fname):
                 raise
 
     # delete temp directory and zipfile
-    if Path(zippath).exists():
-        os.unlink(zippath)
-    Path(tmpd).rmtree_p()
+    zippath.unlink(missing_ok=True)
+    shutil.rmtree(tmpd, ignore_errors=True)
+    return True
 
 
 def download_book(
     book: Book,
-    download_cache: str,
+    download_cache: Path,
     formats: list[str],
     *,
     force: bool,
@@ -124,13 +123,15 @@ def download_book(
     if "html" not in formats:
         formats.append("html")
 
-    book_dir = pathlib.Path(download_cache).joinpath(str(book.id))
-    optimized_dir = book_dir.joinpath("optimized")
-    unoptimized_dir = book_dir.joinpath("unoptimized")
+    book_dir = download_cache / str(book.id)
+    optimized_dir = book_dir / "optimized"
+    unoptimized_dir = book_dir / "unoptimized"
+
     unsuccessful_formats = []
     for book_format in formats:
-        unoptimized_fpath = unoptimized_dir.joinpath(fname_for(book, book_format))
-        optimized_fpath = optimized_dir.joinpath(archive_name_for(book, book_format))
+        unoptimized_fpath = unoptimized_dir / fname_for(book, book_format)
+        unoptimized_fpath = unoptimized_dir / fname_for(book, book_format)
+        optimized_fpath = optimized_dir / archive_name_for(book, book_format)
 
         # check if already downloaded
         if (unoptimized_fpath.exists() or optimized_fpath.exists()) and not force:
@@ -141,12 +142,10 @@ def download_book(
             if book_format == "html":
                 for fpath in book_dir.iterdir():
                     if fpath.is_file() and fpath.suffix not in [".pdf", ".epub"]:
-                        fpath.unlink()
+                        fpath.unlink(missing_ok=True)
             else:
-                if unoptimized_fpath.exists():
-                    unoptimized_fpath.unlink()
-                if optimized_fpath.exists():
-                    optimized_fpath.unlink()
+                unoptimized_fpath.unlink(missing_ok=True)
+                optimized_fpath.unlink(missing_ok=True)
             # delete dirs which are empty
             for dir_name in [optimized_dir, unoptimized_dir]:
                 if not dir_name.exists():
@@ -233,7 +232,7 @@ def download_book(
 
         # HTML files are *sometime* available as ZIP files
         if url.endswith(".zip"):
-            zpath = unoptimized_dir.joinpath(f"{fname_for(book, book_format)}.zip")
+            zpath = unoptimized_dir / f"{fname_for(book, book_format)}.zip"
 
             etag = get_etag_from_url(url)
             if s3_storage:
@@ -254,7 +253,11 @@ def download_book(
                 book.html_etag = etag  # type: ignore
                 book.save()
             # extract zipfile
-            handle_zipped_epub(zippath=zpath, book=book, dst_dir=unoptimized_dir)
+            handle_zipped_epub(
+                zippath=zpath,
+                book=book,
+                dst_dir=unoptimized_dir,
+            )
         else:
             if (
                 url.endswith(".htm")
@@ -329,10 +332,9 @@ def download_cover(book, book_dir, s3_storage, optimizer_version):
         etag = get_etag_from_url(url)
         downloaded_from_cache = False
         cover = f"{book.id}_cover_image.jpg"
-        if (
-            book_dir.joinpath("optimized").joinpath(cover).exists()
-            or book_dir.joinpath("unoptimized").joinpath(cover).exists()
-        ):
+        if (book_dir / "optimized" / cover).exists() or (
+            book_dir / "unoptimized" / cover
+        ).exists():
             logger.debug(f"Cover already exists for book #{book.id}")
             return
         if s3_storage:
@@ -343,25 +345,25 @@ def download_cover(book, book_dir, s3_storage, optimizer_version):
                 book=book,
                 etag=etag,
                 book_format="cover",
-                dest_dir=book_dir.joinpath("optimized"),
+                dest_dir=book_dir / "optimized",
                 s3_storage=s3_storage,
                 optimizer_version=optimizer_version,
             )
         if not downloaded_from_cache:
            logger.debug(f"Downloading {url}")
-            if download_file(url, book_dir.joinpath("unoptimized").joinpath(cover)):
+            if download_file(url, book_dir / "unoptimized" / cover):
                 book.cover_etag = etag
                 book.save()
     else:
         logger.debug(f"No Book Cover found for Book #{book.id}")
 
 
 def download_all_books(
-    download_cache: str,
+    download_cache: Path,
     concurrency: int,
     languages: list[str],
     formats: list[str],
-    only_books: list[str],
+    only_books: list[int],
     *,
     force: bool,
     s3_storage: KiwixStorage | None,
@@ -372,7 +374,7 @@ def download_all_books(
     )
 
     # ensure dir exist
-    Path(download_cache).mkdir_p()
+    download_cache.mkdir(parents=True, exist_ok=True)
 
     def backoff_busy_error_hdlr(details):
         logger.warning(
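As a quick reference for the substitutions made throughout this file, here is a hedged sketch of the path.py / os.path idioms and the pathlib (plus shutil/tempfile) equivalents used above; the file names are made up for illustration:

import shutil
import tempfile
from pathlib import Path

tmpd = Path(tempfile.mkdtemp())                  # tempfile still returns a str
src = tmpd / "book" / "12345-h.htm"              # "/" joins replace os.path.join()/joinpath()
src.parent.mkdir(parents=True, exist_ok=True)    # replaces path.py's mkdir_p()
src.write_text("<html></html>")

print(src.name)                                  # replaces .basename()
print(src.suffix)                                # replaces .ext

dst = tmpd / "12345.html"
src.rename(dst)                                  # replaces .move(str(dst))
dst.unlink(missing_ok=True)                      # replaces the exists() check + os.unlink()
shutil.rmtree(tmpd, ignore_errors=True)          # replaces .rmtree_p()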

Diff for: src/gutenberg2zim/entrypoint.py

+17-9
@@ -1,9 +1,8 @@
 import logging
-import os
 import sys
+from pathlib import Path
 
 from docopt import docopt
-from path import Path
 
 from gutenberg2zim.checkdeps import check_dependencies
 from gutenberg2zim.constants import TMP_FOLDER_PATH, VERSION, logger
@@ -94,7 +93,12 @@ def main():
         arguments.get("--rdf-url")
         or "http://www.gutenberg.org/cache/epub/feeds/rdf-files.tar.bz2"
     )
-    dl_cache = arguments.get("--dl-folder") or os.path.join("dl-cache")
+
+    if dl_folder := arguments.get("--dl-folder"):
+        dl_cache = Path(dl_folder).resolve()
+    else:
+        dl_cache = Path("dl-cache").resolve()
+
     books_csv = arguments.get("--books") or ""
     zim_title = arguments.get("--zim-title")
     zim_desc = arguments.get("--zim-desc")
@@ -141,7 +145,7 @@ def main():
         }
     )
 
-    books = []
+    books: list[int] = []
     try:
         books_csv = books_csv.split(",")
 
@@ -151,7 +155,7 @@ def f(x):
         for i in books_csv:
             blst = f(i)
             if len(blst) > 1:
-                blst = range(blst[0], blst[1] + 1)
+                blst = list(range(blst[0], blst[1] + 1))
             books.extend(blst)
         books_csv = list(set(books))
     except Exception as e:
@@ -218,21 +222,25 @@ def f(x):
     for zim_lang in zims:
         if do_zim:
             logger.info("BUILDING ZIM dynamically")
+            if one_lang_one_zim_folder:
+                output_folder = Path(one_lang_one_zim_folder).resolve()
+            else:
+                output_folder = Path(".").resolve()
             build_zimfile(
-                output_folder=Path(one_lang_one_zim_folder or ".").abspath(),
+                output_folder=output_folder,
                 download_cache=dl_cache,
                 concurrency=concurrency,
                 languages=zim_lang,
                 formats=formats,
                 only_books=books,
-                force=force,
-                title_search=title_search,
-                add_bookshelves=bookshelves,
                 s3_storage=s3_storage,
                 optimizer_version=optimizer_version,
                 zim_name=Path(zim_name).name if zim_name else None,
                 title=zim_title,
                 description=zim_desc,
                 stats_filename=stats_filename,
                 publisher=publisher,
+                force=force,
+                title_search=title_search,
+                add_bookshelves=bookshelves,
             )
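A small standalone sketch of the new --dl-folder handling shown above, with a plain dict standing in for the real docopt() result:

from pathlib import Path

arguments = {"--dl-folder": None}  # stand-in for docopt(); set a string to override

if dl_folder := arguments.get("--dl-folder"):
    dl_cache = Path(dl_folder).resolve()
else:
    dl_cache = Path("dl-cache").resolve()

print(dl_cache)  # absolute path to ./dl-cache when --dl-folder is not given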
