Merge pull request #225 from elfkuzco/fix/use-pathlib
Replace usage of os.path and path.py with pathlib
benoit74 committed Apr 30, 2024
2 parents a30026e + 3a8250c commit f344086
Showing 11 changed files with 265 additions and 246 deletions.
3 changes: 2 additions & 1 deletion CHANGELOG.md
@@ -15,6 +15,7 @@ as of 2.0.0.
 ### Changed
 
 - Insert as few rsync URLs as possible in DB when a book selection is made (#220)
+- Replace usage of os.path and path.py with pathlib.Path (#195)
 
 ### Fixed
 
@@ -102,7 +103,7 @@ as of 2.0.0.
 ## [1.1.6]
 
 - removed duplicate dependencies
-- Added tag _category:gutenberg which was missing
+- Added tag \_category:gutenberg which was missing
 - docker-only release with updated zimwriterfs (2.1.0-1)
 
 ## [1.1.5]
2 changes: 1 addition & 1 deletion src/gutenberg2zim/constants.py
@@ -21,4 +21,4 @@
 logger = getLogger(NAME, level=logging.INFO)
 
 TMP_FOLDER = "tmp"
-TMP_FOLDER_PATH = pathlib.Path(TMP_FOLDER)
+TMP_FOLDER_PATH = pathlib.Path(TMP_FOLDER).resolve()
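The `.resolve()` call makes the constant absolute (and expands symlinks) at import time, so its meaning no longer depends on whatever the working directory happens to be when it is used. A minimal sketch of the difference, reusing the same `tmp` name:

import pathlib

relative = pathlib.Path("tmp")            # meaning depends on cwd at use time
resolved = pathlib.Path("tmp").resolve()  # pinned to an absolute path now

print(relative)  # tmp
print(resolved)  # e.g. /home/user/gutenberg/tmp, depending on where this runs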
2 changes: 1 addition & 1 deletion src/gutenberg2zim/database.py
@@ -211,7 +211,7 @@ def load_fixtures(model):
         logger.debug(f"[fixtures] Created {f}")
 
 
-def setup_database(*, wipe=False):
+def setup_database(*, wipe: bool = False) -> None:
     logger.info("Setting up the database")
 
     for model in (License, Author, Book, BookFormat, Url):
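Functionally this hunk only adds type hints; the bare `*` already made `wipe` keyword-only. A self-contained sketch (with a stand-in body, not the real one) of what the signature enforces:

def setup_database(*, wipe: bool = False) -> None:
    # Illustration only: the real function creates/wipes the DB tables.
    print(f"wipe={wipe}")

setup_database(wipe=True)  # OK: the flag must be named at the call site
# setup_database(True)     # TypeError: takes 0 positional arguments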
94 changes: 48 additions & 46 deletions src/gutenberg2zim/download.py
@@ -1,17 +1,15 @@
-import os
-import pathlib
 import shutil
 import tempfile
 import zipfile
 from multiprocessing.dummy import Pool
+from pathlib import Path
 from pprint import pprint as pp
 
 import apsw
 import backoff
 from kiwixstorage import KiwixStorage
-from path import Path
 
-from gutenberg2zim.constants import TMP_FOLDER, logger
+from gutenberg2zim.constants import TMP_FOLDER_PATH, logger
 from gutenberg2zim.database import Book, BookFormat
 from gutenberg2zim.export import fname_for, get_list_of_filtered_books
 from gutenberg2zim.s3 import download_from_cache
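`path.py` and `pathlib` both export a `Path` class, but with different APIs, so most of this file is mechanical substitution. A sketch of the correspondences used below, with the `path.py` spellings taken from the removed lines of this diff:

from pathlib import Path
import shutil

p = Path("images/1234-cover.jpg")

assert p.name == "1234-cover.jpg"                  # path.py: p.basename()
assert p.suffix == ".jpg"                          # path.py: p.ext
assert Path("a") / "b" == Path("a").joinpath("b")  # both spellings exist in pathlib

# path.py's rmtree_p()/mkdir_p() have no pathlib method equivalent; the diff
# swaps in shutil.rmtree(..., ignore_errors=True) and
# Path.mkdir(parents=True, exist_ok=True) instead.
shutil.rmtree("no-such-dir", ignore_errors=True)  # no-op when missing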
@@ -36,24 +34,24 @@
 # return False
 
 
-def handle_zipped_epub(zippath, book, dst_dir: pathlib.Path):
+def handle_zipped_epub(zippath: Path, book: Book, dst_dir: Path) -> bool:
     def clfn(fn):
-        return os.path.join(*os.path.split(fn)[1:])
+        return Path(fn).name
 
     def is_safe(fname):
-        fname = ensure_unicode(clfn(fname))
-        if Path(fname).basename() == fname:
+        name = ensure_unicode(clfn(fname))
+        if Path(fname).name == name:
             return True
-        return fname == os.path.join("images", Path(fname).splitpath()[-1])
+        return fname == f"images/{Path(fname).name}"
 
     zipped_files = []
     # create temp directory to extract to
-    tmpd = tempfile.mkdtemp(dir=TMP_FOLDER)
+    tmpd = tempfile.mkdtemp(dir=TMP_FOLDER_PATH)
     try:
         with zipfile.ZipFile(zippath, "r") as zf:
             # check that there is no insecure data (absolute names)
             if sum([1 for n in zf.namelist() if not is_safe(ensure_unicode(n))]):
-                Path(tmpd).rmtree_p()
+                shutil.rmtree(tmpd, ignore_errors=True)
                 return False
             # zipped_files = [clfn(fn) for fn in zf.namelist()]
             zipped_files = zf.namelist()
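The safety check admits only ZIP members that are bare filenames or sit directly under `images/`; absolute paths, `..` traversal, and deeper trees all fail and abort extraction. The predicate in isolation, assuming plain `str` member names:

from pathlib import Path

def is_safe(member: str) -> bool:
    if Path(member).name == member:  # a bare filename
        return True
    return member == f"images/{Path(member).name}"  # exactly one level deep

assert is_safe("1234.html")
assert is_safe("images/cover.jpg")
assert not is_safe("/etc/passwd")
assert not is_safe("../../etc/passwd")
assert not is_safe("images/deep/cover.jpg")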
@@ -64,7 +62,7 @@ def is_safe(fname):
         # file is not a zip file when it should be.
         # don't process it anymore as we don't know what to do.
         # could this be due to an incorrect/incomplete download?
-        return
+        return False
 
     # is there multiple HTML files in ZIP ? (rare)
     mhtml = (
@@ -73,25 +71,26 @@ def is_safe(fname):
     # move all extracted files to proper locations
     for zipped_file in zipped_files:
         # skip folders
-        if not Path(zipped_file).ext:
+        if not Path(zipped_file).is_file():
             continue
 
-        src = os.path.join(tmpd, zipped_file)
-        if os.path.exists(src):
-            fname = Path(zipped_file).basename()
+        src = Path(tmpd) / zipped_file
+        if src.exists():
+            fname = Path(zipped_file).name
 
             if fname.endswith(".html") or fname.endswith(".htm"):
                 if mhtml:
                     if fname.startswith(f"{book.id}-h."):
-                        dst = dst_dir.joinpath(f"{book.id}.html")
+                        dst = dst_dir / f"{book.id}.html"
                     else:
-                        dst = dst_dir.joinpath(f"{book.id}_{fname}")
+                        dst = dst_dir / f"{book.id}_{fname}"
                 else:
-                    dst = dst_dir.joinpath(f"{book.id}.html")
+                    dst = dst_dir / f"{book.id}.html"
             else:
-                dst = dst_dir.joinpath(f"{book.id}_{fname}")
+                dst = dst_dir / f"{book.id}_{fname}"
+            dst = dst.resolve()
             try:
-                Path(src).move(str(dst))
+                src.rename(dst)
             except Exception as e:
                 import traceback
 
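One semantic difference hides in `src.rename(dst)`: `path.py`'s `move()` reportedly wraps `shutil.move()`, while `Path.rename()` maps to `os.rename()`, which raises `OSError` (`EXDEV`) when source and destination sit on different filesystems. That is fine while the temp dir and the destination share a mount; a cross-device-safe variant would look like this sketch (the `move_path` helper is illustrative, not what the diff does):

import shutil
from pathlib import Path

def move_path(src: Path, dst: Path) -> None:
    # Rename when possible; fall back to copy+delete across mounts.
    try:
        src.rename(dst)
    except OSError:
        shutil.move(str(src), str(dst))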
@@ -100,14 +99,14 @@ def is_safe(fname):
                 raise
 
     # delete temp directory and zipfile
-    if Path(zippath).exists():
-        os.unlink(zippath)
-    Path(tmpd).rmtree_p()
+    zippath.unlink(missing_ok=True)
+    shutil.rmtree(tmpd, ignore_errors=True)
+    return True
 
 
 def download_book(
     book: Book,
-    download_cache: str,
+    download_cache: Path,
     formats: list[str],
     *,
     force: bool,
@@ -124,13 +123,15 @@ def download_book(
     if "html" not in formats:
         formats.append("html")
 
-    book_dir = pathlib.Path(download_cache).joinpath(str(book.id))
-    optimized_dir = book_dir.joinpath("optimized")
-    unoptimized_dir = book_dir.joinpath("unoptimized")
+    book_dir = download_cache / str(book.id)
+    optimized_dir = book_dir / "optimized"
+    unoptimized_dir = book_dir / "unoptimized"
 
     unsuccessful_formats = []
     for book_format in formats:
-        unoptimized_fpath = unoptimized_dir.joinpath(fname_for(book, book_format))
-        optimized_fpath = optimized_dir.joinpath(archive_name_for(book, book_format))
+        unoptimized_fpath = unoptimized_dir / fname_for(book, book_format)
+        optimized_fpath = optimized_dir / archive_name_for(book, book_format)
 
         # check if already downloaded
         if (unoptimized_fpath.exists() or optimized_fpath.exists()) and not force:
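`joinpath()` and the `/` operator are interchangeable in `pathlib`; the diff standardizes on the operator. Both accept strings or further `Path` segments:

from pathlib import Path

book_dir = Path("dl-cache") / "1234"
assert book_dir / "optimized" == book_dir.joinpath("optimized")
assert Path("dl-cache").joinpath("1234", "optimized") == book_dir / "optimized"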
@@ -141,12 +142,10 @@ def download_book(
         if book_format == "html":
             for fpath in book_dir.iterdir():
                 if fpath.is_file() and fpath.suffix not in [".pdf", ".epub"]:
-                    fpath.unlink()
+                    fpath.unlink(missing_ok=True)
         else:
-            if unoptimized_fpath.exists():
-                unoptimized_fpath.unlink()
-            if optimized_fpath.exists():
-                optimized_fpath.unlink()
+            unoptimized_fpath.unlink(missing_ok=True)
+            optimized_fpath.unlink(missing_ok=True)
         # delete dirs which are empty
         for dir_name in [optimized_dir, unoptimized_dir]:
             if not dir_name.exists():
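`unlink(missing_ok=True)` (available since Python 3.8) collapses the exists-then-delete pair into one call and removes the race where the file vanishes between the check and the unlink. In isolation:

from pathlib import Path

fpath = Path("does-not-exist.txt")
fpath.unlink(missing_ok=True)  # silently does nothing

# The pre-3.8 idiom this replaces:
if fpath.exists():
    fpath.unlink()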
@@ -233,7 +232,7 @@ def download_book(
 
         # HTML files are *sometime* available as ZIP files
         if url.endswith(".zip"):
-            zpath = unoptimized_dir.joinpath(f"{fname_for(book, book_format)}.zip")
+            zpath = unoptimized_dir / f"{fname_for(book, book_format)}.zip"
 
             etag = get_etag_from_url(url)
             if s3_storage:
@@ -254,7 +253,11 @@ def download_book(
                 book.html_etag = etag  # type: ignore
                 book.save()
             # extract zipfile
-            handle_zipped_epub(zippath=zpath, book=book, dst_dir=unoptimized_dir)
+            handle_zipped_epub(
+                zippath=zpath,
+                book=book,
+                dst_dir=unoptimized_dir,
+            )
         else:
             if (
                 url.endswith(".htm")
@@ -329,10 +332,9 @@ def download_cover(book, book_dir, s3_storage, optimizer_version):
         etag = get_etag_from_url(url)
         downloaded_from_cache = False
         cover = f"{book.id}_cover_image.jpg"
-        if (
-            book_dir.joinpath("optimized").joinpath(cover).exists()
-            or book_dir.joinpath("unoptimized").joinpath(cover).exists()
-        ):
+        if (book_dir / "optimized" / cover).exists() or (
+            book_dir / "unoptimized" / cover
+        ).exists():
             logger.debug(f"Cover already exists for book #{book.id}")
             return
         if s3_storage:
@@ -343,25 +345,25 @@ def download_cover(book, book_dir, s3_storage, optimizer_version):
                 book=book,
                 etag=etag,
                 book_format="cover",
-                dest_dir=book_dir.joinpath("optimized"),
+                dest_dir=book_dir / "optimized",
                 s3_storage=s3_storage,
                 optimizer_version=optimizer_version,
             )
         if not downloaded_from_cache:
             logger.debug(f"Downloading {url}")
-            if download_file(url, book_dir.joinpath("unoptimized").joinpath(cover)):
+            if download_file(url, book_dir / "unoptimized" / cover):
                 book.cover_etag = etag
                 book.save()
     else:
         logger.debug(f"No Book Cover found for Book #{book.id}")
 
 
 def download_all_books(
-    download_cache: str,
+    download_cache: Path,
     concurrency: int,
     languages: list[str],
     formats: list[str],
-    only_books: list[str],
+    only_books: list[int],
     *,
     force: bool,
     s3_storage: KiwixStorage | None,
@@ -372,7 +374,7 @@ def download_all_books(
     )
 
     # ensure dir exist
-    Path(download_cache).mkdir_p()
+    download_cache.mkdir(parents=True, exist_ok=True)
 
     def backoff_busy_error_hdlr(details):
         logger.warning(
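`mkdir(parents=True, exist_ok=True)` is the `pathlib` spelling of `mkdir -p` (and of `path.py`'s `mkdir_p()`): create any missing parents and succeed if the directory already exists. For example:

from pathlib import Path

cache = Path("dl-cache") / "1234" / "unoptimized"
cache.mkdir(parents=True, exist_ok=True)  # creates the whole chain
cache.mkdir(parents=True, exist_ok=True)  # second call is a no-op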
24 changes: 15 additions & 9 deletions src/gutenberg2zim/entrypoint.py
@@ -1,9 +1,8 @@
 import logging
-import os
 import sys
+from pathlib import Path
 
 from docopt import docopt
-from path import Path
 
 from gutenberg2zim.checkdeps import check_dependencies
 from gutenberg2zim.constants import TMP_FOLDER_PATH, VERSION, logger
@@ -94,7 +93,12 @@ def main():
         arguments.get("--rdf-url")
         or "http://www.gutenberg.org/cache/epub/feeds/rdf-files.tar.bz2"
     )
-    dl_cache = arguments.get("--dl-folder") or os.path.join("dl-cache")
+
+    if dl_folder := arguments.get("--dl-folder"):
+        dl_cache = Path(dl_folder).resolve()
+    else:
+        dl_cache = Path("dl-cache").resolve()
+
     books_csv = arguments.get("--books") or ""
     zim_title = arguments.get("--zim-title")
     zim_desc = arguments.get("--zim-desc")
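The walrus form spells out the CLI fallback; the same behavior also fits the old one-liner shape, as in this equivalent sketch (the `pick_dl_cache` helper is illustrative, assuming docopt yields a string or None for --dl-folder):

from pathlib import Path

def pick_dl_cache(arg: str | None) -> Path:
    return Path(arg or "dl-cache").resolve()

assert pick_dl_cache(None) == Path("dl-cache").resolve()
assert pick_dl_cache("/tmp/cache") == Path("/tmp/cache").resolve()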
@@ -141,7 +145,7 @@ def main():
         }
     )
 
-    books = []
+    books: list[int] = []
     try:
         books_csv = books_csv.split(",")
 
@@ -151,7 +155,7 @@ def f(x):
         for i in books_csv:
             blst = f(i)
             if len(blst) > 1:
-                blst = range(blst[0], blst[1] + 1)
+                blst = list(range(blst[0], blst[1] + 1))
             books.extend(blst)
         books_csv = list(set(books))
     except Exception as e:
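`list(range(...))` keeps `blst` a real `list[int]` now that `books` is annotated, rather than a lazy `range`. For context, the surrounding loop expands selections like "3,10-12,99"; a self-contained sketch of that logic, with a hypothetical `f` standing in for the helper whose body this excerpt does not show:

def parse_books_csv(spec: str) -> list[int]:
    def f(x: str) -> list[int]:  # assumed behavior: "10-12" -> [10, 12]
        return [int(n) for n in x.split("-")]

    books: list[int] = []
    for item in spec.split(","):
        bounds = f(item)
        if len(bounds) > 1:
            books.extend(range(bounds[0], bounds[1] + 1))
        else:
            books.extend(bounds)
    return sorted(set(books))

assert parse_books_csv("3,10-12,99") == [3, 10, 11, 12, 99]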
@@ -219,20 +223,22 @@ def f(x):
     if do_zim:
         logger.info("BUILDING ZIM dynamically")
         build_zimfile(
-            output_folder=Path(one_lang_one_zim_folder or ".").abspath(),
+            output_folder=Path(one_lang_one_zim_folder).resolve()
+            if one_lang_one_zim_folder
+            else Path(".").resolve(),
             download_cache=dl_cache,
             concurrency=concurrency,
             languages=zim_lang,
             formats=formats,
             only_books=books,
-            force=force,
-            title_search=title_search,
-            add_bookshelves=bookshelves,
             s3_storage=s3_storage,
             optimizer_version=optimizer_version,
             zim_name=Path(zim_name).name if zim_name else None,
             title=zim_title,
             description=zim_desc,
             stats_filename=stats_filename,
             publisher=publisher,
+            force=force,
+            title_search=title_search,
+            add_bookshelves=bookshelves,
         )