Paths #42

Open · wants to merge 2 commits into master
10 changes: 5 additions & 5 deletions README.md
@@ -1,22 +1,22 @@
# Standardized Project Gutenberg Corpus
Easily generate a local, up-to-date copy of the Standardized Project Gutenberg Corpus (SPGC).

The Standardized Project Gutenberg Corpus was presented in

[A standardized Project Gutenberg corpus for statistical analysis of natural language and quantitative linguistics](https://arxiv.org/abs/1812.08092)
M. Gerlach, F. Font-Clos, arXiv:1812.08092, Dec 2018

acompanied by a 'frozen' version of the corpus (SPGC-2018-07-18) as a Zenodo dataset:
accompanied by a 'frozen' version of the corpus (SPGC-2018-07-18) as a Zenodo data set:

[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.2422560.svg)](https://doi.org/10.5281/zenodo.2422560)

SPGC-2018-07-18 contains the `tokens/` and `counts/` files of all books that were part of Project Gutenbergh (PG) as of Jul 18, 2018, matching exactly those used in the paper. Since then, a few more thousands books have been added to PG, so if you want to exactly reproduce the results of the paper, then you should use SPGC-2018-07-18.
SPGC-2018-07-18 contains the `tokens/` and `counts/` files of all books that were part of Project Gutenberg (PG) as of Jul 18, 2018, matching exactly those used in the paper. Since then, a few thousand more books have been added to PG, so if you want to reproduce the results of the paper exactly, you should use SPGC-2018-07-18.

For **most other use cases**, however, you probably want the most recent version of the corpus, in which case you should use this repository to **generate the corpus locally** on your computer. In particular, you will need to generate the corpus locally if you need to work with the original full-text files in `raw/` and `text/`, since these are not included in the SPGC-2018-07-18 Zenodo dataset.


## Installation
:warning: **Python 2.x is not supported.** Please make sure your system runs Python 3.x (https://pythonclock.org/).

Clone this repository

34 changes: 17 additions & 17 deletions get_data.py
@@ -11,9 +11,9 @@
from src.bookshelves import parse_bookshelves

import argparse
import os
import subprocess
import pickle
from pathlib import Path

if __name__ == '__main__':

@@ -27,21 +27,21 @@
"-m", "--mirror",
help="Path to the mirror folder that will be updated via rsync.",
default='data/.mirror/',
type=str)
type=Path)

# raw dir
parser.add_argument(
"-r", "--raw",
help="Path to the raw folder.",
default='data/raw/',
type=str)
type=Path)

# metadata dir
parser.add_argument(
"-M", "--metadata",
help="Path to the metadata folder.",
default='metadata/',
type=str)
type=Path)

# pattern matching
parser.add_argument(
@@ -62,7 +62,7 @@
action="store_true",
help="Overwrite files in raw.")

# quiet argument, to supress info
# quiet argument, to suppress info
parser.add_argument(
"-q", "--quiet",
action="store_true",
@@ -73,12 +73,12 @@
args = parser.parse_args()

# check that all dirs exist
if not os.path.isdir(args.mirror):
raise ValueError("The specified mirror directory does not exist.")
if not os.path.isdir(args.raw):
raise ValueError("The specified raw directory does not exist.")
if not os.path.isdir(args.metadata):
raise ValueError("The specified metadata directory does not exist.")
if not args.mirror.is_dir():
raise NotADirectoryError("The specified mirror directory does not exist.")
if not args.raw.is_dir():
raise NotADirectoryError("The specified raw directory does not exist.")
if not args.metadata.is_dir():
raise NotADirectoryError("The specified metadata directory does not exist.")

# Update the .mirror directory via rsync
# --------------------------------------
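For readers skimming the diff, the new pattern above is worth seeing in isolation: `argparse` builds `Path` objects directly through `type=Path` (the type conversion is also applied to string defaults), and the existence check becomes `Path.is_dir()` raising a `NotADirectoryError`. A minimal, self-contained sketch, with the argument name and default copied from the hunk above:

```python
from argparse import ArgumentParser
from pathlib import Path

# Sketch of the pattern used above: argparse converts the command-line string
# into a Path via type=Path, and Path.is_dir() replaces os.path.isdir().
parser = ArgumentParser()
parser.add_argument("-m", "--mirror", default="data/.mirror/", type=Path)
args = parser.parse_args([])  # use the default value, for illustration

if not args.mirror.is_dir():
    # NotADirectoryError is more specific than the previous generic ValueError
    raise NotADirectoryError("The specified mirror directory does not exist.")
```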
@@ -117,7 +117,7 @@
# Populate raw from mirror
# ------------------------
# We populate 'raw_dir' hardlinking to
# the hidden 'mirror_dir'. Names are standarized
# the hidden 'mirror_dir'. Names are standardized
# into PG12345_raw.txt form.
populate_raw_from_mirror(
mirror_dir=args.mirror,
@@ -132,16 +132,16 @@
# By default, update the whole metadata csv
# file each time new data is downloaded.
make_df_metadata(
path_xml=os.path.join(args.metadata, 'rdf-files.tar.bz2'),
path_out=os.path.join(args.metadata, 'metadata.csv'),
path_xml=args.metadata / 'rdf-files.tar.bz2',
path_out=args.metadata / 'metadata.csv',
update=args.keep_rdf
)

# Bookshelves
# -----------
# Get bookshelves and their respective books and titles as dicts
BS_dict, BS_num_to_category_str_dict = parse_bookshelves()
with open("metadata/bookshelves_ebooks_dict.pkl", 'wb') as fp:
with Path("metadata/bookshelves_ebooks_dict.pkl").open('wb') as fp:
pickle.dump(BS_dict, fp)
with open("metadata/bookshelves_categories_dict.pkl", 'wb') as fp:
pickle.dump(BS_num_to_category_str_dict, fp)
with Path("metadata/bookshelves_categories_dict.pkl").open('wb') as fp:
pickle.dump(BS_num_to_category_str_dict, fp)
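Two pathlib idioms appear in the block above: the `/` operator instead of `os.path.join`, and `Path.open` feeding `pickle.dump`. A short, self-contained sketch of both (the dictionary is a dummy payload, and the directory is created only so the snippet runs on its own):

```python
import pickle
from pathlib import Path

metadata_dir = Path("metadata")    # stands in for args.metadata
metadata_dir.mkdir(exist_ok=True)  # only so the sketch runs standalone

# The '/' operator replaces os.path.join for building child paths.
path_xml = metadata_dir / "rdf-files.tar.bz2"
path_out = metadata_dir / "metadata.csv"

# Path.open behaves like the built-in open, so pickle.dump works unchanged.
bookshelves = {"example_shelf": ["PG123", "PG456"]}  # dummy payload
with (metadata_dir / "bookshelves_ebooks_dict.pkl").open("wb") as fp:
    pickle.dump(bookshelves, fp)
```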
40 changes: 19 additions & 21 deletions process_data.py
@@ -5,12 +5,10 @@
M. Gerlach and F. Font-Clos

"""
import os
from os.path import join
import argparse
import glob
import ast
import pandas as pd
from pathlib import Path

from src.pipeline import process_book
from src.utils import get_langs_dict
@@ -26,33 +24,33 @@
"-r", "--raw",
help="Path to the raw-folder",
default='data/raw/',
type=str)
type=Path)
# text folder
parser.add_argument(
"-ote", "--output_text",
help="Path to text-output (text_dir)",
default='data/text/',
type=str)
type=Path)
# tokens folder
parser.add_argument(
"-oto", "--output_tokens",
help="Path to tokens-output (tokens_dir)",
default='data/tokens/',
type=str)
type=Path)
# counts folder
parser.add_argument(
"-oco", "--output_counts",
help="Path to counts-output (counts_dir)",
default='data/counts/',
type=str)
type=Path)
# pattern to specify subset of books
parser.add_argument(
"-p", "--pattern",
help="Patttern to specify a subset of books",
default='*',
type=str)

# quiet argument, to supress info
# quiet argument, to suppress info
parser.add_argument(
"-q", "--quiet",
action="store_true",
@@ -64,19 +62,19 @@
"-l", "--log_file",
help="Path to log file",
default=".log",
type=str)
type=Path)

# add arguments to parser
args = parser.parse_args()

# check whether the out-put directories exist
if os.path.isdir(args.output_text) is False:
if not args.output_text.is_dir():
raise ValueError("The directory for output of texts '%s' "
"does not exist" % (args.output_text))
if os.path.isdir(args.output_tokens) is False:
if not args.output_tokens.is_dir():
raise ValueError("The directory for output of tokens '%s' "
"does not exist" % (args.output_tokens))
if os.path.isdir(args.output_counts) is False:
if not args.output_counts.is_dir():
raise ValueError("The directory for output of counts '%s' "
"does not exist" % (args.output_counts))

@@ -88,16 +86,16 @@

# loop over all books in the raw-folder
pbooks = 0
for filename in glob.glob(join(args.raw, 'PG%s_raw.txt' % (args.pattern))):
# The process_books function will fail very rarely, whne
# a file tagged as UTf-8 is not really UTF-8. We kust
for file in args.raw.glob('PG%s_raw.txt' % (args.pattern)):
# The process_books function will fail very rarely, when
# a file tagged as UTF-8 is not really UTF-8. We just
# skip those books.
try:
# get PG_id
PG_id = filename.split("/")[-1].split("_")[0]
PG_id = file.name.split("_")[0]

# get language from metadata
# default is english
# default is English
language = "english"
# language is a string representing a list of languages codes
lang_id = ast.literal_eval(metadata.loc[PG_id, "language"])[0]
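For context, the language lookup just above parses a stringified list out of the metadata table before mapping it to a language name. A minimal sketch, assuming the field looks like `"['en']"` (the mapping below is an invented stand-in for whatever `get_langs_dict()` actually returns):

```python
import ast

# The 'language' field in metadata.csv stores a string that looks like a
# Python list of language codes, e.g. "['en']" or "['en', 'fr']".
raw_value = "['en']"                      # dummy value for illustration
lang_id = ast.literal_eval(raw_value)[0]  # -> 'en', the first listed code

# Hypothetical stand-in for the mapping returned by get_langs_dict();
# the real dictionary lives in src.utils and covers more languages.
langs_dict = {"en": "english", "de": "german", "fr": "french"}
language = langs_dict.get(lang_id, "english")  # fall back to the default
print(language)  # english
```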
@@ -106,7 +104,7 @@

# process the book: strip headers, tokenize, count
process_book(
path_to_raw_file=filename,
path_to_raw_file=file,
text_dir=args.output_text,
tokens_dir=args.output_tokens,
counts_dir=args.output_counts,
@@ -118,10 +116,10 @@
print("Processed %d books..." % pbooks, end="\r")
except UnicodeDecodeError:
if not args.quiet:
print("# WARNING: cannot process '%s' (encoding not UTF-8)" % filename)
print("# WARNING: cannot process '%s' (encoding not UTF-8)" % str(file))
except KeyError:
if not args.quiet:
print("# WARNING: metadata for '%s' not found" % filename)
print("# WARNING: metadata for '%s' not found" % str(file))
except Exception as e:
if not args.quiet:
print("# WARNING: cannot process '%s' (unkown error)" % filename)
print("# WARNING: cannot process '%s' (unkown error)" % str(file))
56 changes: 27 additions & 29 deletions src/bookshelves.py
@@ -1,10 +1,9 @@
# -*- coding: utf-8 -*-
"""Functions to download, parse and filter Gutenberg's bookshelves."""

import os
import glob
import numpy as np
import pandas as pd
from itertools import chain
from pathlib import Path
import shutil
import lxml.html
import subprocess

@@ -19,57 +18,56 @@ def get_bookshelves():

"""
sp_args = ["wget",
"--random-wait", "-r",
"-p", "--no-parent",
"-e", "robots=off",
"-U", "mozilla",
"--random-wait", "-r",
"-p", "--no-parent",
"-e", "robots=off",
"-U", "mozilla",
"https://www.gutenberg.org/ebooks/bookshelf/"
]
subprocess.call(sp_args)

# move it to metadata dir
sp_args = "mv www.gutenberg.org/ebooks/bookshelf/* metadata/bookshelves_html/"
subprocess.call(sp_args, shell=True)
new_dir = Path("metadata/bookshelves_html")
Path("www.gutenberg.org/ebooks/bookshelf").rename(new_dir)

# cleanup
sp_args = ["rm", "-rf", "www.gutenberg.org"]
subprocess.call(sp_args)
shutil.rmtree("www.gutenberg.org")

# In the new version of the website, and with these wget parameters (which
# also follow other links within the crawled page), we also download copies
# of the bookshelves with different orderings; remove them.
sp_args = ["rm", "-rf", "metadata/bookshelves_html/*.opds*"]
subprocess.call(sp_args)
sp_args = ["rm", "-rf", "metadata/bookshelves_html/*?sort*"]
subprocess.call(sp_args)
sp_args = ["rm", "-rf", "metadata/bookshelves_html/*?start*"]
subprocess.call(sp_args)
for file in chain(
new_dir.glob("*.opds*"),
new_dir.glob("*?sort*"),
new_dir.glob("*?start*")
):
file.unlink()

return None

def parse_bookshelves():
def parse_bookshelves(path=Path("metadata/bookshelves_html")):
"""
Parse the bookshelves html files.

Builds up a dictionary of bookshelf_category:list(book_ids) and
a dictionary of bookshelf_category:list(title_category)
from the individual html files of each bs.

Prints the errors.
"""
# parse the data
BS_paths = glob.glob("metadata/bookshelves_html/*")
BS = [path.split("/")[-1] for path in BS_paths]

(path / ".dummy").unlink() # prevent hidden dummy file to be parsed
BS_dict = {}
BS_num_to_category_str_dict = {}
for path in BS_paths:
bs = path.split("/")[-1]
for file in path.iterdir():
bs = file.name
BS_dict[bs] = []
with open(path, "r", encoding="UTF-8") as foo:
with file.open("r", encoding="UTF-8") as foo:
dom = lxml.html.fromstring(foo.read())
# select the url in href for all a tags(links)
for link in dom.xpath('//a/@href'):
# links to ebooks that are not searches
if link.find("ebooks") > -1 and link.find("search") == -1:
if "ebooks" in link and not "search" in link:
PGid = "PG"+link.split("/")[-1]
BS_dict[bs].append(PGid)
# get title of the category
@@ -78,7 +76,7 @@ def parse_bookshelves():
if len(title_categories) == 0:
# debug prints
print('No category title')
print(path, list(dom), dom.text_content())
print(file, list(dom), dom.text_content())
title_category = None
elif len(title_categories) == 1:
title_category = title_categories[0].text
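To make the shell-free replacements in this file concrete — `Path.rename` for `mv`, `shutil.rmtree` for `rm -rf`, and `unlink()` for the per-file deletions — here is a standalone sketch on an invented throwaway directory (all paths below are made up for the example; the real code operates on the `www.gutenberg.org` crawl and `metadata/bookshelves_html/`):

```python
import shutil
from itertools import chain
from pathlib import Path

# Build a throwaway tree so the sketch runs on its own.
crawl_dir = Path("example_crawl/ebooks/bookshelf")
crawl_dir.mkdir(parents=True, exist_ok=True)
(crawl_dir / "42").write_text("shelf page")
(crawl_dir / "42.opds").write_text("unwanted duplicate listing")

new_dir = Path("example_bookshelves_html")
if new_dir.exists():
    shutil.rmtree(new_dir)          # keep the sketch re-runnable
crawl_dir.rename(new_dir)           # replaces the shelled-out 'mv'
shutil.rmtree("example_crawl")      # replaces 'rm -rf www.gutenberg.org'

# unlink() drops the duplicate listings that the crawl also picks up.
for file in chain(new_dir.glob("*.opds*"),
                  new_dir.glob("*?sort*"),
                  new_dir.glob("*?start*")):
    file.unlink()
```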
19 changes: 8 additions & 11 deletions src/cleanup.py
@@ -1,9 +1,10 @@
# -*- coding: utf-8 -*-
"""Taken from https://github.com/c-w/gutenberg/."""
"""Based on https://github.com/c-w/gutenberg/."""

from __future__ import unicode_literals
import os
import io

from src.utils import get_PG_number


def cleanup(path, text_dir):
@@ -12,18 +13,14 @@ def cleanup(path, text_dir):

Parameters
----------
path : string
path : pathlib.Path
Path to the PG****_raw.txt file

"""
PG_number = path.split("/")[-1].split("_")[0][2:]
with io.open(path) as f:
text = f.read()

clean = strip_headers(text)
source_file = os.path.join(text_dir, "PG%s_text.txt" % PG_number)
with io.open(source_file, "w") as f:
f.write(clean)
text = path.read_text()
PG_number = get_PG_number(path)
source_file = text_dir.joinpath("PG%s_text.txt" % PG_number)
source_file.write_text(strip_headers(text))


############
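Finally, a compact sketch of the same read/strip/write flow using only pathlib; `strip_headers` and `get_PG_number` are replaced by trivial placeholders so the snippet stands alone:

```python
from pathlib import Path

def cleanup_sketch(path: Path, text_dir: Path) -> None:
    """Illustrative stand-in for cleanup(): read PG****_raw.txt and write
    the stripped text to PG****_text.txt inside text_dir."""
    # get_PG_number(path) in the real code; derived from the file name here.
    PG_number = path.name.split("_")[0][2:]  # 'PG12345_raw.txt' -> '12345'
    text = path.read_text()                  # replaces io.open(path).read()
    clean = text                             # strip_headers(text) in the real code
    out_file = text_dir.joinpath("PG%s_text.txt" % PG_number)
    out_file.write_text(clean)               # replaces the explicit open/write
```

It would presumably be called much like the real function, e.g. `cleanup_sketch(Path("data/raw/PG12345_raw.txt"), Path("data/text"))`.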