diff --git a/README.md b/README.md index 7bd926f..33d7668 100644 --- a/README.md +++ b/README.md @@ -1,22 +1,22 @@ # Standardized Project Gutenberg Corpus Easily generate a local, up-to-date copy of the Standardized Project Gutenberg Corpus (SPGC). -The Standardized Project Gutenberg Corpus was presented in +The Standardized Project Gutenberg Corpus was presented in -[A standardized Project Gutenberg corpus for statistical analysis of natural language and quantitative linguistics](https://arxiv.org/abs/1812.08092) +[A standardized Project Gutenberg corpus for statistical analysis of natural language and quantitative linguistics](https://arxiv.org/abs/1812.08092) M. Gerlach, F. Font-Clos, arXiv:1812.08092, Dec 2018 -acompanied by a 'frozen' version of the corpus (SPGC-2018-07-18) as a Zenodo dataset: +accompanied by a 'frozen' version of the corpus (SPGC-2018-07-18) as a Zenodo dataset: [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.2422560.svg)](https://doi.org/10.5281/zenodo.2422560) -SPGC-2018-07-18 contains the `tokens/` and `counts/` files of all books that were part of Project Gutenbergh (PG) as of Jul 18, 2018, matching exactly those used in the paper. Since then, a few more thousands books have been added to PG, so if you want to exactly reproduce the results of the paper, then you should use SPGC-2018-07-18. +SPGC-2018-07-18 contains the `tokens/` and `counts/` files of all books that were part of Project Gutenberg (PG) as of Jul 18, 2018, matching exactly those used in the paper. Since then, a few thousand more books have been added to PG, so if you want to reproduce the results of the paper exactly, you should use SPGC-2018-07-18. For **most other use cases**, however, you probably want the latest, most recent version of the corpus, in which case you should use this repository to **generate the corpus locally** on your computer. In particular, you will need to generate the corpus locally if you need to work with the original full text files in `raw/` and `text/`, since these are not included in the SPGC-2018-07-18 Zenodo dataset. ## Installation -:warning: **Python 2.x is not supported** Please make sure your system runs Python 3.x. (https://pythonclock.org/). +:warning: **Python 2.x is not supported.** Please make sure your system runs Python 3.x (https://pythonclock.org/).
Clone this repository diff --git a/get_data.py b/get_data.py index 5012e68..67a3c4f 100644 --- a/get_data.py +++ b/get_data.py @@ -11,9 +11,9 @@ from src.bookshelves import parse_bookshelves import argparse -import os import subprocess import pickle +from pathlib import Path if __name__ == '__main__': @@ -27,21 +27,21 @@ "-m", "--mirror", help="Path to the mirror folder that will be updated via rsync.", default='data/.mirror/', - type=str) + type=Path) # raw dir parser.add_argument( "-r", "--raw", help="Path to the raw folder.", default='data/raw/', - type=str) + type=Path) # metadata dir parser.add_argument( "-M", "--metadata", help="Path to the metadata folder.", default='metadata/', - type=str) + type=Path) # pattern matching parser.add_argument( @@ -62,7 +62,7 @@ action="store_true", help="Overwrite files in raw.") - # quiet argument, to supress info + # quiet argument, to suppress info parser.add_argument( "-q", "--quiet", action="store_true", @@ -73,12 +73,12 @@ args = parser.parse_args() # check that all dirs exist - if not os.path.isdir(args.mirror): - raise ValueError("The specified mirror directory does not exist.") - if not os.path.isdir(args.raw): - raise ValueError("The specified raw directory does not exist.") - if not os.path.isdir(args.metadata): - raise ValueError("The specified metadata directory does not exist.") + if not args.mirror.is_dir(): + raise NotADirectoryError("The specified mirror directory does not exist.") + if not args.raw.is_dir(): + raise NotADirectoryError("The specified raw directory does not exist.") + if not args.metadata.is_dir(): + raise NotADirectoryError("The specified metadata directory does not exist.") # Update the .mirror directory via rsync # -------------------------------------- @@ -117,7 +117,7 @@ # Populate raw from mirror # ------------------------ # We populate 'raw_dir' hardlinking to - # the hidden 'mirror_dir'. Names are standarized + # the hidden 'mirror_dir'. Names are standardized # into PG12345_raw.txt form. populate_raw_from_mirror( mirror_dir=args.mirror, @@ -132,8 +132,8 @@ # By default, update the whole metadata csv # file each time new data is downloaded. make_df_metadata( - path_xml=os.path.join(args.metadata, 'rdf-files.tar.bz2'), - path_out=os.path.join(args.metadata, 'metadata.csv'), + path_xml=args.metadata / 'rdf-files.tar.bz2', + path_out=args.metadata / 'metadata.csv', update=args.keep_rdf ) @@ -141,7 +141,7 @@ # ----------- # Get bookshelves and their respective books and titles as dicts BS_dict, BS_num_to_category_str_dict = parse_bookshelves() - with open("metadata/bookshelves_ebooks_dict.pkl", 'wb') as fp: + with Path("metadata/bookshelves_ebooks_dict.pkl").open('wb') as fp: pickle.dump(BS_dict, fp) - with open("metadata/bookshelves_categories_dict.pkl", 'wb') as fp: - pickle.dump(BS_num_to_category_str_dict, fp) \ No newline at end of file + with Path("metadata/bookshelves_categories_dict.pkl").open('wb') as fp: + pickle.dump(BS_num_to_category_str_dict, fp) diff --git a/process_data.py b/process_data.py index e21b92f..e1b0d70 100644 --- a/process_data.py +++ b/process_data.py @@ -5,12 +5,10 @@ M. Gerlach and F. 
Font-Clos """ -import os -from os.path import join import argparse -import glob import ast import pandas as pd +from pathlib import Path from src.pipeline import process_book from src.utils import get_langs_dict @@ -26,25 +24,25 @@ "-r", "--raw", help="Path to the raw-folder", default='data/raw/', - type=str) + type=Path) # text folder parser.add_argument( "-ote", "--output_text", help="Path to text-output (text_dir)", default='data/text/', - type=str) + type=Path) # tokens folder parser.add_argument( "-oto", "--output_tokens", help="Path to tokens-output (tokens_dir)", default='data/tokens/', - type=str) + type=Path) # counts folder parser.add_argument( "-oco", "--output_counts", help="Path to counts-output (counts_dir)", default='data/counts/', - type=str) + type=Path) # pattern to specify subset of books parser.add_argument( "-p", "--pattern", @@ -52,7 +50,7 @@ default='*', type=str) - # quiet argument, to supress info + # quiet argument, to suppress info parser.add_argument( "-q", "--quiet", action="store_true", @@ -64,19 +62,19 @@ "-l", "--log_file", help="Path to log file", default=".log", - type=str) + type=Path) # add arguments to parser args = parser.parse_args() # check whether the out-put directories exist - if os.path.isdir(args.output_text) is False: + if not args.output_text.is_dir(): raise ValueError("The directory for output of texts '%s' " "does not exist" % (args.output_text)) - if os.path.isdir(args.output_tokens) is False: + if not args.output_tokens.is_dir(): raise ValueError("The directory for output of tokens '%s' " "does not exist" % (args.output_tokens)) - if os.path.isdir(args.output_counts) is False: + if not args.output_counts.is_dir(): raise ValueError("The directory for output of counts '%s' " "does not exist" % (args.output_counts)) @@ -88,16 +86,16 @@ # loop over all books in the raw-folder pbooks = 0 - for filename in glob.glob(join(args.raw, 'PG%s_raw.txt' % (args.pattern))): - # The process_books function will fail very rarely, whne - # a file tagged as UTf-8 is not really UTF-8. We kust + for file in args.raw.glob('PG%s_raw.txt' % (args.pattern)): + # The process_book function will fail very rarely, when + # a file tagged as UTF-8 is not really UTF-8. We just + # skip those books. try: # get PG_id - PG_id = filename.split("/")[-1].split("_")[0] + PG_id = file.name.split("_")[0] # get language from metadata - # default is english + # default is English language = "english" # language is a string representing a list of languages codes lang_id = ast.literal_eval(metadata.loc[PG_id, "language"])[0] @@ -106,7 +104,7 @@ # process the book: strip headers, tokenize, count process_book( - path_to_raw_file=filename, + path_to_raw_file=file, text_dir=args.output_text, tokens_dir=args.output_tokens, counts_dir=args.output_counts, @@ -118,10 +116,10 @@ print("Processed %d books..."
% pbooks, end="\r") except UnicodeDecodeError: if not args.quiet: - print("# WARNING: cannot process '%s' (encoding not UTF-8)" % filename) + print("# WARNING: cannot process '%s' (encoding not UTF-8)" % str(file)) except KeyError: if not args.quiet: - print("# WARNING: metadata for '%s' not found" % filename) + print("# WARNING: metadata for '%s' not found" % str(file)) except Exception as e: if not args.quiet: - print("# WARNING: cannot process '%s' (unkown error)" % filename) + print("# WARNING: cannot process '%s' (unknown error)" % str(file)) diff --git a/src/bookshelves.py b/src/bookshelves.py index 62e31aa..3a72c40 100644 --- a/src/bookshelves.py +++ b/src/bookshelves.py @@ -1,10 +1,9 @@ # -*- coding: utf-8 -*- """Functions to download, parse and filter Gutenberg's bookshelves.""" -import os -import glob -import numpy as np -import pandas as pd +from itertools import chain +from pathlib import Path +import shutil import lxml.html import subprocess @@ -19,57 +18,56 @@ def get_bookshelves(): """ sp_args = ["wget", - "--random-wait", "-r", - "-p", "--no-parent", - "-e", "robots=off", - "-U", "mozilla", + "--random-wait", "-r", + "-p", "--no-parent", + "-e", "robots=off", + "-U", "mozilla", "https://www.gutenberg.org/ebooks/bookshelf/" ] subprocess.call(sp_args) # move it to metadata dir - sp_args = "mv www.gutenberg.org/ebooks/bookshelf/* metadata/bookshelves_html/" - subprocess.call(sp_args, shell=True) + new_dir = Path("metadata/bookshelves_html") + Path("www.gutenberg.org/ebooks/bookshelf").rename(new_dir) # cleanup - sp_args = ["rm", "-rf", "www.gutenberg.org"] - subprocess.call(sp_args) + shutil.rmtree("www.gutenberg.org") + # in the new version of the website and with these parameters of the wget (gets also other links within the crawled page) # we get also other files, copy of the bookshelves but with different ordering # remove them - sp_args = ["rm", "-rf", "metadata/bookshelves_html/*.opds*"] - subprocess.call(sp_args) - sp_args = ["rm", "-rf", "metadata/bookshelves_html/*?sort*"] - subprocess.call(sp_args) - sp_args = ["rm", "-rf", "metadata/bookshelves_html/*?start*"] - subprocess.call(sp_args) + for file in chain( + new_dir.glob("*.opds*"), + new_dir.glob("*?sort*"), + new_dir.glob("*?start*") + ): + file.unlink() + return None -def parse_bookshelves(): +def parse_bookshelves(path=Path("metadata/bookshelves_html")): """ Parse the bookshelves html files. - Builds up a dictionary of bookshelf_category:list(book_ids) and + Builds up a dictionary of bookshelf_category:list(book_ids) and a dictionary of bookshelf_category:list(title_category) from the individual html files of each bs. - + Prints the errors.
""" # parse the data - BS_paths = glob.glob("metadata/bookshelves_html/*") - BS = [path.split("/")[-1] for path in BS_paths] - + (path / ".dummy").unlink(missing_ok=True) # remove the hidden dummy file so it is not parsed BS_dict = {} BS_num_to_category_str_dict = {} - for path in BS_paths: - bs = path.split("/")[-1] + for file in path.iterdir(): + bs = file.name BS_dict[bs] = [] - with open(path, "r", encoding="UTF-8") as foo: + with file.open("r", encoding="UTF-8") as foo: dom = lxml.html.fromstring(foo.read()) # select the url in href for all a tags(links) for link in dom.xpath('//a/@href'): # links to ebooks that are not searches - if link.find("ebooks") > -1 and link.find("search") == -1: + if "ebooks" in link and "search" not in link: PGid = "PG"+link.split("/")[-1] BS_dict[bs].append(PGid) # get title of the category @@ -78,7 +76,7 @@ def parse_bookshelves(): if len(title_categories) == 0: # debug prints print('No category title') - print(path, list(dom), dom.text_content()) + print(file, list(dom), dom.text_content()) title_category = None elif len(title_categories) == 1: title_category = title_categories[0].text diff --git a/src/cleanup.py b/src/cleanup.py index 69967f6..aa151d6 100644 --- a/src/cleanup.py +++ b/src/cleanup.py @@ -1,9 +1,10 @@ # -*- coding: utf-8 -*- -"""Taken from https://github.com/c-w/gutenberg/.""" +"""Based on https://github.com/c-w/gutenberg/.""" from __future__ import unicode_literals import os -import io + +from src.utils import get_PG_number def cleanup(path, text_dir): @@ -12,18 +13,14 @@ def cleanup(path, text_dir): Parameters ---------- - path : string + path : pathlib.Path Path to the PG****_raw.txt file """ - PG_number = path.split("/")[-1].split("_")[0][2:] - with io.open(path) as f: - text = f.read() - - clean = strip_headers(text) - source_file = os.path.join(text_dir, "PG%s_text.txt" % PG_number) - with io.open(source_file, "w") as f: - f.write(clean) + text = path.read_text() + PG_number = get_PG_number(path) + source_file = text_dir / ("PG%s_text.txt" % PG_number) + source_file.write_text(strip_headers(text)) ############ diff --git a/src/metadataparser.py b/src/metadataparser.py index 16c15eb..df675af 100644 --- a/src/metadataparser.py +++ b/src/metadataparser.py @@ -5,13 +5,13 @@ Based on https://bitbucket.org/c-w/gutenberg/ """ -import os import re import tarfile import urllib import urllib.request import pandas as pd +from pathlib import Path import xml.etree.cElementTree as ElementTree try: import cPickle as pickle @@ -19,9 +19,9 @@ import pickle # The Python dict produced by this module -# PICKLEFILE = '../data/metadata/md.pickle.gz' +# PICKLEFILE = Path('../data/metadata/md.pickle.gz') # The catalog downloaded from Gutenberg -RDFFILES = '../data/metadata/rdf-files.tar.bz2' +RDFFILES = Path('../data/metadata/rdf-files.tar.bz2') META_FIELDS = ('id', 'author', 'title', 'downloads', 'formats', 'type', 'LCC', 'subjects', 'authoryearofbirth', 'authoryearofdeath', 'language' ) @@ -40,8 +40,8 @@ ''', re.IGNORECASE | re.VERBOSE) -def make_df_metadata(path_xml='../metadata/rdf-files.tar.bz2', - path_out='../metadata/metadata.csv', +def make_df_metadata(path_xml=Path('../metadata/rdf-files.tar.bz2'), + path_out=Path('../metadata/metadata.csv'), update=False): """ Write metadata in a csv. @@ -51,10 +51,10 @@ def make_df_metadata(path_xml='../metadata/rdf-files.tar.bz2', Parameters ---------- - path_xml : str + path_xml : pathlib.Path Location of the rdf-file.
If it does not exist, we download it from http://www.gutenberg.org/cache/epub/feeds/rdf-files.tar.bz2 - path_out : str + path_out : pathlib.Path Where to save csv-file. update : bool (False) Download the latest rdf-file even if it already @@ -111,7 +111,7 @@ def readmetadata(RDFFILES, update=False): http://www.gutenberg.org/wiki/Gutenberg:Help_on_Bibliographic_Record_Page """ - # if os.path.exists(PICKLEFILE): + # if PICKLEFILE.exists(): # metadata = pickle.load(gzip.open(PICKLEFILE, 'rb')) # else: metadata = {} @@ -136,7 +136,7 @@ def getrdfdata(RDFFILES, update=False): An etext meta-data definition. """ - if (not os.path.exists(RDFFILES)) or (update is True): + if update is True or not RDFFILES.exists(): # standard location of rdf files try: RDFURL = "http://www.gutenberg.org/cache/epub/feeds/rdf-files.tar.bz2" @@ -167,7 +167,7 @@ def parsemetadata(ebook): result = dict.fromkeys(META_FIELDS) # get etext no about = ebook.get('{%(rdf)s}about' % NS) - result['id'] = int(os.path.basename(about)) + result['id'] = int(Path(about).name) # author creator = ebook.find('.//{%(dc)s}creator' % NS) if creator is not None: diff --git a/src/metaquery.py b/src/metaquery.py index a9df30b..1878c77 100644 --- a/src/metaquery.py +++ b/src/metaquery.py @@ -9,12 +9,11 @@ """ -import os import pandas as pd import numpy as np from collections import Counter +from pathlib import Path import re -import glob class meta_query(object): @@ -24,11 +23,9 @@ def __init__(self, path='../metadata/metadata.csv', filter_exist=True): self.df = pd.read_csv(path) ## the dataframe on which we apply filters if filter_exist == True: ## filter the books for which we have the data - path_text = os.path.abspath(os.path.join(path,os.pardir,os.pardir,'data','text')) - list_files = [] - for file in list(glob.glob( path_text+'/PG*_text.txt' )): - list_files += [file] - list_ids = sorted([ h.split('/')[-1].split('_text')[0] for h in list_files ]) + path_text = Path(path).absolute().parents[1] / 'data' / 'text' + list_ids = [file.name.split('_text')[0] for file in path_text.glob('PG*_text.txt')] + df = self.df df_new = df[df['id'].isin(list_ids)] self.df = df_new @@ -108,11 +105,11 @@ def filter_subject(self,subject_sel,how='only'): ### TIME def filter_year(self,y_sel,hmin=20): ''' - We filter all books, where + We filter all books, where - authoryearofbirth <= y_sel - hmin - authoryearofdeath > y_sel - Note: - - 1842 books with only authoryearofbirth + Note: + - 1842 books with only authoryearofbirth - 847 books with only authoryearofdeath - 13996 books missing both ''' @@ -124,7 +121,7 @@ def filter_year(self,y_sel,hmin=20): ### AUTHOR def filter_author(self,s_sel): - s = self.df[ self.df['author'].str.contains(re.escape(s_sel),case=False).replace(np.nan,False)] + s = self.df[ self.df['author'].str.contains(re.escape(s_sel),case=False).replace(np.nan,False)] self.df = s ### Sort by the n most downloaded diff --git a/src/pipeline.py b/src/pipeline.py index 5e89c56..c75f41d 100644 --- a/src/pipeline.py +++ b/src/pipeline.py @@ -1,9 +1,8 @@ # -*- coding: utf-8 -*- +from src.utils import get_PG_number from .cleanup import strip_headers from .tokenizer import tokenize_text from collections import Counter -import io -import os def process_book( path_to_raw_file=None, @@ -14,7 +13,7 @@ def process_book( cleanup_f=strip_headers, overwrite_all=False, language="english", - log_file="" + log_file=None ): """ Process a book, from raw data to counts. 
@@ -31,68 +30,68 @@ def process_book( Overwrite policy ---------------- - By default a book is processed in full except if all the + By default a book is processed in full except if all the files already exist (raw,text,tokens and counts). The overwrite_all - keyword can cahnge this behaviour. + keyword can change this behaviour. Parameters ---------- overwrite_all : bool - If set to True, everything is processed regargless of existing files. + If set to True, everything is processed regardless of existing files. """ if text_dir is None: raise ValueError("You must specify a path to save the text files.") - + if tokens_dir is None: raise ValueError("You must specify a path to save the tokens files.") - + if counts_dir is None: raise ValueError("You must specify a path to save the counts files.") - + if path_to_raw_file is None: raise ValueError("You must specify a path to the raw file to process.") - - # get PG number - PG_number = path_to_raw_file.split("/")[-1].split("_")[0][2:] - - if overwrite_all or\ - (not os.path.isfile(os.path.join(text_dir,"PG%s_text.txt"%PG_number))) or \ - (not os.path.isfile(os.path.join(tokens_dir,"PG%s_tokens.txt"%PG_number))) or \ - (not os.path.isfile(os.path.join(counts_dir,"PG%s_counts.txt"%PG_number))): + + PG_number = get_PG_number(path_to_raw_file) + text_path = text_dir / ("PG%s_text.txt" % PG_number) + tokens_path = tokens_dir / ("PG%s_tokens.txt" % PG_number) + counts_path = counts_dir / ("PG%s_counts.txt" % PG_number) + + if overwrite_all or not \ + all(f.is_file() for f in [text_path, tokens_path, counts_path]): # read raw file - with io.open(path_to_raw_file, encoding="UTF-8") as f: - text = f.read() + text = path_to_raw_file.read_text(encoding="UTF-8") # clean it up clean = cleanup_f(text) # write text file - target_file = os.path.join(text_dir,"PG%s_text.txt"%PG_number) - with io.open(target_file,"w", encoding="UTF-8") as f: - f.write(clean) + text_path.write_text(clean, encoding="UTF-8") # compute tokens tokens = tokenize_f(clean, language=language) - + # write tokens file - target_file = os.path.join(tokens_dir,"PG%s_tokens.txt"%PG_number) - with io.open(target_file,"w", encoding="UTF-8") as f: - f.write("\n".join(tokens)+"\n") + tokens_path.write_text("\n".join(tokens)+"\n", encoding="UTF-8") # compute counts counts = Counter(tokens) - + # write counts file - target_file = os.path.join(counts_dir,"PG%s_counts.txt"%PG_number) - with io.open(target_file,"w", encoding="UTF-8") as f: - f.write("\n".join([w+"\t"+str(c) for w,c in counts.most_common()])+"\n") + counts_text = "\n".join([w+"\t"+str(c) for w,c in counts.most_common()])+"\n" + counts_path.write_text(counts_text, encoding="UTF-8") # write log info if log_file is not None - if log_file != "": - raw_nl = text.count("\n") - clean_nl = clean.count("\n") - L = len(tokens) - V = len(counts) - with io.open(log_file, "a") as f: - f.write("PG"+str(PG_number)+"\t"+language+"\t"+str(raw_nl)+"\t"+str(clean_nl)+"\t"+str(L)+"\t"+str(V)+"\n") - + if log_file is None: + return + + log_data = [ + "PG"+PG_number, + language, + text.count("\n"), # raw_nl + clean.count("\n"), # clean_nl + len(tokens), # L (number of tokens) + len(counts), # V (vocabulary size) + ] + with log_file.open("a") as f: + f.write('\t'.join(map(str, log_data))+"\n") + diff --git a/src/tokenizer.py b/src/tokenizer.py index b25f21e..05acefa 100644 --- a/src/tokenizer.py +++ b/src/tokenizer.py @@ -13,10 +13,10 @@ def tokenize_text(text, language="english"): '''Tokenize a string into a list of tokens. - Use NLTK's Treebankwordtokenizer. + Use NLTK's TreebankWordTokenizer.
Note that we first split into sentences using NLTK's sent_tokenize. We additionally call a filtering function to remove un-wanted tokens. - + IN: - text, str OUT: @@ -24,10 +24,10 @@ ''' ## list of tokens list_tokens = [] - + ## split text into sentences sentences=sent_tokenize(text, language=language) - + ## define the tokenizer tokenizer = TreebankWordTokenizer() ## loop over all sentences diff --git a/src/utils.py b/src/utils.py index ceea2ea..360a756 100644 --- a/src/utils.py +++ b/src/utils.py @@ -1,12 +1,12 @@ # -*- coding: utf-8 -*- -import os -import shutil -import subprocess -import glob + +import re + +NUMBER_RE = re.compile(r'\d+') def get_langs_dict(): """ - A dictionary mapping languages codes to full languages names + A dictionary mapping language codes to full language names """ langs_dict = { "cs": "czech", @@ -28,33 +28,22 @@ } return langs_dict -def get_PG_number(string): +def get_PG_number(file): """ Simply gets the PG number from different possible text files. Patterns are: 12345-0.txt or pg12345.txt.utf8 """ - # 12345-0.txt - if string.find("-0.txt")>-1: - PG_number = string.replace("-0.txt","") - - # pg12345.txt.utf8 - elif string.find(".txt.utf8")>-1: - PG_number = string.replace(".txt.utf8","").replace("pg","") + # .stem strips only the final suffix ('.utf8' or '.txt'); any remaining '.txt' has no digits, so the regex still finds the PG number + PG_number = NUMBER_RE.search(file.stem) + assert PG_number is not None, "no PG number found in %s" % file + return PG_number.group() - if not PG_number.isnumeric(): - print(string) - print(PG_number,"\n") - assert PG_number.isnumeric() - return PG_number - -def list_duplicates_in_mirror( - mirror_dir = None, - ): +def list_duplicates_in_mirror(mirror_dir): """ Look for duplicates in 'mirror_dir', and list them. Typical case is, there's two files corresponding to the - same PG identificator: + same PG identifier: 1) mirror/1/2/3/4/12345/12345-0.txt 2) mirror/cache/epub/12345/pg12345.txt.utf-8 @@ -62,22 +51,20 @@ We populate 1) and list 2) as a duplicate """ dups_list = [] - for dirName, subdirList, fileList in os.walk(mirror_dir): - for matchpath in glob.iglob(os.path.join(dirName,"*-0.txt")): - fname = matchpath.split("/")[-1] - # fname must have exactly one "." and one "-" - if (len(fname.split("."))==2 and len(fname.split("-"))==2): - PGnumber = get_PG_number(fname) - possible_duplicate = os.path.join(mirror_dir,"cache","epub",PGnumber,"pg"+PGnumber+".txt.utf8") - if os.path.isfile(possible_duplicate): - dups_list.append(possible_duplicate) + for file in mirror_dir.rglob("*-0.txt"): + # file.name must have exactly one "." and one "-" + if (file.name.count(".") == 1 and file.name.count("-") == 1): + PGnumber = get_PG_number(file) + possible_duplicate = mirror_dir / "cache" / "epub" / PGnumber / ("pg"+PGnumber+".txt.utf8") + if possible_duplicate.is_file(): + dups_list.append(possible_duplicate) return dups_list -def populate_raw_from_mirror(mirror_dir=None, - raw_dir=None, +def populate_raw_from_mirror(mirror_dir, + raw_dir, + dups_list, overwrite=False, - dups_list=None, quiet=False): """ Populate the raw/ directory using the .mirror/ directory. @@ -90,34 +77,27 @@ Parameters ---------- + mirror_dir : pathlib.Path + raw_dir : pathlib.Path overwrite : bool Whether to overwrite files in raw. - dups_list : list of strings + dups_list : list of pathlib.Path A list of duplicates produced by list_duplicates_in_mirror. Files in this list are not copied into raw.
""" - for dirName, subdirList, fileList in os.walk(mirror_dir): - # patterns to match are 12345-0.txt or pg12345.txt.utf8 - for matchpath in glob.iglob(os.path.join(dirName, "[p123456789][g0123456789][0-9]*")): - fname = matchpath.split("/")[-1] - # check that file is not in dups_list - if matchpath not in dups_list: - # avoid files with more "." or "-" than expected - if (len(fname.split("."))==2 and len(fname.split("-"))==2 and fname[-6::]=="-0.txt")\ - or (len(fname.split("."))==3 and len(fname.split("-"))==1 and fname[-9::]==".txt.utf8"): - # get PG number - PGnumber = get_PG_number(fname) - - source = os.path.join(dirName, fname) - target = os.path.join(raw_dir, "PG"+PGnumber+"_raw.txt") - - if (not os.path.isfile(target)) or overwrite: - subprocess.call(["ln", "-f", source, target]) - - # if file was not in dupes list and we are not quiet - elif not quiet: - print("# WARNING: file %s skipped due to duplication" % fname) - - - + # patterns to match are 12345-0.txt or pg12345.txt.utf8 + for file in mirror_dir.rglob("[p1-9][g0-9][0-9]*"): + # check that file is not in dups_list + if file not in dups_list: + # avoid files with more "." or "-" than expected, and keep only UTF-8 text files + if (file.name.count(".")==1 and file.name.count("-")==1 and file.name.endswith("-0.txt")) \ + or (file.name.count(".")==2 and file.name.count("-")==0 and file.name.endswith(".txt.utf8")): + PGnumber = get_PG_number(file) + target = raw_dir / ("PG" + PGnumber + "_raw.txt") + if overwrite or not target.is_file(): + target.hardlink_to(file) + + # the file is a known duplicate: warn unless quiet + elif not quiet: + print("# WARNING: file %s skipped due to duplication" % file.name)
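For anyone trying out the pathlib-based refactor above, here is a minimal usage sketch (not part of the diff itself) of how the refactored `process_book` is expected to be called once the patch is applied. Directory names follow the repository defaults, and the loop is simply a trimmed-down version of what `process_data.py` already does:

```python
from pathlib import Path

from src.pipeline import process_book

# Repository defaults; the directories must already exist
# (process_data.py refuses to run if they do not).
raw_dir = Path("data/raw")
text_dir = Path("data/text")
tokens_dir = Path("data/tokens")
counts_dir = Path("data/counts")

for raw_file in raw_dir.glob("PG*_raw.txt"):
    try:
        process_book(
            path_to_raw_file=raw_file,    # now a pathlib.Path, not a str
            text_dir=text_dir,
            tokens_dir=tokens_dir,
            counts_dir=counts_dir,
            language="english",
            log_file=Path(".log"),        # pass None to skip the log entry
        )
    except UnicodeDecodeError:
        # a few files tagged as UTF-8 are not valid UTF-8; skip them,
        # just as process_data.py does
        print("# WARNING: cannot process '%s'" % raw_file.name)
```

Note that the hard-linking step in `populate_raw_from_mirror` now relies on `Path.hardlink_to`, which is only available in Python 3.10 and newer, so this patch effectively tightens the README's "Python 3.x" requirement.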