diff --git a/README.md b/README.md index 7bd926f..33d7668 100644 --- a/README.md +++ b/README.md @@ -1,22 +1,22 @@ # Standardized Project Gutenberg Corpus Easily generate a local, up-to-date copy of the Standardized Project Gutenberg Corpus (SPGC). -The Standardized Project Gutenberg Corpus was presented in +The Standardized Project Gutenberg Corpus was presented in -[A standardized Project Gutenberg corpus for statistical analysis of natural language and quantitative linguistics](https://arxiv.org/abs/1812.08092) +[A standardized Project Gutenberg corpus for statistical analysis of natural language and quantitative linguistics](https://arxiv.org/abs/1812.08092) M. Gerlach, F. Font-Clos, arXiv:1812.08092, Dec 2018 -acompanied by a 'frozen' version of the corpus (SPGC-2018-07-18) as a Zenodo dataset: +accompanied by a 'frozen' version of the corpus (SPGC-2018-07-18) as a Zenodo dataset: [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.2422560.svg)](https://doi.org/10.5281/zenodo.2422560) -SPGC-2018-07-18 contains the `tokens/` and `counts/` files of all books that were part of Project Gutenbergh (PG) as of Jul 18, 2018, matching exactly those used in the paper. Since then, a few more thousands books have been added to PG, so if you want to exactly reproduce the results of the paper, then you should use SPGC-2018-07-18. +SPGC-2018-07-18 contains the `tokens/` and `counts/` files of all books that were part of Project Gutenberg (PG) as of Jul 18, 2018, matching exactly those used in the paper. Since then, a few thousand more books have been added to PG, so if you want to reproduce the results of the paper exactly, you should use SPGC-2018-07-18. For **most other use cases**, however, you probably want the latest, most recent version of the corpus, in which case you should use this repository to **generate the corpus locally** on your computer. In particular, you will need to generate the corpus locally if you need to work with the original full text files in `raw/` and `text/`, since these are not included in the SPGC-2018-07-18 Zenodo dataset. ## Installation -:warning: **Python 2.x is not supported** Please make sure your system runs Python 3.x. (https://pythonclock.org/). +:warning: **Python 2.x is not supported.** Please make sure your system runs Python 3.x (https://pythonclock.org/).
Clone this repository diff --git a/get_data.py b/get_data.py index 5012e68..67a3c4f 100644 --- a/get_data.py +++ b/get_data.py @@ -11,9 +11,9 @@ from src.bookshelves import parse_bookshelves import argparse -import os import subprocess import pickle +from pathlib import Path if __name__ == '__main__': @@ -27,21 +27,21 @@ "-m", "--mirror", help="Path to the mirror folder that will be updated via rsync.", default='data/.mirror/', - type=str) + type=Path) # raw dir parser.add_argument( "-r", "--raw", help="Path to the raw folder.", default='data/raw/', - type=str) + type=Path) # metadata dir parser.add_argument( "-M", "--metadata", help="Path to the metadata folder.", default='metadata/', - type=str) + type=Path) # pattern matching parser.add_argument( @@ -62,7 +62,7 @@ action="store_true", help="Overwrite files in raw.") - # quiet argument, to supress info + # quiet argument, to suppress info parser.add_argument( "-q", "--quiet", action="store_true", @@ -73,12 +73,12 @@ args = parser.parse_args() # check that all dirs exist - if not os.path.isdir(args.mirror): - raise ValueError("The specified mirror directory does not exist.") - if not os.path.isdir(args.raw): - raise ValueError("The specified raw directory does not exist.") - if not os.path.isdir(args.metadata): - raise ValueError("The specified metadata directory does not exist.") + if not args.mirror.is_dir(): + raise NotADirectoryError("The specified mirror directory does not exist.") + if not args.raw.is_dir(): + raise NotADirectoryError("The specified raw directory does not exist.") + if not args.metadata.is_dir(): + raise NotADirectoryError("The specified metadata directory does not exist.") # Update the .mirror directory via rsync # -------------------------------------- @@ -117,7 +117,7 @@ # Populate raw from mirror # ------------------------ # We populate 'raw_dir' hardlinking to - # the hidden 'mirror_dir'. Names are standarized + # the hidden 'mirror_dir'. Names are standardized # into PG12345_raw.txt form. populate_raw_from_mirror( mirror_dir=args.mirror, @@ -132,8 +132,8 @@ # By default, update the whole metadata csv # file each time new data is downloaded. make_df_metadata( - path_xml=os.path.join(args.metadata, 'rdf-files.tar.bz2'), - path_out=os.path.join(args.metadata, 'metadata.csv'), + path_xml=args.metadata / 'rdf-files.tar.bz2', + path_out=args.metadata / 'metadata.csv', update=args.keep_rdf ) @@ -141,7 +141,7 @@ # ----------- # Get bookshelves and their respective books and titles as dicts BS_dict, BS_num_to_category_str_dict = parse_bookshelves() - with open("metadata/bookshelves_ebooks_dict.pkl", 'wb') as fp: + with Path("metadata/bookshelves_ebooks_dict.pkl").open('wb') as fp: pickle.dump(BS_dict, fp) - with open("metadata/bookshelves_categories_dict.pkl", 'wb') as fp: - pickle.dump(BS_num_to_category_str_dict, fp) \ No newline at end of file + with Path("metadata/bookshelves_categories_dict.pkl").open('wb') as fp: + pickle.dump(BS_num_to_category_str_dict, fp) diff --git a/process_data.py b/process_data.py index e21b92f..e1b0d70 100644 --- a/process_data.py +++ b/process_data.py @@ -5,12 +5,10 @@ M. Gerlach and F. 
Font-Clos """ -import os -from os.path import join import argparse -import glob import ast import pandas as pd +from pathlib import Path from src.pipeline import process_book from src.utils import get_langs_dict @@ -26,25 +24,25 @@ "-r", "--raw", help="Path to the raw-folder", default='data/raw/', - type=str) + type=Path) # text folder parser.add_argument( "-ote", "--output_text", help="Path to text-output (text_dir)", default='data/text/', - type=str) + type=Path) # tokens folder parser.add_argument( "-oto", "--output_tokens", help="Path to tokens-output (tokens_dir)", default='data/tokens/', - type=str) + type=Path) # counts folder parser.add_argument( "-oco", "--output_counts", help="Path to counts-output (counts_dir)", default='data/counts/', - type=str) + type=Path) # pattern to specify subset of books parser.add_argument( "-p", "--pattern", @@ -52,7 +50,7 @@ default='*', type=str) - # quiet argument, to supress info + # quiet argument, to suppress info parser.add_argument( "-q", "--quiet", action="store_true", @@ -64,19 +62,19 @@ "-l", "--log_file", help="Path to log file", default=".log", - type=str) + type=Path) # add arguments to parser args = parser.parse_args() # check whether the out-put directories exist - if os.path.isdir(args.output_text) is False: + if not args.output_text.is_dir(): raise ValueError("The directory for output of texts '%s' " "does not exist" % (args.output_text)) - if os.path.isdir(args.output_tokens) is False: + if not args.output_tokens.is_dir(): raise ValueError("The directory for output of tokens '%s' " "does not exist" % (args.output_tokens)) - if os.path.isdir(args.output_counts) is False: + if not args.output_counts.is_dir(): raise ValueError("The directory for output of counts '%s' " "does not exist" % (args.output_counts)) @@ -88,16 +86,16 @@ # loop over all books in the raw-folder pbooks = 0 - for filename in glob.glob(join(args.raw, 'PG%s_raw.txt' % (args.pattern))): - # The process_books function will fail very rarely, whne - # a file tagged as UTf-8 is not really UTF-8. We kust + for file in args.raw.glob('PG%s_raw.txt' % (args.pattern)): + # The process_book function will fail very rarely, when + # a file tagged as UTF-8 is not really UTF-8. We just + # skip those books. try: # get PG_id - PG_id = filename.split("/")[-1].split("_")[0] + PG_id = file.name.split("_")[0] # get language from metadata - # default is english + # default is English language = "english" # language is a string representing a list of languages codes lang_id = ast.literal_eval(metadata.loc[PG_id, "language"])[0] @@ -106,7 +104,7 @@ # process the book: strip headers, tokenize, count process_book( - path_to_raw_file=filename, + path_to_raw_file=file, text_dir=args.output_text, tokens_dir=args.output_tokens, counts_dir=args.output_counts, @@ -118,10 +116,10 @@ print("Processed %d books..."
% pbooks, end="\r") except UnicodeDecodeError: if not args.quiet: - print("# WARNING: cannot process '%s' (encoding not UTF-8)" % filename) + print("# WARNING: cannot process '%s' (encoding not UTF-8)" % str(file)) except KeyError: if not args.quiet: - print("# WARNING: metadata for '%s' not found" % filename) + print("# WARNING: metadata for '%s' not found" % str(file)) except Exception as e: if not args.quiet: - print("# WARNING: cannot process '%s' (unkown error)" % filename) + print("# WARNING: cannot process '%s' (unknown error)" % str(file)) diff --git a/src/bookshelves.py b/src/bookshelves.py index 62e31aa..3a72c40 100644 --- a/src/bookshelves.py +++ b/src/bookshelves.py @@ -1,10 +1,9 @@ # -*- coding: utf-8 -*- """Functions to download, parse and filter Gutenberg's bookshelves.""" -import os -import glob -import numpy as np -import pandas as pd +from itertools import chain +from pathlib import Path +import shutil import lxml.html import subprocess @@ -19,57 +18,56 @@ def get_bookshelves(): """ sp_args = ["wget", - "--random-wait", "-r", - "-p", "--no-parent", - "-e", "robots=off", - "-U", "mozilla", + "--random-wait", "-r", + "-p", "--no-parent", + "-e", "robots=off", + "-U", "mozilla", "https://www.gutenberg.org/ebooks/bookshelf/" ] subprocess.call(sp_args) # move it to metadata dir - sp_args = "mv www.gutenberg.org/ebooks/bookshelf/* metadata/bookshelves_html/" - subprocess.call(sp_args, shell=True) + new_dir = Path("metadata/bookshelves_html") + Path("www.gutenberg.org/ebooks/bookshelf").rename(new_dir) # cleanup - sp_args = ["rm", "-rf", "www.gutenberg.org"] - subprocess.call(sp_args) + shutil.rmtree("www.gutenberg.org") + # in the new version of the website and with these parameters of the wget (gets also other links within the crawled page) # we get also other files, copy of the bookshelves but with different ordering # remove them - sp_args = ["rm", "-rf", "metadata/bookshelves_html/*.opds*"] - subprocess.call(sp_args) - sp_args = ["rm", "-rf", "metadata/bookshelves_html/*?sort*"] - subprocess.call(sp_args) - sp_args = ["rm", "-rf", "metadata/bookshelves_html/*?start*"] - subprocess.call(sp_args) + for file in chain( + new_dir.glob("*.opds*"), + new_dir.glob("*?sort*"), + new_dir.glob("*?start*") + ): + file.unlink() + return None -def parse_bookshelves(): +def parse_bookshelves(path=Path("metadata/bookshelves_html")): """ Parse the bookshelves html files. - Builds up a dictionary of bookshelf_category:list(book_ids) and + Builds up a dictionary of bookshelf_category:list(book_ids) and a dictionary of bookshelf_category:list(title_category) from the individual html files of each bs. - + Prints the errors.
""" # parse the data - BS_paths = glob.glob("metadata/bookshelves_html/*") - BS = [path.split("/")[-1] for path in BS_paths] - + (path / ".dummy").unlink(missing_ok=True) # remove the hidden dummy file so it is not parsed BS_dict = {} BS_num_to_category_str_dict = {} - for path in BS_paths: - bs = path.split("/")[-1] + for file in path.iterdir(): + bs = file.name BS_dict[bs] = [] - with open(path, "r", encoding="UTF-8") as foo: + with file.open("r", encoding="UTF-8") as foo: dom = lxml.html.fromstring(foo.read()) # select the url in href for all a tags(links) for link in dom.xpath('//a/@href'): # links to ebooks that are not searches - if link.find("ebooks") > -1 and link.find("search") == -1: + if "ebooks" in link and "search" not in link: PGid = "PG"+link.split("/")[-1] BS_dict[bs].append(PGid) # get title of the category @@ -78,7 +76,7 @@ def parse_bookshelves(): if len(title_categories) == 0: # debug prints print('No category title') - print(path, list(dom), dom.text_content()) + print(file, list(dom), dom.text_content()) title_category = None elif len(title_categories) == 1: title_category = title_categories[0].text diff --git a/src/cleanup.py b/src/cleanup.py index 69967f6..aa151d6 100644 --- a/src/cleanup.py +++ b/src/cleanup.py @@ -1,9 +1,10 @@ # -*- coding: utf-8 -*- -"""Taken from https://github.com/c-w/gutenberg/.""" +"""Based on https://github.com/c-w/gutenberg/.""" from __future__ import unicode_literals import os -import io + +from src.utils import get_PG_number def cleanup(path, text_dir): @@ -12,18 +13,14 @@ def cleanup(path, text_dir): Parameters ---------- - path : string + path : pathlib.Path Path to the PG****_raw.txt file """ - PG_number = path.split("/")[-1].split("_")[0][2:] - with io.open(path) as f: - text = f.read() - - clean = strip_headers(text) - source_file = os.path.join(text_dir, "PG%s_text.txt" % PG_number) - with io.open(source_file, "w") as f: - f.write(clean) + text = path.read_text() + PG_number = get_PG_number(path) + source_file = text_dir / ("PG%s_text.txt" % PG_number) + source_file.write_text(strip_headers(text)) ############ diff --git a/src/metadataparser.py b/src/metadataparser.py index 16c15eb..df675af 100644 --- a/src/metadataparser.py +++ b/src/metadataparser.py @@ -5,13 +5,13 @@ Based on https://bitbucket.org/c-w/gutenberg/ """ -import os import re import tarfile import urllib import urllib.request import pandas as pd +from pathlib import Path import xml.etree.cElementTree as ElementTree try: import cPickle as pickle @@ -19,9 +19,9 @@ import pickle # The Python dict produced by this module -# PICKLEFILE = '../data/metadata/md.pickle.gz' +# PICKLEFILE = Path('../data/metadata/md.pickle.gz') # The catalog downloaded from Gutenberg -RDFFILES = '../data/metadata/rdf-files.tar.bz2' +RDFFILES = Path('../data/metadata/rdf-files.tar.bz2') META_FIELDS = ('id', 'author', 'title', 'downloads', 'formats', 'type', 'LCC', 'subjects', 'authoryearofbirth', 'authoryearofdeath', 'language' ) @@ -40,8 +40,8 @@ ''', re.IGNORECASE | re.VERBOSE) -def make_df_metadata(path_xml='../metadata/rdf-files.tar.bz2', - path_out='../metadata/metadata.csv', +def make_df_metadata(path_xml=Path('../metadata/rdf-files.tar.bz2'), + path_out=Path('../metadata/metadata.csv'), update=False): """ Write metadata in a csv. @@ -51,10 +51,10 @@ def make_df_metadata(path_xml='../metadata/rdf-files.tar.bz2', Parameters ---------- - path_xml : str + path_xml : pathlib.Path Location of the rdf-file.
If it does not exist, we download it from http://www.gutenberg.org/cache/epub/feeds/rdf-files.tar.bz2 - path_out : str + path_out : pathlib.Path Where to save csv-file. update : bool (False) Download the latest rdf-file even if it already @@ -111,7 +111,7 @@ def readmetadata(RDFFILES, update=False): http://www.gutenberg.org/wiki/Gutenberg:Help_on_Bibliographic_Record_Page """ - # if os.path.exists(PICKLEFILE): + # if PICKLEFILE.exists(): # metadata = pickle.load(gzip.open(PICKLEFILE, 'rb')) # else: metadata = {} @@ -136,7 +136,7 @@ def getrdfdata(RDFFILES, update=False): An etext meta-data definition. """ - if (not os.path.exists(RDFFILES)) or (update is True): + if update is True or not RDFFILES.exists(): # standard location of rdf files try: RDFURL = "http://www.gutenberg.org/cache/epub/feeds/rdf-files.tar.bz2" @@ -167,7 +167,7 @@ def parsemetadata(ebook): result = dict.fromkeys(META_FIELDS) # get etext no about = ebook.get('{%(rdf)s}about' % NS) - result['id'] = int(os.path.basename(about)) + result['id'] = int(Path(about).name) # author creator = ebook.find('.//{%(dc)s}creator' % NS) if creator is not None: diff --git a/src/metaquery.py b/src/metaquery.py index a9df30b..1878c77 100644 --- a/src/metaquery.py +++ b/src/metaquery.py @@ -9,12 +9,11 @@ """ -import os import pandas as pd import numpy as np from collections import Counter +from pathlib import Path import re -import glob class meta_query(object): @@ -24,11 +23,9 @@ def __init__(self, path='../metadata/metadata.csv', filter_exist=True): self.df = pd.read_csv(path) ## the dataframe on which we apply filters if filter_exist == True: ## filter the books for which we have the data - path_text = os.path.abspath(os.path.join(path,os.pardir,os.pardir,'data','text')) - list_files = [] - for file in list(glob.glob( path_text+'/PG*_text.txt' )): - list_files += [file] - list_ids = sorted([ h.split('/')[-1].split('_text')[0] for h in list_files ]) + path_text = Path(path).absolute().parents[1] / 'data' / 'text' + list_ids = [file.name.split('_text')[0] for file in path_text.glob('PG*_text.txt')] + df = self.df df_new = df[df['id'].isin(list_ids)] self.df = df_new @@ -108,11 +105,11 @@ def filter_subject(self,subject_sel,how='only'): ### TIME def filter_year(self,y_sel,hmin=20): ''' - We filter all books, where + We filter all books, where - authoryearofbirth <= y_sel - hmin - authoryearofdeath > y_sel - Note: - - 1842 books with only authoryearofbirth + Note: + - 1842 books with only authoryearofbirth - 847 books with only authoryearofdeath - 13996 books missing both ''' @@ -124,7 +121,7 @@ def filter_year(self,y_sel,hmin=20): ### AUTHOR def filter_author(self,s_sel): - s = self.df[ self.df['author'].str.contains(re.escape(s_sel),case=False).replace(np.nan,False)] + s = self.df[ self.df['author'].str.contains(re.escape(s_sel),case=False).replace(np.nan,False)] self.df = s ### Sort by the n most downloaded diff --git a/src/pipeline.py b/src/pipeline.py index 5e89c56..c75f41d 100644 --- a/src/pipeline.py +++ b/src/pipeline.py @@ -1,9 +1,8 @@ # -*- coding: utf-8 -*- +from src.utils import get_PG_number from .cleanup import strip_headers from .tokenizer import tokenize_text from collections import Counter -import io -import os def process_book( path_to_raw_file=None, @@ -14,7 +13,7 @@ def process_book( cleanup_f=strip_headers, overwrite_all=False, language="english", - log_file="" + log_file=None ): """ Process a book, from raw data to counts. 
@@ -31,68 +30,68 @@ def process_book( Overwrite policy ---------------- - By default a book is processed in full except if all the + By default a book is processed in full except if all the files already exist (raw,text,tokens and counts). The overwrite_all - keyword can cahnge this behaviour. + keyword can change this behaviour. Parameters ---------- overwrite_all : bool - If set to True, everything is processed regargless of existing files. + If set to True, everything is processed regardless of existing files. """ if text_dir is None: raise ValueError("You must specify a path to save the text files.") - + if tokens_dir is None: raise ValueError("You must specify a path to save the tokens files.") - + if counts_dir is None: raise ValueError("You must specify a path to save the counts files.") - + if path_to_raw_file is None: raise ValueError("You must specify a path to the raw file to process.") - - # get PG number - PG_number = path_to_raw_file.split("/")[-1].split("_")[0][2:] - - if overwrite_all or\ - (not os.path.isfile(os.path.join(text_dir,"PG%s_text.txt"%PG_number))) or \ - (not os.path.isfile(os.path.join(tokens_dir,"PG%s_tokens.txt"%PG_number))) or \ - (not os.path.isfile(os.path.join(counts_dir,"PG%s_counts.txt"%PG_number))): + + PG_number = get_PG_number(path_to_raw_file) + text_path = text_dir / ("PG%s_text.txt" % PG_number) + tokens_path = tokens_dir / ("PG%s_tokens.txt" % PG_number) + counts_path = counts_dir / ("PG%s_counts.txt" % PG_number) + + if overwrite_all or not \ + all(f.is_file() for f in [text_path, tokens_path, counts_path]): # read raw file - with io.open(path_to_raw_file, encoding="UTF-8") as f: - text = f.read() + text = path_to_raw_file.read_text(encoding="UTF-8") # clean it up clean = cleanup_f(text) # write text file - target_file = os.path.join(text_dir,"PG%s_text.txt"%PG_number) - with io.open(target_file,"w", encoding="UTF-8") as f: - f.write(clean) + text_path.write_text(clean, encoding="UTF-8") # compute tokens tokens = tokenize_f(clean, language=language) - + # write tokens file - target_file = os.path.join(tokens_dir,"PG%s_tokens.txt"%PG_number) - with io.open(target_file,"w", encoding="UTF-8") as f: - f.write("\n".join(tokens)+"\n") + tokens_path.write_text("\n".join(tokens)+"\n", encoding="UTF-8") # compute counts counts = Counter(tokens) - + # write counts file - target_file = os.path.join(counts_dir,"PG%s_counts.txt"%PG_number) - with io.open(target_file,"w", encoding="UTF-8") as f: - f.write("\n".join([w+"\t"+str(c) for w,c in counts.most_common()])+"\n") + counts_text = "\n".join([w+"\t"+str(c) for w,c in counts.most_common()])+"\n" + counts_path.write_text(counts_text, encoding="UTF-8") # write log info if log_file is not None - if log_file != "": - raw_nl = text.count("\n") - clean_nl = clean.count("\n") - L = len(tokens) - V = len(counts) - with io.open(log_file, "a") as f: - f.write("PG"+str(PG_number)+"\t"+language+"\t"+str(raw_nl)+"\t"+str(clean_nl)+"\t"+str(L)+"\t"+str(V)+"\n") - + if log_file is None: + return + + log_data = [ + "PG"+PG_number, + language, + text.count("\n"), # raw_nl + clean.count("\n"), # clean_nl + len(tokens), # L (number of tokens) + len(counts), # V (vocabulary size) + ] + with log_file.open("a") as f: + f.write('\t'.join(map(str, log_data))+"\n") + diff --git a/src/tokenizer.py b/src/tokenizer.py index b25f21e..05acefa 100644 --- a/src/tokenizer.py +++ b/src/tokenizer.py @@ -13,10 +13,10 @@ def tokenize_text(text, language="english"): '''Tokenize a string into a list of tokens. - Use NLTK's Treebankwordtokenizer. + Use NLTK's TreebankWordTokenizer.
Note that we first split into sentences using NLTK's sent_tokenize. We additionally call a filtering function to remove un-wanted tokens. - + IN: - text, str OUT: @@ -24,10 +24,10 @@ ''' ## list of tokens list_tokens = [] - + ## split text into sentences sentences=sent_tokenize(text, language=language) - + ## define the tokenizer tokenizer = TreebankWordTokenizer() ## loop over all sentences diff --git a/src/utils.py b/src/utils.py index ceea2ea..360a756 100644 --- a/src/utils.py +++ b/src/utils.py @@ -1,12 +1,12 @@ # -*- coding: utf-8 -*- -import os -import shutil -import subprocess -import glob + +import re + +NUMBER_RE = re.compile(r'\d+') def get_langs_dict(): """ - A dictionary mapping languages codes to full languages names + A dictionary mapping language codes to full language names """ langs_dict = { "cs": "czech", @@ -28,33 +28,22 @@ } return langs_dict -def get_PG_number(string): +def get_PG_number(file): """ Simply gets the PG number from different possible text files. Patterns are: 12345-0.txt or pg12345.txt.utf8 """ - # 12345-0.txt - if string.find("-0.txt")>-1: - PG_number = string.replace("-0.txt","") - - # pg12345.txt.utf8 - elif string.find(".txt.utf8")>-1: - PG_number = string.replace(".txt.utf8","").replace("pg","") + # .stem strips only the final suffix ('.utf8' or '.txt'); any remaining '.txt' has no digits, so the regex still finds the PG number + PG_number = NUMBER_RE.search(file.stem) + assert PG_number is not None, "no PG number found in %s" % file + return PG_number.group() - if not PG_number.isnumeric(): - print(string) - print(PG_number,"\n") - assert PG_number.isnumeric() - return PG_number - -def list_duplicates_in_mirror( - mirror_dir = None, - ): +def list_duplicates_in_mirror(mirror_dir): """ Look for duplicates in 'mirror_dir', and list them. Typical case is, there's two files corresponding to the - same PG identificator: + same PG identifier: 1) mirror/1/2/3/4/12345/12345-0.txt 2) mirror/cache/epub/12345/pg12345.txt.utf-8 @@ -62,22 +51,20 @@ We populate 1) and list 2) as a duplicate """ dups_list = [] - for dirName, subdirList, fileList in os.walk(mirror_dir): - for matchpath in glob.iglob(os.path.join(dirName,"*-0.txt")): - fname = matchpath.split("/")[-1] - # fname must have exactly one "." and one "-" - if (len(fname.split("."))==2 and len(fname.split("-"))==2): - PGnumber = get_PG_number(fname) - possible_duplicate = os.path.join(mirror_dir,"cache","epub",PGnumber,"pg"+PGnumber+".txt.utf8") - if os.path.isfile(possible_duplicate): - dups_list.append(possible_duplicate) + for file in mirror_dir.rglob("*-0.txt"): + # file.name must have exactly one "." and one "-" + if (file.name.count(".") == 1 and file.name.count("-") == 1): + PGnumber = get_PG_number(file) + possible_duplicate = mirror_dir / "cache" / "epub" / PGnumber / ("pg"+PGnumber+".txt.utf8") + if possible_duplicate.is_file(): + dups_list.append(possible_duplicate) return dups_list -def populate_raw_from_mirror(mirror_dir=None, - raw_dir=None, +def populate_raw_from_mirror(mirror_dir, + raw_dir, + dups_list, overwrite=False, - dups_list=None, quiet=False): """ Populate the raw/ directory using the .mirror/ directory. @@ -90,34 +77,27 @@ Parameters ---------- + mirror_dir : pathlib.Path + raw_dir : pathlib.Path overwrite : bool Whether to overwrite files in raw. - dups_list : list of strings + dups_list : list of pathlib.Path A list of duplicates produced by list_duplicates_in_mirror. Files in this list are not copied into raw.
""" - for dirName, subdirList, fileList in os.walk(mirror_dir): - # patterns to match are 12345-0.txt or pg12345.txt.utf8 - for matchpath in glob.iglob(os.path.join(dirName, "[p123456789][g0123456789][0-9]*")): - fname = matchpath.split("/")[-1] - # check that file is not in dups_list - if matchpath not in dups_list: - # avoid files with more "." or "-" than expected - if (len(fname.split("."))==2 and len(fname.split("-"))==2 and fname[-6::]=="-0.txt")\ - or (len(fname.split("."))==3 and len(fname.split("-"))==1 and fname[-9::]==".txt.utf8"): - # get PG number - PGnumber = get_PG_number(fname) - - source = os.path.join(dirName, fname) - target = os.path.join(raw_dir, "PG"+PGnumber+"_raw.txt") - - if (not os.path.isfile(target)) or overwrite: - subprocess.call(["ln", "-f", source, target]) - - # if file was not in dupes list and we are not quiet - elif not quiet: - print("# WARNING: file %s skipped due to duplication" % fname) - - - + # patterns to match are 12345-0.txt or pg12345.txt.utf8 + for file in mirror_dir.rglob("[p1-9][g0-9][0-9]*"): + # check that file is not in dups_list + if file not in dups_list: + # avoid files with more "." or "-" than expected, and keep only UTF-8 text files + if (file.name.count(".")==1 and file.name.count("-")==1 and file.name.endswith("-0.txt")) \ + or (file.name.count(".")==2 and file.name.count("-")==0 and file.name.endswith(".txt.utf8")): + PGnumber = get_PG_number(file) + target = raw_dir / ("PG" + PGnumber + "_raw.txt") + if overwrite or not target.is_file(): + target.hardlink_to(file) + + # the file is a known duplicate: warn unless quiet + elif not quiet: + print("# WARNING: file %s skipped due to duplication" % file.name)
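For anyone trying out the pathlib-based refactor above, here is a minimal usage sketch (not part of the diff itself) of how the refactored `process_book` is expected to be called once the patch is applied. Directory names follow the repository defaults, and the loop is simply a trimmed-down version of what `process_data.py` already does:

```python
from pathlib import Path

from src.pipeline import process_book

# Repository defaults; the directories must already exist
# (process_data.py refuses to run if they do not).
raw_dir = Path("data/raw")
text_dir = Path("data/text")
tokens_dir = Path("data/tokens")
counts_dir = Path("data/counts")

for raw_file in raw_dir.glob("PG*_raw.txt"):
    try:
        process_book(
            path_to_raw_file=raw_file,    # now a pathlib.Path, not a str
            text_dir=text_dir,
            tokens_dir=tokens_dir,
            counts_dir=counts_dir,
            language="english",
            log_file=Path(".log"),        # pass None to skip the log entry
        )
    except UnicodeDecodeError:
        # a few files tagged as UTF-8 are not valid UTF-8; skip them,
        # just as process_data.py does
        print("# WARNING: cannot process '%s'" % raw_file.name)
```

Note that the hard-linking step in `populate_raw_from_mirror` now relies on `Path.hardlink_to`, which is only available in Python 3.10 and newer, so this patch effectively tightens the README's "Python 3.x" requirement.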