Adding extra error handling to scripts, minor fixes
jigsaw2212 committed May 26, 2021
1 parent 6220df7 commit 0bdfa7c
Showing 13 changed files with 199 additions and 90 deletions.
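
The recurring change across these scripts is a fetch-with-error-logging pattern: each `urllib.request.urlopen` call is wrapped in a try/except, failures are printed and written as tab-separated lines to an errors file (so the failed links can be re-scraped later), and the loop continues with the next item. Below is a minimal sketch of that pattern; the function and variable names (`fetch_sections`, `links`, `errors_path`) are illustrative, not taken from the repository.

```python
import urllib.request
from bs4 import BeautifulSoup

def fetch_sections(links, errors_path="section_errors.txt"):
    """Fetch and parse each (url, name) pair, logging failures instead of crashing."""
    pages = []
    with open(errors_path, "w") as f_errors:
        for section_url, name in links:
            print(name, section_url)
            try:
                soup = BeautifulSoup(urllib.request.urlopen(section_url), "html.parser")
            except Exception as e:
                # Record the failing link (tab-separated) so it can be re-scraped later,
                # then move on to the next item instead of aborting the whole run.
                print("Could not fetch:", section_url, e)
                f_errors.write(section_url + "\t" + name + "\n")
                continue
            pages.append((name, soup))
    return pages
```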
40 changes: 26 additions & 14 deletions scripts/data_collection/bookwolf/get_summaries.py
@@ -7,6 +7,10 @@
*/
"""


"""
Note: Summaries collected through bookwolf require significant manual cleanup owing to the way the HTML is written
"""
from builtins import zip, str, range

import pdb, os, csv, re, io, json
@@ -37,6 +41,9 @@
def get_overview_paragraphs(overview_links, specific_summary_dir):

for index, (overview, name) in enumerate(overview_links):

print (name, overview)

try:
soup = BeautifulSoup(urllib.request.urlopen(overview), "html.parser")
overview_data = soup.find("td", {"class": "TextObject"})
@@ -53,8 +60,8 @@ def get_overview_paragraphs(overview_links, specific_summary_dir):
overview_paragraphs = [unidecode(paragraph.text.strip()) for paragraph in overview_data.findAll("p", recursive=False)[1:]]

except Exception as e:
print("No book summary for: ", e)
f_errors.write(str(index) + "\t" + overview + "\t" + name + "\t" + specific_summary_dir + "\n")
print("No book summary for: ", overview, e)
f_errors.write(overview + "\t" + name + "\t" + specific_summary_dir + "\n")
continue

overview_text = "\n".join(overview_paragraphs)
@@ -73,7 +80,7 @@ def get_section_paragraphs(section_links, specific_summary_dir):

try:

print ("Section: ", section)
print (name, section)
soup = BeautifulSoup(urllib.request.urlopen(section), "html.parser")
section_data = soup.find("td", {"class": "TextObject"})
except Exception as e:
@@ -85,8 +92,8 @@ def get_section_paragraphs(section_links, specific_summary_dir):
soup = BeautifulSoup(urllib.request.urlopen(section), "html.parser")
section_data = soup.find("td", {"class": "TextObject"})
except Exception as e:
print ("Chapter level summary not found: ", e)
f_errors.write(str(index) + "\t" + section + "\t" + name + "\t" + specific_summary_dir + "\n")
print ("Chapter level summary not found for: ", section, e)
f_errors.write(section + "\t" + name + "\t" + specific_summary_dir + "\n")
continue


@@ -113,8 +120,6 @@ def get_section_paragraphs(section_links, specific_summary_dir):
else:
section_analysis.append(unidecode(paragraph.text.strip()))

# print ("line: ", paragraph.text.strip())

section_text = "\n".join(section_paragraphs)
section_interpretation = "\n".join(section_analysis)

@@ -129,7 +134,6 @@ def get_section_paragraphs(section_links, specific_summary_dir):


# For each summary info
error_files, error_titles = [], []
for k, (title, page_url) in enumerate(summary_infos):

print('\n>>> {}. {} <<<'.format(k, title))
@@ -140,9 +144,17 @@
os.makedirs(specific_summary_dir)
else:
print("Found existing directory.")
continue

# Parse page
soup = BeautifulSoup(urllib.request.urlopen(page_url), "html.parser")
print ("page_url: ", page_url)
try:
soup = BeautifulSoup(urllib.request.urlopen(page_url), "html.parser")
except Exception as e:
print (page_url, e)
f_errors.write(str(k) + "\t" + title + "\t" + page_url + "\t" + specific_summary_dir + "\n")
continue


# Parse general summary
navigation_links = soup.find("table", {"id": "Table56"})
@@ -151,16 +163,16 @@
overview_links = [(urllib.parse.urljoin(MAIN_SITE, link.get("href")), link.text) for link in navigation_links.findAll("a")\
if ("part" not in link.text.lower() and ("context" in link.get("href") or "summary" in link.get("href") or "synopsis" in link.get("href") ))]

#Filter out some of the links that are obviously not chapter summary links
#Since this source only has a handful of books, it was easy to hard code which links to fetch/not fetch
# Filter out some of the links that are obviously not chapter summary links
# Since this source only has a handful of books, it was easy to hard code which links to fetch summaries from
section_links = [(urllib.parse.urljoin(MAIN_SITE, link.get("href")), link.text) for link in navigation_links.findAll("a") \
if (("interpretation" not in link.text.lower() and "comment" not in link.text.lower() and "author" not in link.text.lower()\
if ("interpretation" not in link.text.lower() and "comment" not in link.text.lower() and "author" not in link.text.lower()\
and "character" not in link.text.lower() and "questions" not in link.text.lower() and "life at the time" not in link.text.lower()\
and "theme" not in link.text.lower() and "foreword" not in link.text.lower() and "background" not in link.text.lower()\
and "symbolism" not in link.text.lower() and "introduction" not in link.text.lower() and "characterization" not in link.text.lower()\
and "setting" not in link.text.lower() and "family life" not in link.text.lower() and "comment" not in link.text.lower() )
and "setting" not in link.text.lower() and "family life" not in link.text.lower() and "comment" not in link.text.lower() ) ]

print ("overview_links: ", overview_links)
print ("overview_link: ", overview_links)
print ("section_links: ", section_links)

if len(overview_links) != 0:
8 changes: 4 additions & 4 deletions scripts/data_collection/bookwolf/get_works.py
@@ -17,7 +17,7 @@
from nltk.tokenize import word_tokenize, sent_tokenize

# PARAMS
MAIN_SITE = 'https://web.archive.org/web/20210120012015/http://www.bookwolf.com/'
MAIN_SITE = 'https://web.archive.org/'
SEED_URL = 'https://web.archive.org/web/20210120012015/http://www.bookwolf.com/Welcome_to_Bookwolf1/welcome_to_bookwolf1.html'

def scrape_index_pages(seed_page):
@@ -33,12 +33,12 @@ def scrape_index_pages(seed_page):
# # Go over each section
for index, item in enumerate(books_table):
# Parse section to get bullet point text
print (item)
item_title = item.find("a").text
item_url = item.find("a").get("href")[3:]
item_url = item.find("a").get("href")

print (index)
print ("item_title: ", item_title)
print ("item_url: ", item_url)
print ("item_url: ", item_url, "\n")

scraped_links.append({
"title": item_title.strip().replace(",",""),
20 changes: 14 additions & 6 deletions scripts/data_collection/gradesaver/get_summaries.py
@@ -24,13 +24,14 @@
# Summary list info
summary_list_file = "literature_links.tsv"

errors_file = open("section_errors.txt","w")

# Get contents of the summary file
with open(summary_list_file, 'r') as tsvfile:
reader = csv.reader(tsvfile, delimiter='\t')
summary_infos = list(reader)

# For each summary info
error_files, error_titles = [], []
for k, (title, page_url) in enumerate(summary_infos):
print('\n>>> {}. {} <<<'.format(k, title))

@@ -40,21 +41,28 @@
os.makedirs(specific_summary_dir)
else:
print("Found existing directory, skipping.")
continue

# Parse page
soup = BeautifulSoup(urllib.request.urlopen(page_url), "html.parser")
try:
soup = BeautifulSoup(urllib.request.urlopen(page_url), "html.parser")
except Exception as e:
print (page_url, e)
errors_file.write(page_url + "\t" + str(e))
continue


# # Parse general summary
navigation_links = soup.find("ul", {"class": "navSection__list js--collapsible"})
overview_links = [(urllib.parse.urljoin(MAIN_SITE, link.find("a").get("href")), link.text.strip()) for link in navigation_links.findAll("li") if link.text.strip() == title + " Summary"]
print (overview_links)
# print (overview_links)

if len(overview_links) == 0:
print ("No overview summaries found")
else:
for index, (overview, name) in enumerate(overview_links):
try:
print (overview)
print (name, overview)
soup = BeautifulSoup(urllib.request.urlopen(overview), "html.parser")
overview_data = soup.find("article", {"class": "section__article"})

@@ -101,11 +109,11 @@
print ("No section summaries found")
else:
section_links = [(urllib.parse.urljoin(MAIN_SITE,link.find("a").get("href")), link.text.strip()) for link in section_links[0]]
print (section_links)
# print (section_links)

for index, (section, name) in enumerate(section_links):
try:
print (section)
print (name, section)
soup = BeautifulSoup(urllib.request.urlopen(section), "html.parser")
section_data = soup.find("article", {"class": "section__article"})

30 changes: 21 additions & 9 deletions scripts/data_collection/novelguide/get_summaries.py
@@ -25,7 +25,7 @@
ARGS = PARSER.parse_args()

# PARAMS
SUMMARY_DIR = '../../raw_summaries/gradesaver/summaries'
SUMMARY_DIR = '../../raw_summaries/novelguide/summaries'
MAIN_SITE = 'https://web.archive.org/web/20210225014436/https://www.novelguide.com/'

def hasNumbers(inputString):
@@ -36,6 +36,9 @@ def get_section_level_data(section_links):
http_errors = []

for index, (section, name), specific_summary_dir in section_links:

print (name, section)

try:
soup = BeautifulSoup(urllib.request.urlopen(section), "html.parser")
section_data = soup.find("div", {"class": "content clear-block"})
@@ -127,9 +130,11 @@ def get_section_level_data(section_links):
summary_infos = list(reader)


#Create the errors file every time when starting to scrape the summaries
# Create the errors file every time when starting to scrape the summaries
# This file can be used to try and rescrape the links that resulted in an error
f_errors = open("section_errors.txt","w")
print ("Errors file created")

f_book_errors = open("book_errors.txt","w")

# For each summary info
for k, (title, page_url) in enumerate(summary_infos):
@@ -144,6 +149,7 @@ def get_section_level_data(section_links):
os.makedirs(specific_summary_dir)
else:
print("Found existing directory.")
continue

# Parse page
try:
@@ -154,15 +160,21 @@ def get_section_level_data(section_links):
try:
soup = BeautifulSoup(urllib.request.urlopen(page_url), "html.parser")
except urllib.error.HTTPError:
#Page not accessible at the moment
with open("book_not_found.txt","a") as f:
f.write(k, title, page_url)
f.write("\n")
print ("Page not accessible")
f_book_errors.write(str(k) + "\t" + title + "\t" + page_url)
f_book_errors.write("\n")
continue

# # Parse general summary
navigation_links = soup.find("div", {"id": "block-booknavigation-3"})
# print (navigation_links)

# Some links are just empty webpages
if navigation_links == None:
print ("Page not accessible")
f_book_errors.write(str(k) + "\t" + title + "\t" + page_url)
f_book_errors.write("\n")
continue

section_links = [(urllib.parse.urljoin(MAIN_SITE, link.find("a").get("href")), link.text.strip()) for link in navigation_links.findAll("li")\
if 'chapter' in link.text.strip().lower() or 'summary' in link.text.strip().lower() or 'section' in link.text.strip().lower() or 'stave' in link.text.strip().lower() \
or 'chp' in link.text.strip().lower() or 'scene' in link.text.strip().lower() or 'act ' in link.text.strip().lower() \
@@ -176,7 +188,7 @@ def get_section_level_data(section_links):
for index, (section, name) in enumerate(section_links):
section_links_with_index.append((index,(section, name), specific_summary_dir))

print (section_links_with_index, "\n")
# print (section_links_with_index, "\n")


if len(section_links_with_index) == 0:
8 changes: 3 additions & 5 deletions scripts/data_collection/novelguide/get_works.py
@@ -18,7 +18,7 @@
# PARAMS
MAIN_SITE = 'https://web.archive.org/web/20210225014436/https://www.novelguide.com/'

alphabet_list = string.ascii_lowercase
alphabet_list = string.ascii_lowercase + '1'

SEED_URL = 'https://web.archive.org/web/20210225014436/https://www.novelguide.com/title/'

@@ -28,7 +28,7 @@ def scrape_index_pages(seed_page):
scraped_links = []

for char in alphabet_list:
books_page = seed_page + '1'
books_page = seed_page + char

page_no = 0

@@ -60,13 +60,11 @@ def scrape_index_pages(seed_page):

print ("No books found with title: ", char)

break

return scraped_links

# generate literature links
scraped_data = scrape_index_pages(SEED_URL)

with open("literature_links.tsv", "a") as fd:
with open("literature_links.tsv", "w") as fd:
for data in scraped_data:
fd.write("%s\t%s\n" % (data["title"], data["url"]))