Adding extra error handling to scripts, minor fixes
jigsaw2212 committed May 26, 2021
1 parent 6220df7 commit 0bdfa7c
Showing 13 changed files with 199 additions and 90 deletions.
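
The recurring change across these scripts is a fetch-with-error-logging pattern: each `urllib.request.urlopen` call is wrapped in a try/except, failures are printed and written as tab-separated lines to an errors file (so the failed links can be re-scraped later), and the loop continues with the next item. Below is a minimal sketch of that pattern; the function and variable names (`fetch_sections`, `links`, `errors_path`) are illustrative, not taken from the repository.

```python
import urllib.request
from bs4 import BeautifulSoup

def fetch_sections(links, errors_path="section_errors.txt"):
    """Fetch and parse each (url, name) pair, logging failures instead of crashing."""
    pages = []
    with open(errors_path, "w") as f_errors:
        for section_url, name in links:
            print(name, section_url)
            try:
                soup = BeautifulSoup(urllib.request.urlopen(section_url), "html.parser")
            except Exception as e:
                # Record the failing link (tab-separated) so it can be re-scraped later,
                # then move on to the next item instead of aborting the whole run.
                print("Could not fetch:", section_url, e)
                f_errors.write(section_url + "\t" + name + "\n")
                continue
            pages.append((name, soup))
    return pages
```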
40 changes: 26 additions & 14 deletions scripts/data_collection/bookwolf/get_summaries.py
@@ -7,6 +7,10 @@
*/
"""


"""
Note: Summaries collected through bookwolf require significant manual cleanup owing to the way the HTML is written
"""
from builtins import zip, str, range

import pdb, os, csv, re, io, json
@@ -37,6 +41,9 @@
def get_overview_paragraphs(overview_links, specific_summary_dir):

for index, (overview, name) in enumerate(overview_links):

print (name, overview)

try:
soup = BeautifulSoup(urllib.request.urlopen(overview), "html.parser")
overview_data = soup.find("td", {"class": "TextObject"})
@@ -53,8 +60,8 @@ def get_overview_paragraphs(overview_links, specific_summary_dir):
overview_paragraphs = [unidecode(paragraph.text.strip()) for paragraph in overview_data.findAll("p", recursive=False)[1:]]

except Exception as e:
print("No book summary for: ", e)
f_errors.write(str(index) + "\t" + overview + "\t" + name + "\t" + specific_summary_dir + "\n")
print("No book summary for: ", overview, e)
f_errors.write(overview + "\t" + name + "\t" + specific_summary_dir + "\n")
continue

overview_text = "\n".join(overview_paragraphs)
@@ -73,7 +80,7 @@ def get_section_paragraphs(section_links, specific_summary_dir):

try:

print ("Section: ", section)
print (name, section)
soup = BeautifulSoup(urllib.request.urlopen(section), "html.parser")
section_data = soup.find("td", {"class": "TextObject"})
except Exception as e:
@@ -85,8 +92,8 @@ def get_section_paragraphs(section_links, specific_summary_dir):
soup = BeautifulSoup(urllib.request.urlopen(section), "html.parser")
section_data = soup.find("td", {"class": "TextObject"})
except Exception as e:
print ("Chapter level summary not found: ", e)
f_errors.write(str(index) + "\t" + section + "\t" + name + "\t" + specific_summary_dir + "\n")
print ("Chapter level summary not found for: ", section, e)
f_errors.write(section + "\t" + name + "\t" + specific_summary_dir + "\n")
continue


@@ -113,8 +120,6 @@ def get_section_paragraphs(section_links, specific_summary_dir):
else:
section_analysis.append(unidecode(paragraph.text.strip()))

# print ("line: ", paragraph.text.strip())

section_text = "\n".join(section_paragraphs)
section_interpretation = "\n".join(section_analysis)

@@ -129,7 +134,6 @@ def get_section_paragraphs(section_links, specific_summary_dir):


# For each summary info
error_files, error_titles = [], []
for k, (title, page_url) in enumerate(summary_infos):

print('\n>>> {}. {} <<<'.format(k, title))
@@ -140,9 +144,17 @@
os.makedirs(specific_summary_dir)
else:
print("Found existing directory.")
continue

# Parse page
soup = BeautifulSoup(urllib.request.urlopen(page_url), "html.parser")
print ("page_url: ", page_url)
try:
soup = BeautifulSoup(urllib.request.urlopen(page_url), "html.parser")
except Exception as e:
print (page_url, e)
f_errors.write(str(k) + "\t" + title + "\t" + page_url + "\t" + specific_summary_dir + "\n")
continue


# Parse general summary
navigation_links = soup.find("table", {"id": "Table56"})
@@ -151,16 +163,16 @@
overview_links = [(urllib.parse.urljoin(MAIN_SITE, link.get("href")), link.text) for link in navigation_links.findAll("a")\
if ("part" not in link.text.lower() and ("context" in link.get("href") or "summary" in link.get("href") or "synopsis" in link.get("href") ))]

#Filter out some of the links that are obviously not chapter summary links
#Since this source only has a handful of books, it was easy to hard code which links to fetch/not fetch
# Filter out some of the links that are obviously not chapter summary links
# Since this source only has a handful of books, it was easy to hard code which links to fetch summaries from
section_links = [(urllib.parse.urljoin(MAIN_SITE, link.get("href")), link.text) for link in navigation_links.findAll("a") \
if (("interpretation" not in link.text.lower() and "comment" not in link.text.lower() and "author" not in link.text.lower()\
if ("interpretation" not in link.text.lower() and "comment" not in link.text.lower() and "author" not in link.text.lower()\
and "character" not in link.text.lower() and "questions" not in link.text.lower() and "life at the time" not in link.text.lower()\
and "theme" not in link.text.lower() and "foreword" not in link.text.lower() and "background" not in link.text.lower()\
and "symbolism" not in link.text.lower() and "introduction" not in link.text.lower() and "characterization" not in link.text.lower()\
and "setting" not in link.text.lower() and "family life" not in link.text.lower() and "comment" not in link.text.lower() )
and "setting" not in link.text.lower() and "family life" not in link.text.lower() and "comment" not in link.text.lower() ) ]

print ("overview_links: ", overview_links)
print ("overview_link: ", overview_links)
print ("section_links: ", section_links)

if len(overview_links) != 0:
8 changes: 4 additions & 4 deletions scripts/data_collection/bookwolf/get_works.py
@@ -17,7 +17,7 @@
from nltk.tokenize import word_tokenize, sent_tokenize

# PARAMS
MAIN_SITE = 'https://web.archive.org/web/20210120012015/http://www.bookwolf.com/'
MAIN_SITE = 'https://web.archive.org/'
SEED_URL = 'https://web.archive.org/web/20210120012015/http://www.bookwolf.com/Welcome_to_Bookwolf1/welcome_to_bookwolf1.html'

def scrape_index_pages(seed_page):
@@ -33,12 +33,12 @@ def scrape_index_pages(seed_page):
# # Go over each section
for index, item in enumerate(books_table):
# Parse section to get bullet point text
print (item)
item_title = item.find("a").text
item_url = item.find("a").get("href")[3:]
item_url = item.find("a").get("href")

print (index)
print ("item_title: ", item_title)
print ("item_url: ", item_url)
print ("item_url: ", item_url, "\n")

scraped_links.append({
"title": item_title.strip().replace(",",""),
20 changes: 14 additions & 6 deletions scripts/data_collection/gradesaver/get_summaries.py
@@ -24,13 +24,14 @@
# Summary list info
summary_list_file = "literature_links.tsv"

errors_file = open("section_errors.txt","w")

# Get contents of the summary file
with open(summary_list_file, 'r') as tsvfile:
reader = csv.reader(tsvfile, delimiter='\t')
summary_infos = list(reader)

# For each summary info
error_files, error_titles = [], []
for k, (title, page_url) in enumerate(summary_infos):
print('\n>>> {}. {} <<<'.format(k, title))

@@ -40,21 +41,28 @@
os.makedirs(specific_summary_dir)
else:
print("Found existing directory, skipping.")
continue

# Parse page
soup = BeautifulSoup(urllib.request.urlopen(page_url), "html.parser")
try:
soup = BeautifulSoup(urllib.request.urlopen(page_url), "html.parser")
except Exception as e:
print (page_url, e)
errors_file.write(page_url + "\t" + str(e))
continue


# # Parse general summary
navigation_links = soup.find("ul", {"class": "navSection__list js--collapsible"})
overview_links = [(urllib.parse.urljoin(MAIN_SITE, link.find("a").get("href")), link.text.strip()) for link in navigation_links.findAll("li") if link.text.strip() == title + " Summary"]
print (overview_links)
# print (overview_links)

if len(overview_links) == 0:
print ("No overview summaries found")
else:
for index, (overview, name) in enumerate(overview_links):
try:
print (overview)
print (name, overview)
soup = BeautifulSoup(urllib.request.urlopen(overview), "html.parser")
overview_data = soup.find("article", {"class": "section__article"})

@@ -101,11 +109,11 @@
print ("No section summaries found")
else:
section_links = [(urllib.parse.urljoin(MAIN_SITE,link.find("a").get("href")), link.text.strip()) for link in section_links[0]]
print (section_links)
# print (section_links)

for index, (section, name) in enumerate(section_links):
try:
print (section)
print (name, section)
soup = BeautifulSoup(urllib.request.urlopen(section), "html.parser")
section_data = soup.find("article", {"class": "section__article"})

30 changes: 21 additions & 9 deletions scripts/data_collection/novelguide/get_summaries.py
@@ -25,7 +25,7 @@
ARGS = PARSER.parse_args()

# PARAMS
SUMMARY_DIR = '../../raw_summaries/gradesaver/summaries'
SUMMARY_DIR = '../../raw_summaries/novelguide/summaries'
MAIN_SITE = 'https://web.archive.org/web/20210225014436/https://www.novelguide.com/'

def hasNumbers(inputString):
@@ -36,6 +36,9 @@ def get_section_level_data(section_links):
http_errors = []

for index, (section, name), specific_summary_dir in section_links:

print (name, section)

try:
soup = BeautifulSoup(urllib.request.urlopen(section), "html.parser")
section_data = soup.find("div", {"class": "content clear-block"})
@@ -127,9 +130,11 @@ def get_section_level_data(section_links):
summary_infos = list(reader)


#Create the errors file every time when starting to scrape the summaries
# Create the errors file every time when starting to scrape the summaries
# This file can be used to try and rescrape the links that resulted in an error
f_errors = open("section_errors.txt","w")
print ("Errors file created")

f_book_errors = open("book_errors.txt","w")

# For each summary info
for k, (title, page_url) in enumerate(summary_infos):
@@ -144,6 +149,7 @@ def get_section_level_data(section_links):
os.makedirs(specific_summary_dir)
else:
print("Found existing directory.")
continue

# Parse page
try:
@@ -154,15 +160,21 @@ def get_section_level_data(section_links):
try:
soup = BeautifulSoup(urllib.request.urlopen(page_url), "html.parser")
except urllib.error.HTTPError:
#Page not accessible at the moment
with open("book_not_found.txt","a") as f:
f.write(k, title, page_url)
f.write("\n")
print ("Page not accessible")
f_book_errors.write(str(k) + "\t" + title + "\t" + page_url)
f_book_errors.write("\n")
continue

# # Parse general summary
navigation_links = soup.find("div", {"id": "block-booknavigation-3"})
# print (navigation_links)

# Some links are just empty webpages
if navigation_links == None:
print ("Page not accessible")
f_book_errors.write(str(k) + "\t" + title + "\t" + page_url)
f_book_errors.write("\n")
continue

section_links = [(urllib.parse.urljoin(MAIN_SITE, link.find("a").get("href")), link.text.strip()) for link in navigation_links.findAll("li")\
if 'chapter' in link.text.strip().lower() or 'summary' in link.text.strip().lower() or 'section' in link.text.strip().lower() or 'stave' in link.text.strip().lower() \
or 'chp' in link.text.strip().lower() or 'scene' in link.text.strip().lower() or 'act ' in link.text.strip().lower() \
@@ -176,7 +188,7 @@ def get_section_level_data(section_links):
for index, (section, name) in enumerate(section_links):
section_links_with_index.append((index,(section, name), specific_summary_dir))

print (section_links_with_index, "\n")
# print (section_links_with_index, "\n")


if len(section_links_with_index) == 0:
8 changes: 3 additions & 5 deletions scripts/data_collection/novelguide/get_works.py
@@ -18,7 +18,7 @@
# PARAMS
MAIN_SITE = 'https://web.archive.org/web/20210225014436/https://www.novelguide.com/'

alphabet_list = string.ascii_lowercase
alphabet_list = string.ascii_lowercase + '1'

SEED_URL = 'https://web.archive.org/web/20210225014436/https://www.novelguide.com/title/'

@@ -28,7 +28,7 @@ def scrape_index_pages(seed_page):
scraped_links = []

for char in alphabet_list:
books_page = seed_page + '1'
books_page = seed_page + char

page_no = 0

@@ -60,13 +60,11 @@ def scrape_index_pages(seed_page):

print ("No books found with title: ", char)

break

return scraped_links

# generate literature links
scraped_data = scrape_index_pages(SEED_URL)

with open("literature_links.tsv", "a") as fd:
with open("literature_links.tsv", "w") as fd:
for data in scraped_data:
fd.write("%s\t%s\n" % (data["title"], data["url"]))