Fixes to paths, Readme flags updated, extra checks added
jigsaw2212 committed Jun 4, 2021
1 parent af7f353 commit b5259ea
Showing 8 changed files with 24 additions and 15 deletions.
16 changes: 9 additions & 7 deletions README.md
@@ -75,8 +75,9 @@ Note: At the time of collecting the data, all links in literature_links.tsv were
For each data source, run `get_works.py` to first fetch the links for each book, and then run `get_summaries.py` to get the summaries from the collected links.

```
-python scripts/data_collection/cliffnotes/get_works.py
-python scripts/data_collection/cliffnotes/get_summaries.py
+cd scripts/data_collection/cliffnotes/
+python get_works.py
+python get_summaries.py
```

#### 3. Data Cleaning
@@ -85,30 +86,31 @@ Data Cleaning is performed through the following steps:

First, a script performs some basic cleaning operations, like removing parentheses, links, etc. from the summary text:
```
-python scripts/data_cleaning_scripts/basic_clean.py
+cd scripts/data_cleaning_scripts/
+python basic_clean.py
```

We use the intermediate alignments in summary_chapter_matched_all_sources.jsonl to identify which summaries are separable, and separate them into new summaries (e.g., a Chapters 1-3 summary is split into three files: a Chapter 1 summary, a Chapter 2 summary, and a Chapter 3 summary):
```
-python scripts/data_cleaning_scripts/split_aggregate_chaps_all_sources.py
+python split_aggregate_chaps_all_sources.py
```

Lastly, our final cleaning script uses various regexes to separate out analysis/commentary text and remove prefixes, suffixes, etc.:
```
-python scripts/data_cleaning_scripts/clean_summaries.py
+python clean_summaries.py
```

#### Data Alignments
Generating paragraph alignments from the chapter-level-summary-alignments is performed individually for the train/test/val splits:

Gather the data from the summaries and book chapters into a single jsonl
```
-python paragraph-level-summary-alignments/gather_data.py
+python paragraph-level-summary-alignments/gather_data.py --split_paragraphs
```

Generate alignments of the paragraphs with sentences from the summary using the bi-encoder **paraphrase-distilroberta-base-v1**
```
-python paragraph-level-summary-alignments/align_data_bi_encoder_paraphrase.py
+python paragraph-level-summary-alignments/align_data_bi_encoder_paraphrase.py --stable_alignment
```

Aggregate the generated alignments for cases where multiple sentences from chapter-summaries are matched to the same paragraph from the book
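The alignment step above pairs book paragraphs with sentences from the chapter summary. A typical bi-encoder alignment embeds both sides and pairs them by cosine similarity; a minimal sketch under that assumption, using the sentence-transformers package with the model named in the README (toy text, not the repository's align_data_bi_encoder_paraphrase.py):

```
from sentence_transformers import SentenceTransformer, util

# paraphrase-distilroberta-base-v1 is the bi-encoder named in the README;
# the paragraphs and summary sentences below are toy placeholders.
model = SentenceTransformer("paraphrase-distilroberta-base-v1")

paragraphs = [
    "The old sailor walked down to the harbour at dawn.",
    "A storm rose in the night and tore the sails apart.",
]
summary_sentences = [
    "A storm destroys the ship's sails.",
    "The sailor visits the harbour early in the morning.",
]

para_emb = model.encode(paragraphs, convert_to_tensor=True)
summ_emb = model.encode(summary_sentences, convert_to_tensor=True)

# Cosine similarity between every paragraph and every summary sentence.
scores = util.cos_sim(para_emb, summ_emb)

for i in range(len(paragraphs)):
    best = int(scores[i].argmax())
    print("paragraph", i, "-> summary sentence", best, "score", round(float(scores[i][best]), 3))
```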
1 change: 0 additions & 1 deletion
@@ -221,7 +221,6 @@ def main(args):
parser.add_argument('--join_strings', action='store_true')
parser.add_argument('--split_paragraphs', action='store_true')
parser.add_argument('--matched_file', type=str, required=True)
-#Split paragraphs does what jloin strings does automatically. Why?
args = parser.parse_args()

if args.join_strings and args.split_paragraphs or (not args.join_strings and not args.split_paragraphs):
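The check at the end of this hunk rejects runs where both or neither of `--join_strings` and `--split_paragraphs` is set. argparse can state that constraint declaratively with a mutually exclusive group; a minimal, self-contained sketch (not the repository's script):

```
import argparse

# The same either/or constraint, expressed with argparse's built-in
# mutually exclusive group instead of a manual boolean check.
parser = argparse.ArgumentParser()
group = parser.add_mutually_exclusive_group(required=True)
group.add_argument('--join_strings', action='store_true')
group.add_argument('--split_paragraphs', action='store_true')
parser.add_argument('--matched_file', type=str, required=True)
args = parser.parse_args()

print(args.join_strings, args.split_paragraphs, args.matched_file)
```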
4 changes: 2 additions & 2 deletions scripts/data_cleaning/basic_clean.py
@@ -15,7 +15,7 @@
import re
from unidecode import unidecode

-sources = ['gradesaver', 'shmoop', 'cliffnotes', 'sparknotes', 'barronbooks', 'bookwolf', 'novelguide', 'thebestnotes']
+sources = ['gradesaver', 'shmoop', 'cliffnotes', 'sparknotes', 'pinkmonkey', 'bookwolf', 'novelguide', 'thebestnotes']

BASE_DIR = "../raw_summaries/"

@@ -104,4 +104,4 @@
book_dest_path = os.path.join(book_dest, section_out)

with open(book_dest_path, 'w') as outfile:
-json.dump(summary_json, outfile)
\ No newline at end of file
+json.dump(summary_json, outfile)
4 changes: 2 additions & 2 deletions scripts/data_cleaning/clean_summaries.py
@@ -16,7 +16,7 @@
from os.path import basename

# We clean one source at a time
-sources = ['gradesaver', 'shmoop', 'cliffnotes', 'sparknotes','barronbooks', 'bookwolf', 'novelguide', 'thebestnotes']
+sources = ['gradesaver', 'shmoop', 'cliffnotes', 'sparknotes','pinkmonkey', 'bookwolf', 'novelguide', 'thebestnotes']

for ix, source in tqdm(enumerate(sources)):

@@ -251,4 +251,4 @@ def remove_chapter_prefixes(line):
with open(section_path, "w") as fout:
json.dump(new_json_dict, fout)

-print ("book_count: ", book_count)
\ No newline at end of file
+print ("book_count: ", book_count)
6 changes: 5 additions & 1 deletion scripts/data_cleaning/split_aggregate_chaps_all_sources.py
@@ -392,7 +392,11 @@ def get_summary_files_chapter_count(x):
if (book_unique_id != prev_book_unique_id) and prev_book_unique_id != "":
summaries_counted = 0

-summary_path = x['summary_path']
+summary_path = os.path.join("../", x['summary_path'])
+
+if not os.path.exists(summary_path):
+    # Summary directory missing
+    continue

num_toc_lines, summary_dir_count = get_summary_files_chapter_count(x)

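The added check skips alignment entries whose summary directory is not on disk, after prepending "../" so the path resolves from where the script is run. A small sketch of resolving such paths against the script's own location instead of the current working directory (the helper name and example path are illustrative, not from the repository):

```
import os

# Resolve a summary_path stored relative to the script's parent directory,
# regardless of where the script is launched from, and report whether the
# summary directory actually exists on disk.
PARENT_DIR = os.path.normpath(os.path.join(os.path.dirname(os.path.abspath(__file__)), ".."))

def resolve_summary_path(relative_path):
    candidate = os.path.normpath(os.path.join(PARENT_DIR, relative_path))
    return candidate if os.path.exists(candidate) else None

if __name__ == "__main__":
    # Hypothetical entry; real values come from the matched alignment file.
    print(resolve_summary_path("raw_summaries/gradesaver/example_book"))
```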
1 change: 1 addition & 0 deletions scripts/data_collection/cliffnotes/get_summaries.py
@@ -108,6 +108,7 @@ def scrape_section_continuation(parent_soup, section_header):

print (section, e)
errors_file.write(section + "\t" + str(e) + "\n")
+continue

section_text = "<PARAGRAPH>".join(section_paragraphs).replace("Continued on next page...", "")

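The added `continue` makes the scraping loop skip a section once its error has been logged, instead of falling through to the `<PARAGRAPH>` join with incomplete data. A self-contained sketch of that pattern (toy sections and a hypothetical `fetch_paragraphs` helper, not the repository's scraper):

```
def fetch_paragraphs(section):
    # Hypothetical stand-in for the real scraping logic.
    if section == "bad-section":
        raise ValueError("page not found")
    return ["First paragraph of the section.", "Second paragraph of the section."]

with open("section_errors.tsv", "w") as errors_file:
    for section in ["chapter-1", "bad-section", "chapter-2"]:
        try:
            section_paragraphs = fetch_paragraphs(section)
        except Exception as e:
            errors_file.write(section + "\t" + str(e) + "\n")
            continue  # skip the join below for this failed section
        section_text = "<PARAGRAPH>".join(section_paragraphs)
        print(section, "->", section_text)
```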
5 changes: 4 additions & 1 deletion scripts/data_collection/sparknotes/get_summaries.py
@@ -74,12 +74,12 @@ def wrap_data(name, summary, analysis, url):

try:
soup = BeautifulSoup(urllib.request.urlopen(page_url), "html.parser")
-summary_section = soup.find("span", {"id": "Summary"}).find_parent("div")
except Exception as e:
print (page_url, e)
f_errors.write(title + "\t" + page_url + "\t" + str(e) + "\n")
continue

+summary_section = soup.find("span", {"id": "Summary"}).find_parent("div")
summary_links = summary_section.findAll("a")
summary_links = [link.get("href") for link in summary_links if "section" in link.get("href")]

@@ -125,6 +125,9 @@ def wrap_data(name, summary, analysis, url):

section_paragraphs.append(subsection_data.text.strip().replace("\n", " "))

+if section_paragraphs == []:
+    continue

section_text = "<PARAGRAPH>".join(section_paragraphs)

if "Summary:" in section_text and "Analysis:" in section_text:
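In BeautifulSoup, `soup.find(...)` returns `None` when the element is absent, so chaining `.find_parent("div")` onto a missing "Summary" span raises an `AttributeError`. A short sketch of guarding that lookup (placeholder URL, not the repository's code):

```
import urllib.request
from bs4 import BeautifulSoup

page_url = "https://example.com/some-title/"  # placeholder URL

try:
    soup = BeautifulSoup(urllib.request.urlopen(page_url), "html.parser")
except Exception as e:
    print(page_url, e)
    soup = None

if soup is not None:
    summary_span = soup.find("span", {"id": "Summary"})
    if summary_span is None:
        print("No Summary section on this page; skipping.")
    else:
        summary_section = summary_span.find_parent("div")
        print(summary_section is not None)
```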
2 changes: 1 addition & 1 deletion scripts/data_collection/sparknotes/get_works.py
@@ -30,7 +30,7 @@ def scrape_index_pages(seed_page):
except Exception as e:
print ("Skipping: ", seed_page)
errors_file.write(seed_page + "\t" + str(e) + "\n")
-continue
+return []

items = soup.findAll("li", {"class": "hub-AZ-list__card hub-AZ-list__card--byTitle"})
print("Found %d items." % len(items))
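Returning an empty list from the error path lets a caller that iterates over the scraped items proceed without special-casing failed seed pages. A minimal sketch of that shape (hypothetical `fetch_items` helper, not the repository's code):

```
def fetch_items(seed_page):
    # Hypothetical stand-in for the real network and parsing logic.
    raise RuntimeError("network error")

def scrape_index_pages(seed_page):
    try:
        return fetch_items(seed_page)
    except Exception as e:
        print("Skipping: ", seed_page, e)
        return []  # the caller's loop simply finds nothing to iterate

for item in scrape_index_pages("https://example.com/index"):
    print(item)  # never reached while the fetch always fails
```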
