Fixes to paths, Readme flags updated, extra checks added
jigsaw2212 committed Jun 4, 2021
1 parent af7f353 commit b5259ea
Showing 8 changed files with 24 additions and 15 deletions.
16 changes: 9 additions & 7 deletions README.md
@@ -75,8 +75,9 @@ Note: At the time of collecting the data, all links in literature_links.tsv were
For each data source, run `get_works.py` to first fetch the links for each book, and then run `get_summaries.py` to get the summaries from the collected links.

```
-python scripts/data_collection/cliffnotes/get_works.py
-python scripts/data_collection/cliffnotes/get_summaries.py
+cd scripts/data_collection/cliffnotes/
+python get_works.py
+python get_summaries.py
```

#### 3. Data Cleaning
@@ -85,30 +86,31 @@ Data Cleaning is performed through the following steps:

First, a script performs some basic cleaning operations, like removing parentheses, links, etc. from the summary text:
```
-python scripts/data_cleaning_scripts/basic_clean.py
+cd scripts/data_cleaning_scripts/
+python basic_clean.py
```

We use the intermediate alignments in summary_chapter_matched_all_sources.jsonl to identify which summaries are separable, and separate them into new summaries (e.g., a Chapters 1-3 summary is split into three files: a Chapter 1 summary, a Chapter 2 summary, and a Chapter 3 summary):
```
-python scripts/data_cleaning_scripts/split_aggregate_chaps_all_sources.py
+python split_aggregate_chaps_all_sources.py
```

Lastly, our final cleaning script uses various regexes to separate out analysis/commentary text and remove prefixes, suffixes, etc.:
```
-python scripts/data_cleaning_scripts/clean_summaries.py
+python clean_summaries.py
```

#### Data Alignments
Generating paragraph alignments from the chapter-level-summary-alignments is performed individually for the train/test/val splits:

Gather the data from the summaries and book chapters into a single jsonl
```
-python paragraph-level-summary-alignments/gather_data.py
+python paragraph-level-summary-alignments/gather_data.py --split_paragraphs
```

Generate alignments of the paragraphs with sentences from the summary using the bi-encoder **paraphrase-distilroberta-base-v1**
```
-python paragraph-level-summary-alignments/align_data_bi_encoder_paraphrase.py
+python paragraph-level-summary-alignments/align_data_bi_encoder_paraphrase.py --stable_alignment
```

Aggregate the generated alignments for cases where multiple sentences from chapter-summaries are matched to the same paragraph from the book
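The alignment step above pairs book paragraphs with sentences from the chapter summary. A typical bi-encoder alignment embeds both sides and pairs them by cosine similarity; a minimal sketch under that assumption, using the sentence-transformers package with the model named in the README (toy text, not the repository's align_data_bi_encoder_paraphrase.py):

```
from sentence_transformers import SentenceTransformer, util

# paraphrase-distilroberta-base-v1 is the bi-encoder named in the README;
# the paragraphs and summary sentences below are toy placeholders.
model = SentenceTransformer("paraphrase-distilroberta-base-v1")

paragraphs = [
    "The old sailor walked down to the harbour at dawn.",
    "A storm rose in the night and tore the sails apart.",
]
summary_sentences = [
    "A storm destroys the ship's sails.",
    "The sailor visits the harbour early in the morning.",
]

para_emb = model.encode(paragraphs, convert_to_tensor=True)
summ_emb = model.encode(summary_sentences, convert_to_tensor=True)

# Cosine similarity between every paragraph and every summary sentence.
scores = util.cos_sim(para_emb, summ_emb)

for i in range(len(paragraphs)):
    best = int(scores[i].argmax())
    print("paragraph", i, "-> summary sentence", best, "score", round(float(scores[i][best]), 3))
```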
1 change: 0 additions & 1 deletion
@@ -221,7 +221,6 @@ def main(args):
parser.add_argument('--join_strings', action='store_true')
parser.add_argument('--split_paragraphs', action='store_true')
parser.add_argument('--matched_file', type=str, required=True)
-#Split paragraphs does what jloin strings does automatically. Why?
args = parser.parse_args()

if args.join_strings and args.split_paragraphs or (not args.join_strings and not args.split_paragraphs):
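The check at the end of this hunk rejects runs where both or neither of `--join_strings` and `--split_paragraphs` is set. argparse can state that constraint declaratively with a mutually exclusive group; a minimal, self-contained sketch (not the repository's script):

```
import argparse

# The same either/or constraint, expressed with argparse's built-in
# mutually exclusive group instead of a manual boolean check.
parser = argparse.ArgumentParser()
group = parser.add_mutually_exclusive_group(required=True)
group.add_argument('--join_strings', action='store_true')
group.add_argument('--split_paragraphs', action='store_true')
parser.add_argument('--matched_file', type=str, required=True)
args = parser.parse_args()

print(args.join_strings, args.split_paragraphs, args.matched_file)
```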
4 changes: 2 additions & 2 deletions scripts/data_cleaning/basic_clean.py
@@ -15,7 +15,7 @@
import re
from unidecode import unidecode

-sources = ['gradesaver', 'shmoop', 'cliffnotes', 'sparknotes', 'barronbooks', 'bookwolf', 'novelguide', 'thebestnotes']
+sources = ['gradesaver', 'shmoop', 'cliffnotes', 'sparknotes', 'pinkmonkey', 'bookwolf', 'novelguide', 'thebestnotes']

BASE_DIR = "../raw_summaries/"

@@ -104,4 +104,4 @@
book_dest_path = os.path.join(book_dest, section_out)

with open(book_dest_path, 'w') as outfile:
-json.dump(summary_json, outfile)
\ No newline at end of file
+json.dump(summary_json, outfile)
4 changes: 2 additions & 2 deletions scripts/data_cleaning/clean_summaries.py
@@ -16,7 +16,7 @@
from os.path import basename

# We clean one source at a time
-sources = ['gradesaver', 'shmoop', 'cliffnotes', 'sparknotes','barronbooks', 'bookwolf', 'novelguide', 'thebestnotes']
+sources = ['gradesaver', 'shmoop', 'cliffnotes', 'sparknotes','pinkmonkey', 'bookwolf', 'novelguide', 'thebestnotes']

for ix, source in tqdm(enumerate(sources)):

@@ -251,4 +251,4 @@ def remove_chapter_prefixes(line):
with open(section_path, "w") as fout:
json.dump(new_json_dict, fout)

-print ("book_count: ", book_count)
\ No newline at end of file
+print ("book_count: ", book_count)
6 changes: 5 additions & 1 deletion scripts/data_cleaning/split_aggregate_chaps_all_sources.py
@@ -392,7 +392,11 @@ def get_summary_files_chapter_count(x):
if (book_unique_id != prev_book_unique_id) and prev_book_unique_id != "":
summaries_counted = 0

-summary_path = x['summary_path']
+summary_path = os.path.join("../", x['summary_path'])
+
+if not os.path.exists(summary_path):
+    # Summary directory missing
+    continue

num_toc_lines, summary_dir_count = get_summary_files_chapter_count(x)

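The added check skips alignment entries whose summary directory is not on disk, after prepending "../" so the path resolves from where the script is run. A small sketch of resolving such paths against the script's own location instead of the current working directory (the helper name and example path are illustrative, not from the repository):

```
import os

# Resolve a summary_path stored relative to the script's parent directory,
# regardless of where the script is launched from, and report whether the
# summary directory actually exists on disk.
PARENT_DIR = os.path.normpath(os.path.join(os.path.dirname(os.path.abspath(__file__)), ".."))

def resolve_summary_path(relative_path):
    candidate = os.path.normpath(os.path.join(PARENT_DIR, relative_path))
    return candidate if os.path.exists(candidate) else None

if __name__ == "__main__":
    # Hypothetical entry; real values come from the matched alignment file.
    print(resolve_summary_path("raw_summaries/gradesaver/example_book"))
```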
1 change: 1 addition & 0 deletions scripts/data_collection/cliffnotes/get_summaries.py
@@ -108,6 +108,7 @@ def scrape_section_continuation(parent_soup, section_header):

print (section, e)
errors_file.write(section + "\t" + str(e) + "\n")
+continue

section_text = "<PARAGRAPH>".join(section_paragraphs).replace("Continued on next page...", "")

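The added `continue` makes the scraping loop skip a section once its error has been logged, instead of falling through to the `<PARAGRAPH>` join with incomplete data. A self-contained sketch of that pattern (toy sections and a hypothetical `fetch_paragraphs` helper, not the repository's scraper):

```
def fetch_paragraphs(section):
    # Hypothetical stand-in for the real scraping logic.
    if section == "bad-section":
        raise ValueError("page not found")
    return ["First paragraph of the section.", "Second paragraph of the section."]

with open("section_errors.tsv", "w") as errors_file:
    for section in ["chapter-1", "bad-section", "chapter-2"]:
        try:
            section_paragraphs = fetch_paragraphs(section)
        except Exception as e:
            errors_file.write(section + "\t" + str(e) + "\n")
            continue  # skip the join below for this failed section
        section_text = "<PARAGRAPH>".join(section_paragraphs)
        print(section, "->", section_text)
```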
5 changes: 4 additions & 1 deletion scripts/data_collection/sparknotes/get_summaries.py
@@ -74,12 +74,12 @@ def wrap_data(name, summary, analysis, url):

try:
soup = BeautifulSoup(urllib.request.urlopen(page_url), "html.parser")
-summary_section = soup.find("span", {"id": "Summary"}).find_parent("div")
except Exception as e:
print (page_url, e)
f_errors.write(title + "\t" + page_url + "\t" + str(e) + "\n")
continue

+summary_section = soup.find("span", {"id": "Summary"}).find_parent("div")
summary_links = summary_section.findAll("a")
summary_links = [link.get("href") for link in summary_links if "section" in link.get("href")]

@@ -125,6 +125,9 @@ def wrap_data(name, summary, analysis, url):

section_paragraphs.append(subsection_data.text.strip().replace("\n", " "))

+if section_paragraphs == []:
+    continue

section_text = "<PARAGRAPH>".join(section_paragraphs)

if "Summary:" in section_text and "Analysis:" in section_text:
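In BeautifulSoup, `soup.find(...)` returns `None` when the element is absent, so chaining `.find_parent("div")` onto a missing "Summary" span raises an `AttributeError`. A short sketch of guarding that lookup (placeholder URL, not the repository's code):

```
import urllib.request
from bs4 import BeautifulSoup

page_url = "https://example.com/some-title/"  # placeholder URL

try:
    soup = BeautifulSoup(urllib.request.urlopen(page_url), "html.parser")
except Exception as e:
    print(page_url, e)
    soup = None

if soup is not None:
    summary_span = soup.find("span", {"id": "Summary"})
    if summary_span is None:
        print("No Summary section on this page; skipping.")
    else:
        summary_section = summary_span.find_parent("div")
        print(summary_section is not None)
```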
2 changes: 1 addition & 1 deletion scripts/data_collection/sparknotes/get_works.py
@@ -30,7 +30,7 @@ def scrape_index_pages(seed_page):
except Exception as e:
print ("Skipping: ", seed_page)
errors_file.write(seed_page + "\t" + str(e) + "\n")
-continue
+return []

items = soup.findAll("li", {"class": "hub-AZ-list__card hub-AZ-list__card--byTitle"})
print("Found %d items." % len(items))
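Returning an empty list from the error path lets a caller that iterates over the scraped items proceed without special-casing failed seed pages. A minimal sketch of that shape (hypothetical `fetch_items` helper, not the repository's code):

```
def fetch_items(seed_page):
    # Hypothetical stand-in for the real network and parsing logic.
    raise RuntimeError("network error")

def scrape_index_pages(seed_page):
    try:
        return fetch_items(seed_page)
    except Exception as e:
        print("Skipping: ", seed_page, e)
        return []  # the caller's loop simply finds nothing to iterate

for item in scrape_index_pages("https://example.com/index"):
    print(item)  # never reached while the fetch always fails
```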
