Changes to file paths, handling missing summaries, updated README
jigsaw2212 committed Jun 5, 2021
1 parent b5259ea commit f94c7fb
Showing 10 changed files with 13,031 additions and 13,026 deletions.
12 changes: 7 additions & 5 deletions README.md
@@ -103,24 +103,26 @@ python clean_summaries.py
#### Data Alignments
Generating paragraph alignments from the chapter-level-summary-alignments is performed individually for the train/test/val splits:

-Gather the data from the summaries and book chapters into a single jsonl
+Gather the data from the summaries and book chapters into a single jsonl. The script needs to be run separately for each split, as the matched file is different for each split.
```
-python paragraph-level-summary-alignments/gather_data.py --split_paragraphs
+cd paragraph-level-summary-alignments
+python gather_data.py --matched_file /path/to/chapter_summary_aligned_{train/test/val}_split.jsonl --split_paragraphs
```
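Running this produces a gathered file (e.g. `chapter_summary_aligned_train_split.jsonl.gathered`) that the alignment step below reads via `--data_path`.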

Generate alignments of the paragraphs with sentences from the summary using the bi-encoder **paraphrase-distilroberta-base-v1**
```
-python paragraph-level-summary-alignments/align_data_bi_encoder_paraphrase.py --stable_alignment
+python align_data_bi_encoder_paraphrase.py --data_path /path/to/chapter_summary_aligned_{train/test/val}_split.jsonl.gathered --stable_alignment
```
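For intuition, the alignment step scores every (summary sentence, paragraph) pair with the bi-encoder before matching. A minimal sketch using sentence-transformers, assuming the model named above; this is not the script's exact `compute_similarities_bi_encoder`:
```
from sentence_transformers import SentenceTransformer, util


def bi_encoder_similarities(paragraphs, summary_sentences):
    """Cosine-similarity matrix: rows are summary sentences, columns are paragraphs."""
    model = SentenceTransformer("paraphrase-distilroberta-base-v1")
    # Both sides are embedded into the same vector space
    paragraph_emb = model.encode(paragraphs, convert_to_tensor=True)
    summary_emb = model.encode(summary_sentences, convert_to_tensor=True)
    return util.pytorch_cos_sim(summary_emb, paragraph_emb)
```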

-Aggregate the generated alignments for cases where multiple sentences from chapter-summaries are matched to the same paragraph from the book
+Aggregate the generated alignments for cases where multiple sentences from a chapter summary are matched to the same paragraph from the book chapter
```
-python paragraph-level-summary-alignments/aggregate_paragraph_alignments_bi_encoder_paraphrase.py
+python aggregate_paragraph_alignments_bi_encoder_paraphrase.py --file train/test/val
```

## Troubleshooting
1. The web archive links we collect the summaries from can be unreliable and slow to load. One way to mitigate this is to sleep for longer and retry when a link throws an exception, as some of the scripts already do (a sketch follows this list).
2. Links that repeatedly throw errors are aggregated in a file called 'section_errors.txt'. This is useful for checking which links are actually unavailable and for re-running the data collection scripts on just those links.
3. Some paths in the provided files might throw errors depending on where the chapterized books were downloaded. It is recommended to download them into the booksum root directory so that the scripts work without any modifications to the paths.
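A minimal sketch of the retry idea from item 1; the function name, URL handling, and timeout values are illustrative assumptions, not the exact logic in the collection scripts:
```
import time
import urllib.request


def fetch_with_backoff(url, retries=5, base_delay=5.0):
    """Fetch a flaky web archive page, sleeping longer after each failure."""
    for attempt in range(retries):
        try:
            with urllib.request.urlopen(url, timeout=30) as resp:
                return resp.read().decode("utf-8")
        except Exception:
            # Back off: 5s, 10s, 20s, ... before retrying the unreliable link
            time.sleep(base_delay * (2 ** attempt))
    raise RuntimeError(f"{url} still failing after {retries} attempts")
```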


## Get Involved

3,156 changes: 1,578 additions & 1,578 deletions alignments/chapter-level-summary-alignments/chapter_summary_aligned_test_split.jsonl


19,262 changes: 9,631 additions & 9,631 deletions alignments/chapter-level-summary-alignments/chapter_summary_aligned_train_split.jsonl


2,784 changes: 1,392 additions & 1,392 deletions alignments/chapter-level-summary-alignments/chapter_summary_aligned_val_split.jsonl


@@ -51,7 +51,7 @@ def main(args):

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
-    parser.add_argument('--file', type=str, required=True)
+    parser.add_argument('--file', type=str, required=True, choices=["train", "test", "val"])
    args = parser.parse_args()

    main(args)
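Usage follows the README, one split at a time, e.g. `python aggregate_paragraph_alignments_bi_encoder_paraphrase.py --file train`; with `choices` set, argparse rejects any value other than the three split names at parse time.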
@@ -1,5 +1,6 @@
"""
-Script used to generate bi-encoder paraphrase alignment of the paragraphs with sentences from the summary
+Script used to generate bi-encoder paraphrase alignment of the paragraphs with sentences from the summary.
+It is recommended to run this script on a GPU machine.
"""

#!/usr/bin/env python
@@ -32,6 +33,8 @@
from transformers import AutoTokenizer
from nltk.tokenize import word_tokenize, sent_tokenize

+from sentence_transformers import SentenceTransformer, util

# change recursion limit
sys.setrecursionlimit(5000)

@@ -172,9 +175,6 @@ def gather_data(alignments_bi_encoder_paraphrase, paragraphs, summaries, similar

        examples.append(example)

-        # pp.pprint(example)
-        # print ("============================")

    return examples


@@ -223,7 +223,7 @@ def main(args):

    # compute similarities
    # Initially we tried both roberta and paraphrase bi encoder
-    similarity_matrix_bi_encoder_paraphrase, similarity_matrix_bi_encoder_roberta = compute_similarities_bi_encoder(paragraphs, summaries)
+    similarity_matrix_bi_encoder_paraphrase = compute_similarities_bi_encoder(paragraphs, summaries)

    # For all our experimental results, we perform stable alignment
    if args.stable_alignment:
@@ -236,7 +236,7 @@

        # visualize_alignments(similarity_matrix_bi_encoder_paraphrase, stable_alignments_bi_encoder_paraphrase, title, args.output_dir)

-        with open(basename(args.data_path) + ".stable.bi_encoder_paraphrase", "a") as fd:
+        with open(basename(args.data_path) + ".stable.bi_encoder_paraphrase", "w") as fd:
            for stable_example in stable_examples:
                fd.write(json.dumps(stable_example) + "\n")

@@ -247,12 +247,10 @@
        greedy_alignments = align_data_greedy_matching(similarity_matrix_bi_encoder_paraphrase)
        greedy_examples = gather_data(greedy_alignments, paragraphs, summaries, similarity_matrix_bi_encoder_paraphrase, title)

-        with open(basename(args.data_path) + ".greedy", "a") as fd:
+        with open(basename(args.data_path) + ".greedy", "w") as fd:
            for greedy_example in greedy_examples:
                fd.write(json.dumps(greedy_example) + "\n")

-    # break
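For intuition, greedy matching simply assigns each summary sentence to its highest-scoring paragraph. A rough sketch under that assumption; not the script's exact `align_data_greedy_matching`:
```
import numpy as np


def greedy_alignment(similarity_matrix):
    """similarity_matrix[i][j] scores summary sentence i against paragraph j."""
    sims = np.asarray(similarity_matrix)
    # Unlike stable matching, several sentences may pick the same paragraph,
    # which is what the aggregation script later merges.
    return {i: int(sims[i].argmax()) for i in range(sims.shape[0])}
```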

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_path', type=str, help='path to input data file')
15 changes: 10 additions & 5 deletions alignments/paragraph-level-summary-alignments/gather_data.py
@@ -11,9 +11,6 @@

from tqdm import tqdm




def fix_leftover_headers(summary_content):
"""
Function removes leftover prefixes from the summary text, such as: Chapter 1, Chapter V, Analysis, etc.
@@ -173,10 +170,13 @@ def main(args):

    spacy_nlp = spacy.load('en_core_web_lg', disable=["tagger", "ner", "textcat", "lemmatizer"])

+    CHAPTERIZED_BOOKS_DIR = "../../"
+    FINISHED_SUMMARIES_DIR = "../../scripts/"
+
    # gather data
    processed_data = []
    for example in tqdm(raw_data):
-        with open(example["chapter_path"]) as fd:
+        with open(os.path.join(CHAPTERIZED_BOOKS_DIR, example["chapter_path"])) as fd:
            if args.join_strings:
                chapter_content = " ".join([line.strip() for line in fd.readlines()])
            elif args.split_paragraphs:
@@ -186,7 +186,12 @@
            else:
                raise RuntimeError("Unknown processing option")

-        with open(example["summary_path"]) as fd:
+        summary_path = os.path.join(FINISHED_SUMMARIES_DIR, example["summary_path"])
+
+        if not os.path.exists(summary_path):
+            continue
+
+        with open(summary_path) as fd:
            if args.join_strings:
                summary_content = " ".join([line.strip() for line in json.loads(fd.read())["summary"]])
            elif args.split_paragraphs:
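Note that `CHAPTERIZED_BOOKS_DIR = "../../"` and `FINISHED_SUMMARIES_DIR = "../../scripts/"` are resolved against the working directory: when the script is run from `paragraph-level-summary-alignments` as the README shows, `../../` is the booksum root, which is why the troubleshooting section recommends downloading the chapterized books there.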
