Changes to file paths, handling missing summaries, updated README
jigsaw2212 committed Jun 5, 2021
1 parent b5259ea commit f94c7fb
Showing 10 changed files with 13,031 additions and 13,026 deletions.
12 changes: 7 additions & 5 deletions README.md
@@ -103,24 +103,26 @@ python clean_summaries.py
#### Data Alignments
Generating paragraph alignments from the chapter-level-summary-alignments is performed individually for the train/test/val splits:

-Gather the data from the summaries and book chapters into a single jsonl
+Gather the data from the summaries and book chapters into a single jsonl. The script needs to be run separately for each split, as the matched file is different for each split.
```
-python paragraph-level-summary-alignments/gather_data.py --split_paragraphs
+cd paragraph-level-summary-alignments
+python gather_data.py --matched_file /path/to/chapter_summary_aligned_{train/test/val}_split.jsonl --split_paragraphs
```
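Running this produces a gathered file (e.g. `chapter_summary_aligned_train_split.jsonl.gathered`) that the alignment step below reads via `--data_path`.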

Generate alignments of the paragraphs with sentences from the summary using the bi-encoder **paraphrase-distilroberta-base-v1**
```
-python paragraph-level-summary-alignments/align_data_bi_encoder_paraphrase.py --stable_alignment
+python align_data_bi_encoder_paraphrase.py --data_path /path/to/chapter_summary_aligned_{train/test/val}_split.jsonl.gathered --stable_alignment
```
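For intuition, the alignment step scores every (summary sentence, paragraph) pair with the bi-encoder before matching. A minimal sketch using sentence-transformers, assuming the model named above; this is not the script's exact `compute_similarities_bi_encoder`:
```
from sentence_transformers import SentenceTransformer, util


def bi_encoder_similarities(paragraphs, summary_sentences):
    """Cosine-similarity matrix: rows are summary sentences, columns are paragraphs."""
    model = SentenceTransformer("paraphrase-distilroberta-base-v1")
    # Both sides are embedded into the same vector space
    paragraph_emb = model.encode(paragraphs, convert_to_tensor=True)
    summary_emb = model.encode(summary_sentences, convert_to_tensor=True)
    return util.pytorch_cos_sim(summary_emb, paragraph_emb)
```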

-Aggregate the generated alignments for cases where multiple sentences from chapter-summaries are matched to the same paragraph from the book
+Aggregate the generated alignments for cases where multiple sentences from a chapter summary are matched to the same paragraph from the book chapter
```
-python paragraph-level-summary-alignments/aggregate_paragraph_alignments_bi_encoder_paraphrase.py
+python aggregate_paragraph_alignments_bi_encoder_paraphrase.py --file train/test/val
```

## Troubleshooting
1. The web archive links we collect the summaries from can be unreliable and slow to load. One way to mitigate this is to sleep for longer and retry when a link throws an exception, as some of the scripts already do (a sketch follows this list).
2. Links that repeatedly throw errors are aggregated in a file called 'section_errors.txt'. This is useful for checking which links are actually unavailable and for re-running the data collection scripts on just those links.
3. Some paths in the provided files might throw errors depending on where the chapterized books were downloaded. It is recommended to download them into the booksum root directory so that the scripts work without any modifications to the paths.
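A minimal sketch of the retry idea from item 1; the function name, URL handling, and timeout values are illustrative assumptions, not the exact logic in the collection scripts:
```
import time
import urllib.request


def fetch_with_backoff(url, retries=5, base_delay=5.0):
    """Fetch a flaky web archive page, sleeping longer after each failure."""
    for attempt in range(retries):
        try:
            with urllib.request.urlopen(url, timeout=30) as resp:
                return resp.read().decode("utf-8")
        except Exception:
            # Back off: 5s, 10s, 20s, ... before retrying the unreliable link
            time.sleep(base_delay * (2 ** attempt))
    raise RuntimeError(f"{url} still failing after {retries} attempts")
```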


## Get Involved

3,156 changes: 1,578 additions & 1,578 deletions alignments/chapter-level-summary-alignments/chapter_summary_aligned_test_split.jsonl


19,262 changes: 9,631 additions & 9,631 deletions alignments/chapter-level-summary-alignments/chapter_summary_aligned_train_split.jsonl


2,784 changes: 1,392 additions & 1,392 deletions alignments/chapter-level-summary-alignments/chapter_summary_aligned_val_split.jsonl


@@ -51,7 +51,7 @@ def main(args):

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
-    parser.add_argument('--file', type=str, required=True)
+    parser.add_argument('--file', type=str, required=True, choices=["train", "test", "val"])
    args = parser.parse_args()

    main(args)
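Usage follows the README, one split at a time, e.g. `python aggregate_paragraph_alignments_bi_encoder_paraphrase.py --file train`; with `choices` set, argparse rejects any value other than the three split names at parse time.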
@@ -1,5 +1,6 @@
"""
-Script used to generate bi-encoder paraphrase alignment of the paragraphs with sentences from the summary
+Script used to generate bi-encoder paraphrase alignment of the paragraphs with sentences from the summary.
+It is recommended to run this script on a GPU machine.
"""

#!/usr/bin/env python
@@ -32,6 +33,8 @@
from transformers import AutoTokenizer
from nltk.tokenize import word_tokenize, sent_tokenize

+from sentence_transformers import SentenceTransformer, util

# change recursion limit
sys.setrecursionlimit(5000)

@@ -172,9 +175,6 @@ def gather_data(alignments_bi_encoder_paraphrase, paragraphs, summaries, similar

        examples.append(example)

-        # pp.pprint(example)
-        # print ("============================")

    return examples


@@ -223,7 +223,7 @@ def main(args):

    # compute similarities
    # Initially we tried both roberta and paraphrase bi encoder
-    similarity_matrix_bi_encoder_paraphrase, similarity_matrix_bi_encoder_roberta = compute_similarities_bi_encoder(paragraphs, summaries)
+    similarity_matrix_bi_encoder_paraphrase = compute_similarities_bi_encoder(paragraphs, summaries)

    # For all our experimental results, we perform stable alignment
    if args.stable_alignment:
@@ -236,7 +236,7 @@

        # visualize_alignments(similarity_matrix_bi_encoder_paraphrase, stable_alignments_bi_encoder_paraphrase, title, args.output_dir)

-        with open(basename(args.data_path) + ".stable.bi_encoder_paraphrase", "a") as fd:
+        with open(basename(args.data_path) + ".stable.bi_encoder_paraphrase", "w") as fd:
            for stable_example in stable_examples:
                fd.write(json.dumps(stable_example) + "\n")

@@ -247,12 +247,10 @@
        greedy_alignments = align_data_greedy_matching(similarity_matrix_bi_encoder_paraphrase)
        greedy_examples = gather_data(greedy_alignments, paragraphs, summaries, similarity_matrix_bi_encoder_paraphrase, title)

-        with open(basename(args.data_path) + ".greedy", "a") as fd:
+        with open(basename(args.data_path) + ".greedy", "w") as fd:
            for greedy_example in greedy_examples:
                fd.write(json.dumps(greedy_example) + "\n")

-    # break
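For intuition, greedy matching simply assigns each summary sentence to its highest-scoring paragraph. A rough sketch under that assumption; not the script's exact `align_data_greedy_matching`:
```
import numpy as np


def greedy_alignment(similarity_matrix):
    """similarity_matrix[i][j] scores summary sentence i against paragraph j."""
    sims = np.asarray(similarity_matrix)
    # Unlike stable matching, several sentences may pick the same paragraph,
    # which is what the aggregation script later merges.
    return {i: int(sims[i].argmax()) for i in range(sims.shape[0])}
```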

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_path', type=str, help='path to input data file')
15 changes: 10 additions & 5 deletions alignments/paragraph-level-summary-alignments/gather_data.py
@@ -11,9 +11,6 @@

from tqdm import tqdm




def fix_leftover_headers(summary_content):
"""
Function removes leftover prefixes from the summary text, such as: Chapter 1, Chapter V, Analysis, etc.
@@ -173,10 +170,13 @@ def main(args):

    spacy_nlp = spacy.load('en_core_web_lg', disable=["tagger", "ner", "textcat", "lemmatizer"])

+    CHAPTERIZED_BOOKS_DIR = "../../"
+    FINISHED_SUMMARIES_DIR = "../../scripts/"
+
    # gather data
    processed_data = []
    for example in tqdm(raw_data):
-        with open(example["chapter_path"]) as fd:
+        with open(os.path.join(CHAPTERIZED_BOOKS_DIR, example["chapter_path"])) as fd:
            if args.join_strings:
                chapter_content = " ".join([line.strip() for line in fd.readlines()])
            elif args.split_paragraphs:
@@ -186,7 +186,12 @@
            else:
                raise RuntimeError("Unknown processing option")

-        with open(example["summary_path"]) as fd:
+        summary_path = os.path.join(FINISHED_SUMMARIES_DIR, example["summary_path"])
+
+        if not os.path.exists(summary_path):
+            continue
+
+        with open(summary_path) as fd:
            if args.join_strings:
                summary_content = " ".join([line.strip() for line in json.loads(fd.read())["summary"]])
            elif args.split_paragraphs:
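Note that `CHAPTERIZED_BOOKS_DIR = "../../"` and `FINISHED_SUMMARIES_DIR = "../../scripts/"` are resolved against the working directory: when the script is run from `paragraph-level-summary-alignments` as the README shows, `../../` is the booksum root, which is why the troubleshooting section recommends downloading the chapterized books there.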
