
fix links for stackoverflow archive #2


Open · wants to merge 6 commits into master
5 changes: 5 additions & 0 deletions .gitignore
@@ -0,0 +1,5 @@
__pycache__
dumps
out*
logs*
test*
2 changes: 2 additions & 0 deletions README.md
@@ -1,5 +1,6 @@
# stackexchange_dataset
A python tool for downloading & processing the [stackexchange data dumps](https://archive.org/details/stackexchange) into a text dataset for Language Models.
+Documentation for the dump's database schema can be found [here](https://meta.stackexchange.com/questions/2677/database-schema-documentation-for-the-public-data-dump-and-sede/2678#2678).

Download the whole processed dataset [here](https://eaidata.bmk.sh/data/stackexchange_dataset.tar)

@@ -10,6 +11,7 @@ cd stackexchange_dataset
pip install -r requirements.txt
```
# Usage
+The default output format for parsed data is .zip. To create an lm_dataformat dataset instead, pass the option `--out_format=lm_dataformat` to the commands below.

To download *every* stackexchange dump & parse to text, simply run
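The command itself is collapsed in this diff view; judging from the argparse options added in main.py further down, the invocation is presumably:

```
python3 main.py --names all
# or, to produce the lm_dataformat output described above:
python3 main.py --names all --out_format=lm_dataformat
```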

14 changes: 6 additions & 8 deletions downloader.py
@@ -3,7 +3,7 @@
from utils import *
import py7zr


+curr_dir = os.path.dirname(__file__)
class Stack_Exchange_Downloader():

def __init__(self, name):
@@ -22,18 +22,18 @@ def parse_sitesmap(self, sitesmap):
site_name = url.replace(".com", "").replace(".net", "")
download_link = "https://archive.org/download/stackexchange/" + url + ".7z"
if url == "stackoverflow.com":
-download_link = "https://archive.org/download/stackexchange/stackoverflow.com-Posts.7z"
+download_link = "https://archive.org/download/stackexchange/Stackoverflow.com-Posts.7z"
self.sites[site_name] = {"url" : url, "download" : download_link}

def download(self):
if self.name == "all":
for k in self.sites:
command = "wget {} -P dumps".format(self.sites[k]["download"])
command = "wget {} -P {}/dumps".format(curr_dir, self.sites[k]["download"])
print(command)
if os.system(command):
print('Download for {} failed!'.format(k))
else:
command = "wget {} -P dumps".format(self.sites[self.name]["download"])
command = "wget {} -P {}/dumps".format(curr_dir, self.sites[self.name]["download"])
print(command)
if os.system(command):
print('Download for {} failed!'.format(self.name))
@@ -45,8 +45,7 @@ def extract(self):
# , mode='r'))
# archive.extractall()
# archive.close()
command = "py7zr x dumps/{} dumps/{}".format(self.sites[k]["download"].replace("https://archive.org/download/stackexchange/", ""),
k)
command = "py7zr x {}/dumps/{} {}/dumps/{}".format(curr_dir, self.sites[k]["download"].replace("https://archive.org/download/stackexchange/", ""), curr_dir, k)
print(command)
if os.system(command):
print('Extraction for {} failed!'.format(k))
@@ -56,8 +55,7 @@ def extract(self):
# , mode='r'))
# archive.extractall()
# archive.close()
command = "py7zr x dumps/{} dumps/{}".format(self.sites[self.name]["download"].replace("https://archive.org/download/stackexchange/", ""),
self.name)
command = "py7zr x {}/dumps/{} {}/dumps/{}".format(curr_dir, self.sites[self.name]["download"].replace("https://archive.org/download/stackexchange/", ""), curr_dir, self.name)
print(command)
if os.system(command):
print('Extraction for {} failed!'.format(self.name))
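The thrust of this file's changes is to anchor the dumps directory at the script's own location rather than the caller's working directory. A minimal sketch of that pattern (wget_command is an illustrative helper, not part of the PR; note the format-argument order, URL first, directory second):

```python
import os

# directory containing this script, so paths resolve from any cwd
curr_dir = os.path.dirname(os.path.abspath(__file__))

def wget_command(download_url):
    # e.g. wget https://archive.org/download/stackexchange/<site>.7z -P <repo>/dumps
    dumps_dir = os.path.join(curr_dir, "dumps")
    return "wget {} -P {}".format(download_url, dumps_dir)
```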
89 changes: 56 additions & 33 deletions main.py
@@ -7,59 +7,71 @@
from itertools import repeat
from lm_dataformat import Archive
import zipfile
+import os
+import json


+curr_dir = os.path.dirname(__file__)
def download_and_process_single(name, out_format, min_score, max_responses):
try:
name = name.strip().lower()
os.makedirs("dumps", exist_ok=True)
os.makedirs("{}/dumps".format(curr_dir), exist_ok=True)
s = Stack_Exchange_Downloader(name)
path_to_xml = "dumps/{}/Posts.xml".format(name)
# *.7z files are downloaded from "https://archive.org/download/stackexchange/
if name != "stackoverflow":
path_to_7z = "dumps/{}.7z".format(s.sites[name]["url"])
path_to_7z = "{}/dumps/{}.7z".format(curr_dir,s.sites[name]["url"])
else:
path_to_7z = "dumps/stackoverflow.com-Posts.7z"
out_folder = "out".format(name)
os.makedirs(out_folder, exist_ok=True)
path_to_7z = "{}/dumps/Stackoverflow.com-Posts.7z".format(curr_dir)
if not os.path.isfile(path_to_7z):
# download 7z if it's not downloaded already
s.download()

+# *.xml files are extracted from *.7z files using py7zr
+path_to_xml = "{}/dumps/{}/Posts.xml".format(curr_dir, name)
if not os.path.isfile(path_to_xml):
# extract 7z if it's not extracted already
s.extract()

out_folder = "{}/out".format(curr_dir)
# out_folder = "{}/../../../suriyagwu/stackexchange/all".format(curr_dir)
os.makedirs(out_folder, exist_ok=True)
os.makedirs("{}/samples".format(out_folder), exist_ok=True)
os.makedirs("{}/misc".format(out_folder), exist_ok=True)
if out_format == "lm_dataformat":
archiver = Archive(out_folder)
elif out_format == "zip":
archiver = zipfile.ZipFile('{}/{}.zip'.format(out_folder, name), 'a')
else:
archiver = None
-qa = QA_Pairer(path_to_xml, name=name, out_format=out_format, archiver=archiver, min_score=min_score, max_responses=max_responses)
+qa = QA_Pairer(path_to_xml, name=name, out_folder=out_folder, out_format=out_format, archiver=archiver, min_score=min_score, max_responses=max_responses)
qa.main()
if out_format == "lm_dataformat":
archiver.commit(name)
elif out_format == "zip":
archiver.close()
-try:
-    os.remove(path_to_7z)
-except FileNotFoundError:
-    print('ERROR: FileNotFoundError: File {} not found'.format(s.sites[name]["url"]))
-filelist = [f for f in os.listdir("dumps/{}".format(name)) if f.endswith(".xml")]
-for f in filelist:
-    os.remove(os.path.join("dumps/{}".format(name), f))

+# save qa.questions dictionary data to a file
+json.dump(qa.questions, open("{}/misc/{}_unprocessed_questions.json".format(out_folder, name), "w"), indent=4)

+# try:
+#     os.remove(path_to_7z)
+# except FileNotFoundError:
+#     print('ERROR: FileNotFoundError: File {} not found'.format(s.sites[name]["url"]))
+# filelist = [f for f in os.listdir("dumps/{}".format(name)) if f.endswith(".xml")]
+# for f in filelist:
+#     os.remove(os.path.join("dumps/{}".format(name), f))
except:
traceback.print_exc()


def main(args):
-names = args.names.split(',')
+names = args.names.split(',')
if names[0].strip().lower() == "all":
s = Stack_Exchange_Downloader("all")
names = []
for k in s.sites:
names.append(k)
-# bring stackoverflow to the front so it is always processed first, since it's the largest
if "stackoverflow" in names:
-names.insert(0, names.pop(names.index("stackoverflow")))
+print('Removing stackoverflow from the list of sites to process. Process it separately.')
+names.pop(names.index("stackoverflow"))
print('Downloading and processing stackexchange dumps for {}'.format(names))
# Download & Process
# init pool with as many CPUs as available
@@ -70,20 +82,31 @@ def main(args):

if __name__ == "__main__":
parser = argparse.ArgumentParser(
-description='CLI for stackexchange_dataset - A tool for downloading & processing stackexchange dumps in xml form to a raw '
-            'question-answer pair text dataset for Language Models')
-parser.add_argument('--names', help='names of stackexchanges to download, extract & parse, separated by commas. '
-                    'If "all", will download, extract & parse *every* stackoverflow site',
-                    default="3dprinting.stackexchange,3dprinting.meta.stackexchange",
-                    type=str)
-parser.add_argument('--out_format', help='format of out file - if you are processing everything this will need to be '
-                    'lm_dataformat, as you will run into number of files per directory limits.',
-                    default="zip",
-                    type=str)
-parser.add_argument('--min_score', help='minimum score of a response in order to be included in the dataset. Default 3.',
-                    type=int, default=3)
-parser.add_argument('--max_responses', help='maximum number of responses (sorted by score) to include for each question. '
-                    'Default 3.', type=int, default=3)
+description='CLI for stackexchange_dataset - A tool for downloading & processing stackexchange dumps in xml form to a raw question-answer pair text dataset for Language Models')
+parser.add_argument(
+    '--names',
+    help='names of stackexchanges to download, extract & parse, separated by commas. If "all", will download, extract & parse *every* stackoverflow site',
+    default="3dprinting.stackexchange,3dprinting.meta.stackexchange",
+    type=str
+)
+parser.add_argument(
+    '--out_format',
+    help='format of out file - if you are processing everything this will need to be lm_dataformat, as you will run into number of files per directory limits.',
+    default="zip",
+    type=str
+)
+parser.add_argument(
+    '--min_score',
+    help='minimum score of a response in order to be included in the dataset. Default 3.',
+    type=int,
+    default=3
+)
+parser.add_argument(
+    '--max_responses',
+    help='maximum number of responses (sorted by score) to include for each question. Default 100.',
+    type=int,
+    default=100
+)
args = parser.parse_args()
main(args)
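For orientation, the pool fan-out that the "init pool with as many CPUs as available" comment refers to is collapsed in this diff view; a sketch of what it presumably looks like, inferred from the itertools.repeat import (run_all is an illustrative name, not part of the PR):

```python
from itertools import repeat
from multiprocessing import Pool, cpu_count

def run_all(names, args):
    # one task per site; starmap unpacks each tuple into
    # download_and_process_single(name, out_format, min_score, max_responses)
    with Pool(cpu_count()) as pool:
        pool.starmap(download_and_process_single,
                     zip(names, repeat(args.out_format),
                         repeat(args.min_score), repeat(args.max_responses)))
```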

45 changes: 45 additions & 0 deletions main_filter.py
@@ -0,0 +1,45 @@
'''
Filters the stackoverflow dataset generated by main.py in lm_dataformat.
Edit filter_logic to filter on any field of a sample's meta dict.
'''
from lm_dataformat import Reader, Archive
import os
import glob

curr_dir = os.path.dirname(__file__)
out_folder = "{}/../../../suriyagwu/stackexchange/stackoverflow".format(curr_dir)
full_data_file = "{}/data_*stackoverflow.jsonl.zst".format(out_folder)

class FilterMeta:
    # currently implements an OR over the filter tags
def __init__(self):
self.filter_tags = ['python']
self.file_suffix = "_"+"_".join(self.filter_tags)

def filter_logic(self,meta):
for tag in self.filter_tags:
if tag in meta['tags']:
return True
return False

new_archive = Archive(out_folder)
filter_meta = FilterMeta()
num_accepted = 0
num_rejected = 0
for filename in glob.glob(full_data_file):
# create a reader object for the file
print("Filtering {}".format(filename))
rdr = Reader(filename)
# iterate over the samples in the file
for doc, meta in rdr.stream_data(get_meta=True):
if filter_meta.filter_logic(meta):
# add the sample to the new archive
new_archive.add_data(doc, meta)
num_accepted += 1
else:
num_rejected += 1

filtered_data_file_name = "stackoverflow"+filter_meta.file_suffix
new_archive.commit(filtered_data_file_name)
print("##### Stats #####")
print(f"num_accepted: {num_accepted}, num_rejected: {num_rejected}")