
fix links for stackoverflow archive #2


Open · wants to merge 6 commits into master
5 changes: 5 additions & 0 deletions .gitignore
@@ -0,0 +1,5 @@
__pycache__
dumps
out*
logs*
test*
2 changes: 2 additions & 0 deletions README.md
@@ -1,5 +1,6 @@
# stackexchange_dataset
A python tool for downloading & processing the [stackexchange data dumps](https://archive.org/details/stackexchange) into a text dataset for Language Models.
+Documentation for the dump's database schema can be found [here](https://meta.stackexchange.com/questions/2677/database-schema-documentation-for-the-public-data-dump-and-sede/2678#2678).

Download the whole processed dataset [here](https://eaidata.bmk.sh/data/stackexchange_dataset.tar)

@@ -10,6 +11,7 @@ cd stackexchange_dataset
pip install -r requirements.txt
```
# Usage
+The default output format for parsed data is .zip. To create an lm_dataformat dataset instead, pass the option `--out_format=lm_dataformat` to the commands below.

To download *every* stackexchange dump & parse to text, simply run
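The command itself is collapsed in this diff view; judging from the argparse options added in main.py further down, the invocation is presumably:

```
python3 main.py --names all
# or, to produce the lm_dataformat output described above:
python3 main.py --names all --out_format=lm_dataformat
```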

14 changes: 6 additions & 8 deletions downloader.py
@@ -3,7 +3,7 @@
from utils import *
import py7zr


+curr_dir = os.path.dirname(__file__)
class Stack_Exchange_Downloader():

def __init__(self, name):
@@ -22,18 +22,18 @@ def parse_sitesmap(self, sitesmap):
site_name = url.replace(".com", "").replace(".net", "")
download_link = "https://archive.org/download/stackexchange/" + url + ".7z"
if url == "stackoverflow.com":
-download_link = "https://archive.org/download/stackexchange/stackoverflow.com-Posts.7z"
+download_link = "https://archive.org/download/stackexchange/Stackoverflow.com-Posts.7z"
self.sites[site_name] = {"url" : url, "download" : download_link}

def download(self):
if self.name == "all":
for k in self.sites:
command = "wget {} -P dumps".format(self.sites[k]["download"])
command = "wget {} -P {}/dumps".format(curr_dir, self.sites[k]["download"])
print(command)
if os.system(command):
print('Download for {} failed!'.format(k))
else:
command = "wget {} -P dumps".format(self.sites[self.name]["download"])
command = "wget {} -P {}/dumps".format(curr_dir, self.sites[self.name]["download"])
print(command)
if os.system(command):
print('Download for {} failed!'.format(self.name))
@@ -45,8 +45,7 @@ def extract(self):
# , mode='r'))
# archive.extractall()
# archive.close()
command = "py7zr x dumps/{} dumps/{}".format(self.sites[k]["download"].replace("https://archive.org/download/stackexchange/", ""),
k)
command = "py7zr x {}/dumps/{} {}/dumps/{}".format(curr_dir, self.sites[k]["download"].replace("https://archive.org/download/stackexchange/", ""), curr_dir, k)
print(command)
if os.system(command):
print('Extraction for {} failed!'.format(k))
@@ -56,8 +55,7 @@ def extract(self):
# , mode='r'))
# archive.extractall()
# archive.close()
command = "py7zr x dumps/{} dumps/{}".format(self.sites[self.name]["download"].replace("https://archive.org/download/stackexchange/", ""),
self.name)
command = "py7zr x {}/dumps/{} {}/dumps/{}".format(curr_dir, self.sites[self.name]["download"].replace("https://archive.org/download/stackexchange/", ""), curr_dir, self.name)
print(command)
if os.system(command):
print('Extraction for {} failed!'.format(self.name))
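The thrust of this file's changes is to anchor the dumps directory at the script's own location rather than the caller's working directory. A minimal sketch of that pattern (wget_command is an illustrative helper, not part of the PR; note the format-argument order, URL first, directory second):

```python
import os

# directory containing this script, so paths resolve from any cwd
curr_dir = os.path.dirname(os.path.abspath(__file__))

def wget_command(download_url):
    # e.g. wget https://archive.org/download/stackexchange/<site>.7z -P <repo>/dumps
    dumps_dir = os.path.join(curr_dir, "dumps")
    return "wget {} -P {}".format(download_url, dumps_dir)
```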
89 changes: 56 additions & 33 deletions main.py
@@ -7,59 +7,71 @@
from itertools import repeat
from lm_dataformat import Archive
import zipfile
+import os
+import json


+curr_dir = os.path.dirname(__file__)
def download_and_process_single(name, out_format, min_score, max_responses):
try:
name = name.strip().lower()
os.makedirs("dumps", exist_ok=True)
os.makedirs("{}/dumps".format(curr_dir), exist_ok=True)
s = Stack_Exchange_Downloader(name)
path_to_xml = "dumps/{}/Posts.xml".format(name)
# *.7z files are downloaded from "https://archive.org/download/stackexchange/
if name != "stackoverflow":
path_to_7z = "dumps/{}.7z".format(s.sites[name]["url"])
path_to_7z = "{}/dumps/{}.7z".format(curr_dir,s.sites[name]["url"])
else:
path_to_7z = "dumps/stackoverflow.com-Posts.7z"
out_folder = "out".format(name)
os.makedirs(out_folder, exist_ok=True)
path_to_7z = "{}/dumps/Stackoverflow.com-Posts.7z".format(curr_dir)
if not os.path.isfile(path_to_7z):
# download 7z if it's not downloaded already
s.download()

+# *.xml files are extracted from *.7z files using py7zr
+path_to_xml = "{}/dumps/{}/Posts.xml".format(curr_dir, name)
if not os.path.isfile(path_to_xml):
# extract 7z if it's not extracted already
s.extract()

out_folder = "{}/out".format(curr_dir)
# out_folder = "{}/../../../suriyagwu/stackexchange/all".format(curr_dir)
os.makedirs(out_folder, exist_ok=True)
os.makedirs("{}/samples".format(out_folder), exist_ok=True)
os.makedirs("{}/misc".format(out_folder), exist_ok=True)
if out_format == "lm_dataformat":
archiver = Archive(out_folder)
elif out_format == "zip":
archiver = zipfile.ZipFile('{}/{}.zip'.format(out_folder, name), 'a')
else:
archiver = None
-qa = QA_Pairer(path_to_xml, name=name, out_format=out_format, archiver=archiver, min_score=min_score, max_responses=max_responses)
+qa = QA_Pairer(path_to_xml, name=name, out_folder=out_folder, out_format=out_format, archiver=archiver, min_score=min_score, max_responses=max_responses)
qa.main()
if out_format == "lm_dataformat":
archiver.commit(name)
elif out_format == "zip":
archiver.close()
-try:
-    os.remove(path_to_7z)
-except FileNotFoundError:
-    print('ERROR: FileNotFoundError: File {} not found'.format(s.sites[name]["url"]))
-filelist = [f for f in os.listdir("dumps/{}".format(name)) if f.endswith(".xml")]
-for f in filelist:
-    os.remove(os.path.join("dumps/{}".format(name), f))

+# save qa.questions dictionary data to a file
+json.dump(qa.questions, open("{}/misc/{}_unprocessed_questions.json".format(out_folder, name), "w"), indent=4)

+# try:
+#     os.remove(path_to_7z)
+# except FileNotFoundError:
+#     print('ERROR: FileNotFoundError: File {} not found'.format(s.sites[name]["url"]))
+# filelist = [f for f in os.listdir("dumps/{}".format(name)) if f.endswith(".xml")]
+# for f in filelist:
+#     os.remove(os.path.join("dumps/{}".format(name), f))
except:
traceback.print_exc()


def main(args):
-names = args.names.split(',')
+names = args.names.split(',')
if names[0].strip().lower() == "all":
s = Stack_Exchange_Downloader("all")
names = []
for k in s.sites:
names.append(k)
-# bring stackoverflow to the front so it is always processed first, since it's the largest
if "stackoverflow" in names:
-names.insert(0, names.pop(names.index("stackoverflow")))
+print('Removing stackoverflow from the list of sites to process. Process it separately.')
+names.pop(names.index("stackoverflow"))
print('Downloading and processing stackexchange dumps for {}'.format(names))
# Download & Process
# init pool with as many CPUs as available
@@ -70,20 +82,31 @@ def main(args):

if __name__ == "__main__":
parser = argparse.ArgumentParser(
-description='CLI for stackexchange_dataset - A tool for downloading & processing stackexchange dumps in xml form to a raw '
-            'question-answer pair text dataset for Language Models')
-parser.add_argument('--names', help='names of stackexchanges to download, extract & parse, separated by commas. '
-                    'If "all", will download, extract & parse *every* stackoverflow site',
-                    default="3dprinting.stackexchange,3dprinting.meta.stackexchange",
-                    type=str)
-parser.add_argument('--out_format', help='format of out file - if you are processing everything this will need to be '
-                    'lm_dataformat, as you will run into number of files per directory limits.',
-                    default="zip",
-                    type=str)
-parser.add_argument('--min_score', help='minimum score of a response in order to be included in the dataset. Default 3.',
-                    type=int, default=3)
-parser.add_argument('--max_responses', help='maximum number of responses (sorted by score) to include for each question. '
-                    'Default 3.', type=int, default=3)
+description='CLI for stackexchange_dataset - A tool for downloading & processing stackexchange dumps in xml form to a raw question-answer pair text dataset for Language Models')
+parser.add_argument(
+    '--names',
+    help='names of stackexchanges to download, extract & parse, separated by commas. If "all", will download, extract & parse *every* stackoverflow site',
+    default="3dprinting.stackexchange,3dprinting.meta.stackexchange",
+    type=str
+)
+parser.add_argument(
+    '--out_format',
+    help='format of out file - if you are processing everything this will need to be lm_dataformat, as you will run into number of files per directory limits.',
+    default="zip",
+    type=str
+)
+parser.add_argument(
+    '--min_score',
+    help='minimum score of a response in order to be included in the dataset. Default 3.',
+    type=int,
+    default=3
+)
+parser.add_argument(
+    '--max_responses',
+    help='maximum number of responses (sorted by score) to include for each question. Default 100.',
+    type=int,
+    default=100
+)
args = parser.parse_args()
main(args)
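For orientation, the pool fan-out that the "init pool with as many CPUs as available" comment refers to is collapsed in this diff view; a sketch of what it presumably looks like, inferred from the itertools.repeat import (run_all is an illustrative name, not part of the PR):

```python
from itertools import repeat
from multiprocessing import Pool, cpu_count

def run_all(names, args):
    # one task per site; starmap unpacks each tuple into
    # download_and_process_single(name, out_format, min_score, max_responses)
    with Pool(cpu_count()) as pool:
        pool.starmap(download_and_process_single,
                     zip(names, repeat(args.out_format),
                         repeat(args.min_score), repeat(args.max_responses)))
```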

45 changes: 45 additions & 0 deletions main_filter.py
@@ -0,0 +1,45 @@
'''
Filters the stackoverflow dataset generated by main.py in lm_dataformat.
Edit filter_logic to filter on any field of a sample's meta dict.
'''
from lm_dataformat import Reader, Archive
import os
import glob

curr_dir = os.path.dirname(__file__)
out_folder = "{}/../../../suriyagwu/stackexchange/stackoverflow".format(curr_dir)
full_data_file = "{}/data_*stackoverflow.jsonl.zst".format(out_folder)

class FilterMeta:
    # currently implements an OR over the filter tags
def __init__(self):
self.filter_tags = ['python']
self.file_suffix = "_"+"_".join(self.filter_tags)

def filter_logic(self,meta):
for tag in self.filter_tags:
if tag in meta['tags']:
return True
return False

new_archive = Archive(out_folder)
filter_meta = FilterMeta()
num_accepted = 0
num_rejected = 0
for filename in glob.glob(full_data_file):
# create a reader object for the file
print("Filtering {}".format(filename))
rdr = Reader(filename)
# iterate over the samples in the file
for doc, meta in rdr.stream_data(get_meta=True):
if filter_meta.filter_logic(meta):
# add the sample to the new archive
new_archive.add_data(doc, meta)
num_accepted += 1
else:
num_rejected += 1

filtered_data_file_name = "stackoverflow"+filter_meta.file_suffix
new_archive.commit(filtered_data_file_name)
print("##### Stats #####")
print(f"num_accepted: {num_accepted}, num_rejected: {num_rejected}")