-
Notifications
You must be signed in to change notification settings - Fork 82
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add notebooks to generate and process the dataset of PR review comments
Signed-off-by: Waren Long <waren@sourced.tech>
- Loading branch information
Waren Long
committed
Mar 25, 2019
1 parent
554aaba
commit 3813272
Showing
4 changed files
with
500 additions
and
3 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,243 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 1, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"import csv\n", | ||
"import gzip\n", | ||
"import json\n", | ||
"import os\n", | ||
"import pandas as pd\n", | ||
"import sys" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 2, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"DATA_DIR = \"/home/waren/sourced/data/GH_archive/\"\n", | ||
"OUTPUT_DIR = \"/home/waren/sourced/data/datasets\"" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 3, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"--2019-03-25 15:46:22-- http://data.gharchive.org/2015-01-01-0.json.gz\n", | ||
"Resolving data.gharchive.org (data.gharchive.org)... 104.27.165.156, 104.27.164.156, 2606:4700:30::681b:a59c, ...\n", | ||
"Connecting to data.gharchive.org (data.gharchive.org)|104.27.165.156|:80... connected.\n", | ||
"HTTP request sent, awaiting response... 200 OK\n", | ||
"Length: 2744253 (2.6M) [application/gzip]\n", | ||
"Saving to: ‘/home/waren/sourced/data/GH_archive/2015-01-01-0.json.gz.1’\n", | ||
"\n", | ||
"2015-01-01-0.json.g 100%[===================>] 2.62M 8.37MB/s in 0.3s \n", | ||
"\n", | ||
"2019-03-25 15:46:23 (8.37 MB/s) - ‘/home/waren/sourced/data/GH_archive/2015-01-01-0.json.gz.1’ saved [2744253/2744253]\n", | ||
"\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"# approx 500 GB on disk\n", | ||
"#!wget http://data.gharchive.org/{2015..2018}-{01..12}-{01..31}-{0..23}.json.gz\n", | ||
"!wget http://data.gharchive.org/2015-01-01-0.json.gz -P $DATA_DIR" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 4, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stderr", | ||
"output_type": "stream", | ||
"text": [ | ||
"0\r", | ||
"1\r" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"COLUMNS = [\"COMMENT_ID\", \"COMMIT_ID\", \"URL\", \"AUTHOR\", \"CREATED_AT\", \"BODY\"]\n", | ||
"PATH_TO_CSV = os.path.join(OUTPUT_DIR, \"PR_comments.csv\")\n", | ||
"for root, d, files in os.walk(DATA_DIR):\n", | ||
" for i, f in enumerate(files):\n", | ||
" comments_list = []\n", | ||
" comments = pd.DataFrame(columns = COLUMNS)\n", | ||
" sys.stderr.write(\"%d\\r\" % i)\n", | ||
" path = os.path.join(root, f)\n", | ||
" if path.endswith(\".gz\"):\n", | ||
" try:\n", | ||
" for line in gzip.open(path).readlines():\n", | ||
" event = json.loads(line)\n", | ||
" if event[\"type\"] == \"PullRequestReviewCommentEvent\":\n", | ||
" comments_list.append([event[\"payload\"][\"comment\"][\"id\"],\n", | ||
" event[\"payload\"][\"comment\"][\"commit_id\"],\n", | ||
" event[\"payload\"][\"comment\"][\"html_url\"],\n", | ||
" event[\"payload\"][\"comment\"][\"user\"][\"login\"],\n", | ||
" event[\"payload\"][\"comment\"][\"created_at\"],\n", | ||
" event[\"payload\"][\"comment\"][\"body\"]])\n", | ||
" except EOFError:\n", | ||
" continue\n", | ||
" df = pd.DataFrame(comments_list, columns=COLUMNS)\n", | ||
" comments = comments.append(df, ignore_index=True)\n", | ||
" comments.to_csv(PATH_TO_CSV, mode=\"a\", header=(not os.path.exists(PATH_TO_CSV)), index=False)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 5, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/html": [ | ||
"<div>\n", | ||
"<style scoped>\n", | ||
" .dataframe tbody tr th:only-of-type {\n", | ||
" vertical-align: middle;\n", | ||
" }\n", | ||
"\n", | ||
" .dataframe tbody tr th {\n", | ||
" vertical-align: top;\n", | ||
" }\n", | ||
"\n", | ||
" .dataframe thead th {\n", | ||
" text-align: right;\n", | ||
" }\n", | ||
"</style>\n", | ||
"<table border=\"1\" class=\"dataframe\">\n", | ||
" <thead>\n", | ||
" <tr style=\"text-align: right;\">\n", | ||
" <th></th>\n", | ||
" <th>COMMENT_ID</th>\n", | ||
" <th>COMMIT_ID</th>\n", | ||
" <th>URL</th>\n", | ||
" <th>AUTHOR</th>\n", | ||
" <th>CREATED_AT</th>\n", | ||
" <th>BODY</th>\n", | ||
" </tr>\n", | ||
" </thead>\n", | ||
" <tbody>\n", | ||
" <tr>\n", | ||
" <th>0</th>\n", | ||
" <td>22396772</td>\n", | ||
" <td>47c1cd384b3deb0427e7e546e8d6b4346a65d816</td>\n", | ||
" <td>https://github.com/selfhub/selfhub/pull/124#di...</td>\n", | ||
" <td>zindlerb</td>\n", | ||
" <td>2015-01-01T00:00:08Z</td>\n", | ||
" <td>:sparkles: :pray: :sparkles:</td>\n", | ||
" </tr>\n", | ||
" <tr>\n", | ||
" <th>1</th>\n", | ||
" <td>22396775</td>\n", | ||
" <td>a12155068a4feb03bf6b149b60830154ff52db0f</td>\n", | ||
" <td>https://github.com/tsuru/tsuru/pull/1044#discu...</td>\n", | ||
" <td>msabramo</td>\n", | ||
" <td>2015-01-01T00:00:53Z</td>\n", | ||
" <td>> Can you remove this empty line?\\r\\n\\r\\nDone.</td>\n", | ||
" </tr>\n", | ||
" <tr>\n", | ||
" <th>2</th>\n", | ||
" <td>22396779</td>\n", | ||
" <td>4c31e9b700de04c01f07c3f7d12afccbbf47237e</td>\n", | ||
" <td>https://github.com/selfhub/selfhub/pull/125#di...</td>\n", | ||
" <td>zindlerb</td>\n", | ||
" <td>2015-01-01T00:01:44Z</td>\n", | ||
" <td>_.reduce would be more elegant, but your call</td>\n", | ||
" </tr>\n", | ||
" <tr>\n", | ||
" <th>3</th>\n", | ||
" <td>22396780</td>\n", | ||
" <td>2a0664c165aa3d1a4fec06d720d4ba393fefc7ae</td>\n", | ||
" <td>https://github.com/emberjs/data/pull/2649#disc...</td>\n", | ||
" <td>rwjblue</td>\n", | ||
" <td>2015-01-01T00:01:54Z</td>\n", | ||
" <td>Tiny nit-pick: can you name this something mea...</td>\n", | ||
" </tr>\n", | ||
" <tr>\n", | ||
" <th>4</th>\n", | ||
" <td>22396794</td>\n", | ||
" <td>c431d49f3c1751d1c32ec1c122a7a1eeb06774a6</td>\n", | ||
" <td>https://github.com/edx/edx-platform/pull/6399#...</td>\n", | ||
" <td>bradenmacdonald</td>\n", | ||
" <td>2015-01-01T00:02:33Z</td>\n", | ||
" <td>Yeah, I will just remove that. It is always True.</td>\n", | ||
" </tr>\n", | ||
" </tbody>\n", | ||
"</table>\n", | ||
"</div>" | ||
], | ||
"text/plain": [ | ||
" COMMENT_ID COMMIT_ID \\\n", | ||
"0 22396772 47c1cd384b3deb0427e7e546e8d6b4346a65d816 \n", | ||
"1 22396775 a12155068a4feb03bf6b149b60830154ff52db0f \n", | ||
"2 22396779 4c31e9b700de04c01f07c3f7d12afccbbf47237e \n", | ||
"3 22396780 2a0664c165aa3d1a4fec06d720d4ba393fefc7ae \n", | ||
"4 22396794 c431d49f3c1751d1c32ec1c122a7a1eeb06774a6 \n", | ||
"\n", | ||
" URL AUTHOR \\\n", | ||
"0 https://github.com/selfhub/selfhub/pull/124#di... zindlerb \n", | ||
"1 https://github.com/tsuru/tsuru/pull/1044#discu... msabramo \n", | ||
"2 https://github.com/selfhub/selfhub/pull/125#di... zindlerb \n", | ||
"3 https://github.com/emberjs/data/pull/2649#disc... rwjblue \n", | ||
"4 https://github.com/edx/edx-platform/pull/6399#... bradenmacdonald \n", | ||
"\n", | ||
" CREATED_AT BODY \n", | ||
"0 2015-01-01T00:00:08Z :sparkles: :pray: :sparkles: \n", | ||
"1 2015-01-01T00:00:53Z > Can you remove this empty line?\\r\\n\\r\\nDone. \n", | ||
"2 2015-01-01T00:01:44Z _.reduce would be more elegant, but your call \n", | ||
"3 2015-01-01T00:01:54Z Tiny nit-pick: can you name this something mea... \n", | ||
"4 2015-01-01T00:02:33Z Yeah, I will just remove that. It is always True. " | ||
] | ||
}, | ||
"execution_count": 5, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"comments = pd.read_csv(PATH_TO_CSV)\n", | ||
"comments.head()" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.6.7" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 2 | ||
} |
Oops, something went wrong.