Skip to content

Commit

Permalink
Add notebooks to generate and process the dataset of PR review comments
Browse files Browse the repository at this point in the history
Signed-off-by: Waren Long <waren@sourced.tech>
  • Loading branch information
Waren Long committed Mar 25, 2019
1 parent 554aaba commit 3813272
Show file tree
Hide file tree
Showing 4 changed files with 500 additions and 3 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ This repository contains all the needed tools and scripts to reproduce the datas

- [PR review comments](ReviewComments)
- Size: 1.6GB
- Description: 25.3 million PR review comments from GitHub since January 2015 till December 2018.
- Description: 25.3 million GitHub PR review comments since January 2015 till December 2018.

## Contributions

Expand Down
243 changes: 243 additions & 0 deletions ReviewComments/PR_review_comments_generation.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,243 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import csv\n",
"import gzip\n",
"import json\n",
"import os\n",
"import pandas as pd\n",
"import sys"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"DATA_DIR = \"/home/waren/sourced/data/GH_archive/\"\n",
"OUTPUT_DIR = \"/home/waren/sourced/data/datasets\""
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"--2019-03-25 15:46:22-- http://data.gharchive.org/2015-01-01-0.json.gz\n",
"Resolving data.gharchive.org (data.gharchive.org)... 104.27.165.156, 104.27.164.156, 2606:4700:30::681b:a59c, ...\n",
"Connecting to data.gharchive.org (data.gharchive.org)|104.27.165.156|:80... connected.\n",
"HTTP request sent, awaiting response... 200 OK\n",
"Length: 2744253 (2.6M) [application/gzip]\n",
"Saving to: ‘/home/waren/sourced/data/GH_archive/2015-01-01-0.json.gz.1’\n",
"\n",
"2015-01-01-0.json.g 100%[===================>] 2.62M 8.37MB/s in 0.3s \n",
"\n",
"2019-03-25 15:46:23 (8.37 MB/s) - ‘/home/waren/sourced/data/GH_archive/2015-01-01-0.json.gz.1’ saved [2744253/2744253]\n",
"\n"
]
}
],
"source": [
"# approx 500 GB on disk\n",
"#!wget http://data.gharchive.org/{2015..2018}-{01..12}-{01..31}-{0..23}.json.gz\n",
"!wget http://data.gharchive.org/2015-01-01-0.json.gz -P $DATA_DIR"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"0\r",
"1\r"
]
}
],
"source": [
"COLUMNS = [\"COMMENT_ID\", \"COMMIT_ID\", \"URL\", \"AUTHOR\", \"CREATED_AT\", \"BODY\"]\n",
"PATH_TO_CSV = os.path.join(OUTPUT_DIR, \"PR_comments.csv\")\n",
"for root, d, files in os.walk(DATA_DIR):\n",
" for i, f in enumerate(files):\n",
" comments_list = []\n",
" comments = pd.DataFrame(columns = COLUMNS)\n",
" sys.stderr.write(\"%d\\r\" % i)\n",
" path = os.path.join(root, f)\n",
" if path.endswith(\".gz\"):\n",
" try:\n",
" for line in gzip.open(path).readlines():\n",
" event = json.loads(line)\n",
" if event[\"type\"] == \"PullRequestReviewCommentEvent\":\n",
" comments_list.append([event[\"payload\"][\"comment\"][\"id\"],\n",
" event[\"payload\"][\"comment\"][\"commit_id\"],\n",
" event[\"payload\"][\"comment\"][\"html_url\"],\n",
" event[\"payload\"][\"comment\"][\"user\"][\"login\"],\n",
" event[\"payload\"][\"comment\"][\"created_at\"],\n",
" event[\"payload\"][\"comment\"][\"body\"]])\n",
" except EOFError:\n",
" continue\n",
" df = pd.DataFrame(comments_list, columns=COLUMNS)\n",
" comments = comments.append(df, ignore_index=True)\n",
" comments.to_csv(PATH_TO_CSV, mode=\"a\", header=(not os.path.exists(PATH_TO_CSV)), index=False)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>COMMENT_ID</th>\n",
" <th>COMMIT_ID</th>\n",
" <th>URL</th>\n",
" <th>AUTHOR</th>\n",
" <th>CREATED_AT</th>\n",
" <th>BODY</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>22396772</td>\n",
" <td>47c1cd384b3deb0427e7e546e8d6b4346a65d816</td>\n",
" <td>https://github.com/selfhub/selfhub/pull/124#di...</td>\n",
" <td>zindlerb</td>\n",
" <td>2015-01-01T00:00:08Z</td>\n",
" <td>:sparkles: :pray: :sparkles:</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>22396775</td>\n",
" <td>a12155068a4feb03bf6b149b60830154ff52db0f</td>\n",
" <td>https://github.com/tsuru/tsuru/pull/1044#discu...</td>\n",
" <td>msabramo</td>\n",
" <td>2015-01-01T00:00:53Z</td>\n",
" <td>&gt; Can you remove this empty line?\\r\\n\\r\\nDone.</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>22396779</td>\n",
" <td>4c31e9b700de04c01f07c3f7d12afccbbf47237e</td>\n",
" <td>https://github.com/selfhub/selfhub/pull/125#di...</td>\n",
" <td>zindlerb</td>\n",
" <td>2015-01-01T00:01:44Z</td>\n",
" <td>_.reduce would be more elegant, but your call</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>22396780</td>\n",
" <td>2a0664c165aa3d1a4fec06d720d4ba393fefc7ae</td>\n",
" <td>https://github.com/emberjs/data/pull/2649#disc...</td>\n",
" <td>rwjblue</td>\n",
" <td>2015-01-01T00:01:54Z</td>\n",
" <td>Tiny nit-pick: can you name this something mea...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>22396794</td>\n",
" <td>c431d49f3c1751d1c32ec1c122a7a1eeb06774a6</td>\n",
" <td>https://github.com/edx/edx-platform/pull/6399#...</td>\n",
" <td>bradenmacdonald</td>\n",
" <td>2015-01-01T00:02:33Z</td>\n",
" <td>Yeah, I will just remove that. It is always True.</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" COMMENT_ID COMMIT_ID \\\n",
"0 22396772 47c1cd384b3deb0427e7e546e8d6b4346a65d816 \n",
"1 22396775 a12155068a4feb03bf6b149b60830154ff52db0f \n",
"2 22396779 4c31e9b700de04c01f07c3f7d12afccbbf47237e \n",
"3 22396780 2a0664c165aa3d1a4fec06d720d4ba393fefc7ae \n",
"4 22396794 c431d49f3c1751d1c32ec1c122a7a1eeb06774a6 \n",
"\n",
" URL AUTHOR \\\n",
"0 https://github.com/selfhub/selfhub/pull/124#di... zindlerb \n",
"1 https://github.com/tsuru/tsuru/pull/1044#discu... msabramo \n",
"2 https://github.com/selfhub/selfhub/pull/125#di... zindlerb \n",
"3 https://github.com/emberjs/data/pull/2649#disc... rwjblue \n",
"4 https://github.com/edx/edx-platform/pull/6399#... bradenmacdonald \n",
"\n",
" CREATED_AT BODY \n",
"0 2015-01-01T00:00:08Z :sparkles: :pray: :sparkles: \n",
"1 2015-01-01T00:00:53Z > Can you remove this empty line?\\r\\n\\r\\nDone. \n",
"2 2015-01-01T00:01:44Z _.reduce would be more elegant, but your call \n",
"3 2015-01-01T00:01:54Z Tiny nit-pick: can you name this something mea... \n",
"4 2015-01-01T00:02:33Z Yeah, I will just remove that. It is always True. "
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"comments = pd.read_csv(PATH_TO_CSV)\n",
"comments.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.7"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Loading

0 comments on commit 3813272

Please sign in to comment.