Skip to content

Commit

Permalink
Issue #20: Reorganizing the project structure
Browse files Browse the repository at this point in the history
  • Loading branch information
leomurta committed Sep 7, 2019
1 parent 82102d0 commit c11d9a7
Show file tree
Hide file tree
Showing 26 changed files with 178 additions and 148 deletions.
1 change: 1 addition & 0 deletions .idea/db-mining.iml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

14 changes: 14 additions & 0 deletions .idea/webResources.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 0 additions & 1 deletion _config.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@

title: DB Mining
description: On the usage of Databases in Open Source Projects
theme: jekyll-theme-cayman
Empty file removed collection/__init__.py
Empty file.
91 changes: 0 additions & 91 deletions extraction/extract.py

This file was deleted.

2 changes: 0 additions & 2 deletions extraction/patterns.txt

This file was deleted.

File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
14 changes: 12 additions & 2 deletions collection/analyze.ipynb → src/analyze.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,9 @@
}
],
"source": [
"from util import ANNOTATED_FILE\n",
"#reads projects from Excel file\n",
"df = pd.read_excel('../docs/annotated.xlsx', keep_default_na=False)\n",
"df = pd.read_excel(ANNOTATED_FILE, keep_default_na=False)\n",
"len(df)"
]
},
Expand Down Expand Up @@ -6990,8 +6991,17 @@
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
},
"pycharm": {
"stem_cell": {
"cell_type": "raw",
"source": [],
"metadata": {
"collapsed": false
}
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}
}
13 changes: 6 additions & 7 deletions collection/collect.py → src/collect.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,21 +6,20 @@
import pandas as pd
import requests

from util import PROJECTS_FILE

# Minimum number of stars
MIN_STARS = 1000

# Maximum number of stars (None for no maximum limit)
MAX_STARS = None

# File to load/save the data
FILE = '../resources/projects.xlsx'


def load():
repositories = dict()
print(f'Loading repositories from {FILE}...', end=' ')
print(f'Loading repositories from {PROJECTS_FILE}...', end=' ')
try:
df = pd.read_excel(FILE, keep_default_na=False)
df = pd.read_excel(PROJECTS_FILE, keep_default_na=False)
for i, row in df.iterrows():
repo = row.to_dict()
repositories[repo['owner'] + '/' + repo['name']] = repo
Expand All @@ -32,14 +31,14 @@ def load():

def save(repositories):
repositories.update(load())
print(f'Saving repositories to {FILE}...', end=' ')
print(f'Saving repositories to {PROJECTS_FILE}...', end=' ')
df = pd.DataFrame(repositories.values())
df.loc[df.description.str.contains('(?i)\\bmirror\\b',
na=False), 'isMirror'] = True # Check 'mirror' in the description
df.createdAt = pd.to_datetime(df.createdAt, infer_datetime_format=True).dt.tz_localize(None)
df.pushedAt = pd.to_datetime(df.pushedAt, infer_datetime_format=True).dt.tz_localize(None)
df.sort_values('stargazers', ascending=False, inplace=True)
df.to_excel(FILE, index=False)
df.to_excel(PROJECTS_FILE, index=False)
print('Done!')


Expand Down
18 changes: 8 additions & 10 deletions utils/database.py → src/database.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
import os.path
import sqlite3

DB_FILENAME = '../resources/db-mining.db'
DB_SCRIPT = '../resources/create-database.sql'
from util import DATABASE_FILE, SCHEMA_FILE

db_conn = None # Connection to the database
database_types_dict = None # dictionary with database types
Expand Down Expand Up @@ -148,20 +147,18 @@ def load_existing_data():
def connect():
global db_conn

db_path = os.path.abspath(DB_FILENAME)
new_db = not os.path.exists(db_path)
db_conn = sqlite3.connect(db_path)
new_db = not os.path.exists(DATABASE_FILE)
db_conn = sqlite3.connect(DATABASE_FILE)
if new_db:
print('Creating Database...')
script_path = os.path.abspath(DB_SCRIPT)
f = open(script_path, 'r')
sqlFile = f.read()
f = open(SCHEMA_FILE, 'r')
sql_file = f.read()
f.close()

# all SQL commands (split on ';')
sqlCommands = sqlFile.split(';')
sql_commands = sql_file.split(';')
# Execute every command from the input file
for command in sqlCommands:
for command in sql_commands:
db_conn.execute(command)
db_conn.commit()

Expand Down Expand Up @@ -464,6 +461,7 @@ def delete_project_by_id(project_id):
if project_key in projects_set:
projects_set.remove(project_key) # removes project from set of processed projects


def delete_project_version_by_id(project_version_id):
global projects_versions_dict
sql = 'DELETE FROM project_version WHERE project_version_id = ?'
Expand Down
12 changes: 4 additions & 8 deletions collection/download.py → src/download.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,16 +3,12 @@

import pandas as pd

# File to load the data with repositories
REPO_FILE = '../resources/annotated.xlsx'

# Dir to clone/update repositories
REPO_DIR = os.path.abspath('../repos')
from util import ANNOTATED_FILE, REPOS_DIR


def main():
print(f'Loading repositories from {REPO_FILE}.')
df = pd.read_excel(REPO_FILE, keep_default_na=False)
print(f'Loading repositories from {ANNOTATED_FILE}.')
df = pd.read_excel(ANNOTATED_FILE, keep_default_na=False)

print('Removing discarded repositories.')
df = df[df.discardReason == '']
Expand All @@ -23,7 +19,7 @@ def main():
for i, row in df.iterrows():
print(f'Processing repository {row["owner"]}/{row["name"]}.')
source = f'https://github.com/{row["owner"]}/{row["name"]}.git'
target = REPO_DIR + os.sep + row['owner'] + os.sep + row['name']
target = REPOS_DIR + os.sep + row['owner'] + os.sep + row['name']

if os.path.isdir(target):
os.chdir(target)
Expand Down
83 changes: 83 additions & 0 deletions src/extract.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
import os
import subprocess
from collections import namedtuple

import pandas as pd

from util import ANNOTATED_FILE, REPOS_DIR, HEURISTICS_DIR
from util import bold, green, red

# Base `git grep` invocation, as an argument list for subprocess.run.
# The list deliberately ends with '-f': the caller must append the path of the
# pattern file as the next argument.
GREP_COMMAND = [
'git',
'grep',
# Do not match the patterns in binary files.
'-I',
# Show 5 lines of context before and after each match.
'--context=5',
# Print an empty line between matches from different files.
'--break',
# Print the file name once above its matches instead of on every line.
'--heading',
# Prefix each output line with its line number.
'--line-number',
# Emit color escape codes even though the output is captured, not a terminal.
'--color=always',
# Interpret the patterns as Perl-compatible regular expressions.
'--perl-regexp',
# Read the patterns from the file given as the next argument, one per line.
'-f'
]


def load_heuristics(directory):
    """Collect the heuristic files stored under ``directory``.

    The directory is expected to hold one sub-directory per heuristic type,
    each containing one file per label::

        directory/<type>/<label>.<extension>

    Entries that do not fit this layout (files placed directly under
    ``directory`` and sub-directories nested inside a type directory) are
    ignored.

    Args:
        directory: Path of the root heuristics directory.

    Returns:
        A list of ``Heuristic(type, label, file)`` named tuples, where ``type``
        is the sub-directory name, ``label`` is the file name without its last
        extension and ``file`` is the path of the file. The list is ordered by
        type name and then by file name.
    """
    Heuristic = namedtuple('Heuristic', ['type', 'label', 'file'])
    heuristics = []

    # os.scandir yields entries in arbitrary order; sorting makes the result
    # (and everything driven by it) reproducible across runs and machines.
    # The `with` blocks release the directory handles deterministically.
    with os.scandir(directory) as type_entries:
        type_dirs = sorted((entry for entry in type_entries if entry.is_dir()),
                           key=lambda entry: entry.name)

    for type_dir in type_dirs:
        with os.scandir(type_dir.path) as label_entries:
            label_files = sorted((entry for entry in label_entries if entry.is_file()),
                                 key=lambda entry: entry.name)
        for label_file in label_files:
            label = os.path.splitext(label_file.name)[0]
            heuristics.append(Heuristic(type_dir.name, label, label_file.path))

    return heuristics


def main():
    """Run every heuristic over every non-discarded repository with ``git grep``.

    Loads the repositories listed in ``ANNOTATED_FILE`` (keeping only rows
    whose ``discardReason`` is empty) and the heuristics found under
    ``HEURISTICS_DIR``. For each heuristic/repository pair it runs
    ``GREP_COMMAND`` plus the heuristic file inside
    ``REPOS_DIR/<owner>/<name>``, printing the progress as it goes.
    Repositories whose directory does not exist are reported as not found.
    Persisting the results is still pending (see the TODO notes).

    Raises:
        SystemExit: With status 1 as soon as ``git grep`` writes anything to
            standard error.
    """
    print(f'Loading repositories from {ANNOTATED_FILE}.')
    repos_df = pd.read_excel(ANNOTATED_FILE, keep_default_na=False)
    repos_df = repos_df[repos_df.discardReason == ''].reset_index(drop=True)

    print(f'Loading heuristics from {HEURISTICS_DIR}.')
    heuristics = load_heuristics(HEURISTICS_DIR)

    print(f'Processing {len(heuristics)} heuristics over {len(repos_df)} repositories.')
    total_steps = len(heuristics) * len(repos_df)
    for i, heuristic in enumerate(heuristics):

        # TODO: check if the heuristic already exists in the DB.

        # git runs inside each repository (see cwd below), so the pattern file
        # must be addressed by an absolute path.
        patterns_file = os.path.abspath(heuristic.file)

        print(f'Processing heuristic for {bold(heuristic.label)}.')
        # The index was reset above, so j counts rows from 0.
        for j, repo in repos_df.iterrows():
            progress = '{:.2%}'.format((i * len(repos_df) + j) / total_steps)
            print(f'\t[{progress}] Repository {repo["owner"]}/{repo["name"]}:', end=' ')

            # TODO: check if the heuristic has already been executed over the project.

            repo_dir = REPOS_DIR + os.sep + repo['owner'] + os.sep + repo['name']
            if not os.path.isdir(repo_dir):
                print(red('not found.'))
                continue

            # Use cwd= instead of os.chdir so the working directory of this
            # process never changes; relative paths keep resolving against the
            # directory the script was started from on every iteration.
            process = subprocess.run(GREP_COMMAND + [patterns_file], cwd=repo_dir,
                                     text=True, capture_output=True)

            if process.stderr:
                print(red('error.'))
                print(process.stderr)
                # exit() is injected by the site module for interactive use and
                # may be missing; raising SystemExit is the program-safe form.
                raise SystemExit(1)

            # TODO: save process.stdout

            print(green('ok.'))

    print('Deleting missing heuristics...')
    # TODO: remove from the DB the heuristics that were removed from the directory.

    print("\nFinished.")


if __name__ == "__main__":
main()
16 changes: 13 additions & 3 deletions collection/filter.ipynb → src/filter.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,9 @@
}
],
"source": [
"from util import PROJECTS_FILE, FILTERED_FILE\n",
"#reads projects from Excel file\n",
"df = pd.read_excel('../docs/projects.xlsx', keep_default_na=False)\n",
"df = pd.read_excel(PROJECTS_FILE, keep_default_na=False)\n",
"len(df)"
]
},
Expand Down Expand Up @@ -2136,7 +2137,7 @@
"#removes timezone from dates, since Excel does not know how to handle that\n",
"df.createdAt = pd.to_datetime(df.createdAt).dt.tz_localize(None) \n",
"df.pushedAt = pd.to_datetime(df.pushedAt).dt.tz_localize(None)\n",
"df.to_excel('../docs/filtered.xlsx', index=False)"
"df.to_excel(FILTERED_FILE, index=False)"
]
},
{
Expand Down Expand Up @@ -2997,8 +2998,17 @@
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
},
"pycharm": {
"stem_cell": {
"cell_type": "raw",
"source": [],
"metadata": {
"collapsed": false
}
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}
}
Loading

0 comments on commit c11d9a7

Please sign in to comment.