Issue #20: Reorganizing the project structure

gems-uff · Sep 7, 2019 · c11d9a7
1 parent 82102d0
commit c11d9a7
Show file tree

Hide file tree

Showing 26 changed files with 178 additions and 148 deletions.
diff --git a/.idea/db-mining.iml b/.idea/db-mining.iml
diff --git a/.idea/webResources.xml b/.idea/webResources.xml
diff --git a/_config.yml b/_config.yml
@@ -1,4 +1,3 @@
-
 title: DB Mining
 description: On the usage of Databases in Open Source Projects
 theme: jekyll-theme-cayman
diff --git a/collection/__init__.py b/collection/__init__.py
diff --git a/extraction/extract.py b/extraction/extract.py
diff --git a/extraction/patterns.txt b/extraction/patterns.txt
diff --git a/heuristics/database/Derby.txt b/resources/heuristics/database/Derby.txt
diff --git a/heuristics/database/H2.txt b/resources/heuristics/database/H2.txt
diff --git a/heuristics/database/HyperSQL.txt b/resources/heuristics/database/HyperSQL.txt
diff --git a/heuristics/database/MySQL.txt b/resources/heuristics/database/MySQL.txt
diff --git a/heuristics/database/Oracle.txt b/resources/heuristics/database/Oracle.txt
diff --git a/heuristics/database/PostgreSQL.txt b/resources/heuristics/database/PostgreSQL.txt
diff --git a/heuristics/implementation/jOOQ.txt b/resources/heuristics/implementation/jOOQ.txt
diff --git a/heuristics/query/SQL.txt b/resources/heuristics/query/SQL.txt
diff --git a/collection/analyze.ipynb b/src/analyze.ipynb
@@ -28,8 +28,9 @@
     }
    ],
    "source": [
+    "from util import ANNOTATED_FILE\n",
     "#reads projects from Excel file\n",
-    "df = pd.read_excel('../docs/annotated.xlsx', keep_default_na=False)\n",
+    "df = pd.read_excel(ANNOTATED_FILE, keep_default_na=False)\n",
     "len(df)"
    ]
   },
@@ -6990,8 +6991,17 @@
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
    "version": "3.7.3"
+  },
+  "pycharm": {
+   "stem_cell": {
+    "cell_type": "raw",
+    "source": [],
+    "metadata": {
+     "collapsed": false
+    }
+   }
   }
  },
  "nbformat": 4,
  "nbformat_minor": 2
-}
+}
diff --git a/collection/collect.py b/src/collect.py
@@ -6,21 +6,20 @@
 import pandas as pd
 import requests
 
+from util import PROJECTS_FILE
+
 # Minimum number of stars
 MIN_STARS = 1000
 
 # Maximum number of stars (None for no maximum limit)
 MAX_STARS = None
 
-# File to load/save the data
-FILE = '../resources/projects.xlsx'
-
 
 def load():
     repositories = dict()
-    print(f'Loading repositories from {FILE}...', end=' ')
+    print(f'Loading repositories from {PROJECTS_FILE}...', end=' ')
     try:
-        df = pd.read_excel(FILE, keep_default_na=False)
+        df = pd.read_excel(PROJECTS_FILE, keep_default_na=False)
         for i, row in df.iterrows():
             repo = row.to_dict()
             repositories[repo['owner'] + '/' + repo['name']] = repo
@@ -32,14 +31,14 @@ def load():
 
 def save(repositories):
     repositories.update(load())
-    print(f'Saving repositories to {FILE}...', end=' ')
+    print(f'Saving repositories to {PROJECTS_FILE}...', end=' ')
     df = pd.DataFrame(repositories.values())
     df.loc[df.description.str.contains('(?i)\\bmirror\\b',
                                        na=False), 'isMirror'] = True  # Check 'mirror' in the description
     df.createdAt = pd.to_datetime(df.createdAt, infer_datetime_format=True).dt.tz_localize(None)
     df.pushedAt = pd.to_datetime(df.pushedAt, infer_datetime_format=True).dt.tz_localize(None)
     df.sort_values('stargazers', ascending=False, inplace=True)
-    df.to_excel(FILE, index=False)
+    df.to_excel(PROJECTS_FILE, index=False)
     print('Done!')
 
 

diff --git a/utils/database.py b/src/database.py
@@ -1,8 +1,7 @@
 import os.path
 import sqlite3
 
-DB_FILENAME = '../resources/db-mining.db'
-DB_SCRIPT = '../resources/create-database.sql'
+from util import DATABASE_FILE, SCHEMA_FILE
 
 db_conn = None  # Connection to the database
 database_types_dict = None  # dictionary with database types
@@ -148,20 +147,18 @@ def load_existing_data():
 def connect():
     global db_conn
 
-    db_path = os.path.abspath(DB_FILENAME)
-    new_db = not os.path.exists(db_path)
-    db_conn = sqlite3.connect(db_path)
+    new_db = not os.path.exists(DATABASE_FILE)
+    db_conn = sqlite3.connect(DATABASE_FILE)
     if new_db:
         print('Creating Database...')
-        script_path = os.path.abspath(DB_SCRIPT)
-        f = open(script_path, 'r')
-        sqlFile = f.read()
+        f = open(SCHEMA_FILE, 'r')
+        sql_file = f.read()
         f.close()
 
         # all SQL commands (split on ';')
-        sqlCommands = sqlFile.split(';')
+        sql_commands = sql_file.split(';')
         # Execute every command from the input file
-        for command in sqlCommands:
+        for command in sql_commands:
             db_conn.execute(command)
         db_conn.commit()
 
@@ -464,6 +461,7 @@ def delete_project_by_id(project_id):
     if project_key in projects_set:
         projects_set.remove(project_key)  # removes project from set of processed projects
 
+
 def delete_project_version_by_id(project_version_id):
     global projects_versions_dict
     sql = 'DELETE FROM project_version WHERE project_version_id = ?'

diff --git a/collection/download.py b/src/download.py
@@ -3,16 +3,12 @@
 
 import pandas as pd
 
-# File to load the data with repositories
-REPO_FILE = '../resources/annotated.xlsx'
-
-# Dir to clone/update repositories
-REPO_DIR = os.path.abspath('../repos')
+from util import ANNOTATED_FILE, REPOS_DIR
 
 
 def main():
-    print(f'Loading repositories from {REPO_FILE}.')
-    df = pd.read_excel(REPO_FILE, keep_default_na=False)
+    print(f'Loading repositories from {ANNOTATED_FILE}.')
+    df = pd.read_excel(ANNOTATED_FILE, keep_default_na=False)
 
     print('Removing discarded repositories.')
     df = df[df.discardReason == '']
@@ -23,7 +19,7 @@ def main():
     for i, row in df.iterrows():
         print(f'Processing repository {row["owner"]}/{row["name"]}.')
         source = f'https://github.com/{row["owner"]}/{row["name"]}.git'
-        target = REPO_DIR + os.sep + row['owner'] + os.sep + row['name']
+        target = REPOS_DIR + os.sep + row['owner'] + os.sep + row['name']
 
         if os.path.isdir(target):
             os.chdir(target)

diff --git a/src/extract.py b/src/extract.py
@@ -0,0 +1,83 @@
+import os
+import subprocess
+from collections import namedtuple
+
+import pandas as pd
+
+from util import ANNOTATED_FILE, REPOS_DIR, HEURISTICS_DIR
+from util import bold, green, red
+
+# Git grep command
+GREP_COMMAND = [
+    'git',
+    'grep',
+    '-I',
+    '--context=5',
+    '--break',
+    '--heading',
+    '--line-number',
+    '--color=always',
+    '--perl-regexp',
+    '-f'
+]
+
+
+def load_heuristics(directory):
+    Heuristic = namedtuple('Heuristic', ['type', 'label', 'file'])
+    heuristics = list()
+    for label_type in os.scandir(directory):
+        if label_type.is_dir():
+            for label in os.scandir(label_type.path):
+                if label.is_file():
+                    heuristics.append(Heuristic(label_type.name, os.path.splitext(label.name)[0], label.path))
+    return heuristics
+
+
+def main():
+    print(f'Loading repositories from {ANNOTATED_FILE}.')
+    repos_df = pd.read_excel(ANNOTATED_FILE, keep_default_na=False)
+    repos_df = repos_df[repos_df.discardReason == ''].reset_index(drop=True)
+
+    print(f'Loading heuristics from {HEURISTICS_DIR}.')
+    heuristics = load_heuristics(HEURISTICS_DIR)
+
+    print(f'Processing {len(heuristics)} heuristics over {len(repos_df)} repositories.')
+    i = 0
+    for heuristic in heuristics:
+
+        # TODO: check if the heuristic already exists in the DB.
+
+        print(f'Processing heuristic for {bold(heuristic.label)}.')
+        for j, repo in repos_df.iterrows():
+            progress = '{:.2%}'.format((i * len(repos_df) + j) / (len(heuristics) * len(repos_df)))
+            print(f'\t[{progress}] Repository {repo["owner"]}/{repo["name"]}:', end=' ')
+
+            # TODO: check if the heuristic has already been executed over the project.
+
+            repo = REPOS_DIR + os.sep + repo['owner'] + os.sep + repo['name']
+            if os.path.isdir(repo):
+                os.chdir(repo)
+                process = subprocess.run(GREP_COMMAND + [heuristic.file], text=True, capture_output=True)
+
+                if process.stderr:
+                    print(red('error.'))
+                    print(process.stderr)
+                    exit(1)
+                else:
+
+                    # TODO: save process.stdout
+
+                    print(green('ok.'))
+            else:
+                print(red('not found.'))
+
+        i += 1
+
+    print('Deleting missing heuristics...')
+    # TODO: remove from the DB the heuristics that were removed from the directory.
+
+    print("\nFinished.")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/collection/filter.ipynb b/src/filter.ipynb
@@ -28,8 +28,9 @@
     }
    ],
    "source": [
+    "from util import PROJECTS_FILE, FILTERED_FILE\n",
     "#reads projects from Excel file\n",
-    "df = pd.read_excel('../docs/projects.xlsx', keep_default_na=False)\n",
+    "df = pd.read_excel(PROJECTS_FILE, keep_default_na=False)\n",
     "len(df)"
    ]
   },
@@ -2136,7 +2137,7 @@
     "#removes timezone from dates, since Excel does not know how to handle that\n",
     "df.createdAt = pd.to_datetime(df.createdAt).dt.tz_localize(None) \n",
     "df.pushedAt = pd.to_datetime(df.pushedAt).dt.tz_localize(None)\n",
-    "df.to_excel('../docs/filtered.xlsx', index=False)"
+    "df.to_excel(FILTERED_FILE, index=False)"
    ]
   },
   {
@@ -2997,8 +2998,17 @@
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
    "version": "3.7.3"
+  },
+  "pycharm": {
+   "stem_cell": {
+    "cell_type": "raw",
+    "source": [],
+    "metadata": {
+     "collapsed": false
+    }
+   }
   }
  },
  "nbformat": 4,
  "nbformat_minor": 2
-}
+}