New scrape-dependents command, refs #34
simonw committed Apr 30, 2020
1 parent c34d5a1 · commit c9f4840
Showing 5 changed files with 162 additions and 2 deletions.
13 changes: 13 additions & 0 deletions README.md
@@ -1,6 +1,7 @@
# github-to-sqlite

[![PyPI](https://img.shields.io/pypi/v/github-to-sqlite.svg)](https://pypi.org/project/github-to-sqlite/)
[![Changelog](https://img.shields.io/github/v/release/dogsheep/github-to-sqlite?include_prereleases&label=changelog)](https://github.com/dogsheep/github-to-sqlite/releases)
[![CircleCI](https://circleci.com/gh/dogsheep/github-to-sqlite.svg?style=svg)](https://circleci.com/gh/dogsheep/github-to-sqlite)
[![License](https://img.shields.io/badge/license-Apache%202.0-blue.svg)](https://github.com/dogsheep/github-to-sqlite/blob/master/LICENSE)

@@ -95,3 +96,15 @@ The `starred` command fetches the repos that have been starred by a user.
$ github-to-sqlite starred github.db simonw

If you are using an `auth.json` file you can omit the username to retrieve the starred repos for the authenticated user.

## Scraping dependents for a repository

The GitHub dependency graph can show other GitHub projects that depend on a specific repo, for example [simonw/datasette/network/dependents](https://github.com/simonw/datasette/network/dependents).

This data is not yet available through the GitHub API. The `scrape-dependents` command scrapes those pages and uses the GitHub API to fetch the full record for each dependent repository.

$ github-to-sqlite scrape-dependents github.db simonw/datasette

The command accepts one or more repositories, specified as `owner/repo`.

Add `-v` for verbose output.
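
Once a scrape completes, each dependency is stored in the `dependents` table as a `(repo, dependent)` pair of repository IDs. A minimal sketch of pulling the results back out with the `sqlite-utils` Python API (assuming the `github.db` file from the example above):

    import sqlite_utils

    db = sqlite_utils.Database("github.db")
    # Join back to repos to turn IDs into readable names:
    sql = """
        select repos.full_name, dependents.first_seen_utc
        from dependents
        join repos on repos.id = dependents.dependent
        order by dependents.first_seen_utc
    """
    for full_name, first_seen in db.conn.execute(sql).fetchall():
        print(full_name, first_seen)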
64 changes: 64 additions & 0 deletions github_to_sqlite/cli.py
@@ -1,4 +1,5 @@
import click
import datetime
import pathlib
import os
import sqlite_utils
@@ -268,6 +269,69 @@ def stop_when(commit):
utils.ensure_fts(db)


@cli.command(name="scrape-dependents")
@click.argument(
"db_path",
type=click.Path(file_okay=True, dir_okay=False, allow_dash=False),
required=True,
)
@click.argument("repos", type=str, nargs=-1)
@click.option(
"-a",
"--auth",
type=click.Path(file_okay=True, dir_okay=False, allow_dash=True),
default="auth.json",
help="Path to auth.json token file",
)
@click.option(
"-v", "--verbose", is_flag=True, help="Verbose output",
)
def scrape_dependents(db_path, repos, auth, verbose):
"Scrape dependents for specified repos"
try:
import bs4
except ImportError:
raise click.ClickException("Optional dependency bs4 is needed for this command")
db = sqlite_utils.Database(db_path)
token = load_token(auth)

for repo in repos:
repo_full = utils.fetch_repo(repo, token)
utils.save_repo(db, repo_full)

for dependent_repo in utils.scrape_dependents(repo, verbose):
# Don't fetch repo details if it's already in our DB
existing = list(db["repos"].rows_where("full_name = ?", [dependent_repo]))
dependent_id = None
if not existing:
dependent_full = utils.fetch_repo(dependent_repo, token)
                time.sleep(1)  # Pause briefly between GitHub API calls
utils.save_repo(db, dependent_full)
dependent_id = dependent_full["id"]
else:
dependent_id = existing[0]["id"]
# Only insert if it isn't already there:
if not db["dependents"].exists() or not list(
db["dependents"].rows_where(
"repo = ? and dependent = ?", [repo_full["id"], dependent_id]
)
):
db["dependents"].insert(
{
"repo": repo_full["id"],
"dependent": dependent_id,
"first_seen_utc": datetime.datetime.utcnow().isoformat(),
},
pk=("repo", "dependent"),
foreign_keys=(
("repo", "repos", "id"),
("dependent", "repos", "id"),
),
)

utils.ensure_fts(db)


def load_token(auth):
try:
token = json.load(open(auth))["github_personal_token"]
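
The existence checks above mean re-running the command will not insert duplicate `dependents` rows, and the table gets a compound primary key of `(repo, dependent)` with foreign keys into `repos`. A small sketch for confirming the generated layout, using the same `sqlite-utils` API the command writes with:

    import sqlite_utils

    db = sqlite_utils.Database("github.db")
    # Print the CREATE TABLE statement sqlite-utils generated:
    print(db["dependents"].schema)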
34 changes: 33 additions & 1 deletion github_to_sqlite/utils.py
@@ -1,4 +1,5 @@
import requests
import time


class GitHubError(Exception):
@@ -156,7 +157,9 @@ def fetch_repo(full_name, token=None):
headers["Accept"] = "application/vnd.github.mercy-preview+json"
owner, slug = full_name.split("/")
url = "https://api.github.com/repos/{}/{}".format(owner, slug)
-    return requests.get(url, headers=headers).json()
+    response = requests.get(url, headers=headers)
+    response.raise_for_status()
+    return response.json()


def save_repo(db, repo):
@@ -447,3 +450,32 @@ def ensure_fts(db):
if table not in existing_tables:
continue
db[table].enable_fts(columns, create_triggers=True)


def scrape_dependents(repo, verbose=False):
# Optional dependency:
from bs4 import BeautifulSoup

url = "https://github.com/{}/network/dependents".format(repo)
while url:
if verbose:
print(url)
response = requests.get(url)
soup = BeautifulSoup(response.content, "html.parser")
repos = [
a["href"].lstrip("/")
for a in soup.select("a[data-hovercard-type=repository]")
]
if verbose:
print(repos)
yield from repos
# next page?
try:
next_link = soup.select(".paginate-container")[0].find("a", text="Next")
except IndexError:
break
if next_link is not None:
url = next_link["href"]
            time.sleep(1)  # Pause before fetching the next page
else:
url = None
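
`scrape_dependents()` is a generator: it yields `owner/name` strings one page at a time and sleeps for a second between page fetches. That means it can also be used on its own, as in this minimal sketch:

    from github_to_sqlite import utils

    # Stream the names of repositories that depend on simonw/datasette:
    for full_name in utils.scrape_dependents("simonw/datasette", verbose=True):
        print(full_name)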
2 changes: 1 addition & 1 deletion setup.py
@@ -27,6 +27,6 @@ def get_long_description():
github-to-sqlite=github_to_sqlite.cli:cli
""",
install_requires=["sqlite-utils>=2.7", "requests"],
extras_require={"test": ["pytest"]},
extras_require={"test": ["pytest", "requests-mock", "bs4"]},
tests_require=["github-to-sqlite[test]"],
)
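
Note that `bs4` is added only to the test extras here, not to `install_requires`: at runtime the command imports it lazily and raises a `ClickException` if it is missing (see `cli.py` above), which keeps scraping an optional feature. For development the extras install with the standard `pip install -e '.[test]'`.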
51 changes: 51 additions & 0 deletions tests/test_scrape_dependents.py
@@ -0,0 +1,51 @@
from github_to_sqlite import cli
from click.testing import CliRunner
import json
import sqlite_utils
import pathlib

REPO = json.load(open(pathlib.Path(__file__).parent / "repo.json"))


def test_scrape_dependents(requests_mock):
requests_mock.get(
"https://github.com/dogsheep/github-to-sqlite/network/dependents",
text="""
<a data-hovercard-type="repository" href="/simonw/foo">
<a data-hovercard-type="repository" href="/simonw/bar">
<div class="paginate-container">
<a href="https://github.com/dogsheep/github-to-sqlite/network/dependents?dependents_after=abc">Next</a>
</div>
""",
)
requests_mock.get(
"https://github.com/dogsheep/github-to-sqlite/network/dependents?dependents_after=abc",
text="""
<a data-hovercard-type="repository" href="/simonw/baz">
""",
)
requests_mock.get(
"https://api.github.com/repos/dogsheep/github-to-sqlite", json=REPO
)
requests_mock.get(
"https://api.github.com/repos/simonw/foo", json=dict(REPO, id=1),
)
requests_mock.get(
"https://api.github.com/repos/simonw/bar", json=dict(REPO, id=2),
)
requests_mock.get(
"https://api.github.com/repos/simonw/baz", json=dict(REPO, id=3),
)
runner = CliRunner()
with runner.isolated_filesystem():
result = runner.invoke(
cli.cli, ["scrape-dependents", "scrape.db", "dogsheep/github-to-sqlite"]
)
assert 0 == result.exit_code
db = sqlite_utils.Database("scrape.db")
assert {"repos", "dependents"}.issubset(db.table_names())
assert {1, 2, 3, 207052882} == set(
r[0] for r in db.conn.execute("select id from repos").fetchall()
)
pairs = [(r["repo"], r["dependent"]) for r in db["dependents"].rows]
assert [(207052882, 1), (207052882, 2), (207052882, 3)] == pairs
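
The `207052882` in the assertions is the `id` from the `tests/repo.json` fixture (the scraped repository itself), while the three mocked dependents are given ids 1, 2 and 3, so the test exercises both the paginated scrape and the resulting `(repo, dependent)` rows end to end.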
