New scrape-dependents command, refs #34
simonw committed Apr 30, 2020
1 parent c34d5a1 · commit c9f4840
Showing 5 changed files with 162 additions and 2 deletions.
13 changes: 13 additions & 0 deletions README.md
@@ -1,6 +1,7 @@
# github-to-sqlite

[![PyPI](https://img.shields.io/pypi/v/github-to-sqlite.svg)](https://pypi.org/project/github-to-sqlite/)
[![Changelog](https://img.shields.io/github/v/release/dogsheep/github-to-sqlite?include_prereleases&label=changelog)](https://github.com/dogsheep/github-to-sqlite/releases)
[![CircleCI](https://circleci.com/gh/dogsheep/github-to-sqlite.svg?style=svg)](https://circleci.com/gh/dogsheep/github-to-sqlite)
[![License](https://img.shields.io/badge/license-Apache%202.0-blue.svg)](https://github.com/dogsheep/github-to-sqlite/blob/master/LICENSE)

@@ -95,3 +96,15 @@ The `starred` command fetches the repos that have been starred by a user.
$ github-to-sqlite starred github.db simonw

If you are using an `auth.json` file you can omit the username to retrieve the starred repos for the authenticated user.

## Scraping dependents for a repository

The GitHub dependency graph can show other GitHub projects that depend on a specific repo, for example [simonw/datasette/network/dependents](https://github.com/simonw/datasette/network/dependents).

This data is not yet available through the GitHub API. The `scrape-dependents` command scrapes those pages and uses the GitHub API to fetch the full record for each dependent repository.

$ github-to-sqlite scrape-dependents github.db simonw/datasette

The command accepts one or more repositories, specified as `owner/repo`.

Add `-v` for verbose output.
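
Once a scrape completes, each dependency is stored in the `dependents` table as a `(repo, dependent)` pair of repository IDs. A minimal sketch of pulling the results back out with the `sqlite-utils` Python API (assuming the `github.db` file from the example above):

    import sqlite_utils

    db = sqlite_utils.Database("github.db")
    # Join back to repos to turn IDs into readable names:
    sql = """
        select repos.full_name, dependents.first_seen_utc
        from dependents
        join repos on repos.id = dependents.dependent
        order by dependents.first_seen_utc
    """
    for full_name, first_seen in db.conn.execute(sql).fetchall():
        print(full_name, first_seen)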
64 changes: 64 additions & 0 deletions github_to_sqlite/cli.py
@@ -1,4 +1,5 @@
import click
import datetime
import pathlib
import os
import sqlite_utils
@@ -268,6 +269,69 @@ def stop_when(commit):
utils.ensure_fts(db)


@cli.command(name="scrape-dependents")
@click.argument(
"db_path",
type=click.Path(file_okay=True, dir_okay=False, allow_dash=False),
required=True,
)
@click.argument("repos", type=str, nargs=-1)
@click.option(
"-a",
"--auth",
type=click.Path(file_okay=True, dir_okay=False, allow_dash=True),
default="auth.json",
help="Path to auth.json token file",
)
@click.option(
"-v", "--verbose", is_flag=True, help="Verbose output",
)
def scrape_dependents(db_path, repos, auth, verbose):
"Scrape dependents for specified repos"
try:
import bs4
except ImportError:
raise click.ClickException("Optional dependency bs4 is needed for this command")
db = sqlite_utils.Database(db_path)
token = load_token(auth)

for repo in repos:
repo_full = utils.fetch_repo(repo, token)
utils.save_repo(db, repo_full)

for dependent_repo in utils.scrape_dependents(repo, verbose):
# Don't fetch repo details if it's already in our DB
existing = list(db["repos"].rows_where("full_name = ?", [dependent_repo]))
dependent_id = None
if not existing:
dependent_full = utils.fetch_repo(dependent_repo, token)
                time.sleep(1)  # Pause briefly between GitHub API calls
utils.save_repo(db, dependent_full)
dependent_id = dependent_full["id"]
else:
dependent_id = existing[0]["id"]
# Only insert if it isn't already there:
if not db["dependents"].exists() or not list(
db["dependents"].rows_where(
"repo = ? and dependent = ?", [repo_full["id"], dependent_id]
)
):
db["dependents"].insert(
{
"repo": repo_full["id"],
"dependent": dependent_id,
"first_seen_utc": datetime.datetime.utcnow().isoformat(),
},
pk=("repo", "dependent"),
foreign_keys=(
("repo", "repos", "id"),
("dependent", "repos", "id"),
),
)

utils.ensure_fts(db)


def load_token(auth):
try:
token = json.load(open(auth))["github_personal_token"]
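
The existence checks above mean re-running the command will not insert duplicate `dependents` rows, and the table gets a compound primary key of `(repo, dependent)` with foreign keys into `repos`. A small sketch for confirming the generated layout, using the same `sqlite-utils` API the command writes with:

    import sqlite_utils

    db = sqlite_utils.Database("github.db")
    # Print the CREATE TABLE statement sqlite-utils generated:
    print(db["dependents"].schema)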
34 changes: 33 additions & 1 deletion github_to_sqlite/utils.py
@@ -1,4 +1,5 @@
import requests
import time


class GitHubError(Exception):
@@ -156,7 +157,9 @@ def fetch_repo(full_name, token=None):
headers["Accept"] = "application/vnd.github.mercy-preview+json"
owner, slug = full_name.split("/")
url = "https://api.github.com/repos/{}/{}".format(owner, slug)
-    return requests.get(url, headers=headers).json()
+    response = requests.get(url, headers=headers)
+    response.raise_for_status()
+    return response.json()


def save_repo(db, repo):
@@ -447,3 +450,32 @@ def ensure_fts(db):
if table not in existing_tables:
continue
db[table].enable_fts(columns, create_triggers=True)


def scrape_dependents(repo, verbose=False):
# Optional dependency:
from bs4 import BeautifulSoup

url = "https://github.com/{}/network/dependents".format(repo)
while url:
if verbose:
print(url)
response = requests.get(url)
soup = BeautifulSoup(response.content, "html.parser")
repos = [
a["href"].lstrip("/")
for a in soup.select("a[data-hovercard-type=repository]")
]
if verbose:
print(repos)
yield from repos
# next page?
try:
next_link = soup.select(".paginate-container")[0].find("a", text="Next")
except IndexError:
break
if next_link is not None:
url = next_link["href"]
            time.sleep(1)  # Pause before fetching the next page
else:
url = None
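
`scrape_dependents()` is a generator: it yields `owner/name` strings one page at a time and sleeps for a second between page fetches. That means it can also be used on its own, as in this minimal sketch:

    from github_to_sqlite import utils

    # Stream the names of repositories that depend on simonw/datasette:
    for full_name in utils.scrape_dependents("simonw/datasette", verbose=True):
        print(full_name)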
2 changes: 1 addition & 1 deletion setup.py
@@ -27,6 +27,6 @@ def get_long_description():
github-to-sqlite=github_to_sqlite.cli:cli
""",
install_requires=["sqlite-utils>=2.7", "requests"],
extras_require={"test": ["pytest"]},
extras_require={"test": ["pytest", "requests-mock", "bs4"]},
tests_require=["github-to-sqlite[test]"],
)
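
Note that `bs4` is added only to the test extras here, not to `install_requires`: at runtime the command imports it lazily and raises a `ClickException` if it is missing (see `cli.py` above), which keeps scraping an optional feature. For development the extras install with the standard `pip install -e '.[test]'`.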
51 changes: 51 additions & 0 deletions tests/test_scrape_dependents.py
@@ -0,0 +1,51 @@
from github_to_sqlite import cli
from click.testing import CliRunner
import json
import sqlite_utils
import pathlib

REPO = json.load(open(pathlib.Path(__file__).parent / "repo.json"))


def test_scrape_dependents(requests_mock):
requests_mock.get(
"https://github.com/dogsheep/github-to-sqlite/network/dependents",
text="""
<a data-hovercard-type="repository" href="/simonw/foo">
<a data-hovercard-type="repository" href="/simonw/bar">
<div class="paginate-container">
<a href="https://github.com/dogsheep/github-to-sqlite/network/dependents?dependents_after=abc">Next</a>
</div>
""",
)
requests_mock.get(
"https://github.com/dogsheep/github-to-sqlite/network/dependents?dependents_after=abc",
text="""
<a data-hovercard-type="repository" href="/simonw/baz">
""",
)
requests_mock.get(
"https://api.github.com/repos/dogsheep/github-to-sqlite", json=REPO
)
requests_mock.get(
"https://api.github.com/repos/simonw/foo", json=dict(REPO, id=1),
)
requests_mock.get(
"https://api.github.com/repos/simonw/bar", json=dict(REPO, id=2),
)
requests_mock.get(
"https://api.github.com/repos/simonw/baz", json=dict(REPO, id=3),
)
runner = CliRunner()
with runner.isolated_filesystem():
result = runner.invoke(
cli.cli, ["scrape-dependents", "scrape.db", "dogsheep/github-to-sqlite"]
)
assert 0 == result.exit_code
db = sqlite_utils.Database("scrape.db")
assert {"repos", "dependents"}.issubset(db.table_names())
assert {1, 2, 3, 207052882} == set(
r[0] for r in db.conn.execute("select id from repos").fetchall()
)
pairs = [(r["repo"], r["dependent"]) for r in db["dependents"].rows]
assert [(207052882, 1), (207052882, 2), (207052882, 3)] == pairs
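
The `207052882` in the assertions is the `id` from the `tests/repo.json` fixture (the scraped repository itself), while the three mocked dependents are given ids 1, 2 and 3, so the test exercises both the paginated scrape and the resulting `(repo, dependent)` rows end to end.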
