diff --git a/docs/source/usage.rst b/docs/source/usage.rst
index 6643038e8..bdac9b835 100644
--- a/docs/source/usage.rst
+++ b/docs/source/usage.rst
@@ -14,6 +14,8 @@ Using ``repo2docker``
 follows :ref:`specification`. repo2docker is called with the URL of a Git repository,
 a `DOI <https://en.wikipedia.org/wiki/Digital_object_identifier>`_ from Zenodo or Figshare,
 a `Handle <https://en.wikipedia.org/wiki/Handle_System>`_ or DOI from a Dataverse installation,
+a `SWHID`_ of a directory or a revision archived in the
+`Software Heritage Archive <https://archive.softwareheritage.org>`_,
 or a path to a local directory.
 It then performs these steps:
 
@@ -36,7 +38,8 @@ repo2docker is called with this command::
 where ``<source-repository>`` is:
 
   * a URL of a Git repository (``https://github.com/binder-examples/requirements``),
-  * a Zenodo DOI (``10.5281/zenodo.1211089``), or
+  * a Zenodo DOI (``10.5281/zenodo.1211089``),
+  * a SWHID_ (``swh:1:rev:999dd06c7f679a2714dfe5199bdca09522a29649``), or
   * a path to a local directory (``a/local/directory``)
 
 of the source repository you want to build.
@@ -132,3 +135,4 @@ Command line API
 
 .. _Pytudes:
    https://github.com/norvig/pytudes
+.. _SWHID: https://docs.softwareheritage.org/devel/swh-model/persistent-identifiers.html
diff --git a/repo2docker/app.py b/repo2docker/app.py
index 937e74793..5b553712e 100755
--- a/repo2docker/app.py
+++ b/repo2docker/app.py
@@ -148,6 +148,7 @@ def _default_log_level(self):
             contentproviders.Figshare,
             contentproviders.Dataverse,
             contentproviders.Hydroshare,
+            contentproviders.Swhid,
             contentproviders.Mercurial,
             contentproviders.Git,
         ],
@@ -269,6 +270,18 @@ def _user_name_default(self):
         allow_none=True,
     )
 
+    swh_token = Unicode(
+        None,
+        help="""
+        Token to use authenticated SWH API access.
+
+        If unset, default to unauthenticated (limited) usage of the Software
+        Heritage API.
+        """,
+        config=True,
+        allow_none=True,
+    )
+
     cleanup_checkout = Bool(
         False,
         help="""
@@ -395,26 +408,29 @@ def fetch(self, url, ref, checkout_path):
                 "No matching content provider found for " "{url}.".format(url=url)
             )
 
+        swh_token = self.config.get("swh_token", self.swh_token)
+        if swh_token and isinstance(picked_content_provider, contentproviders.Swhid):
+            picked_content_provider.set_auth_token(swh_token)
+
         for log_line in picked_content_provider.fetch(
             spec, checkout_path, yield_output=self.json_logs
         ):
             self.log.info(log_line, extra=dict(phase="fetching"))
 
         if not self.output_image_spec:
-            self.output_image_spec = (
-                "r2d" + escapism.escape(self.repo, escape_char="-").lower()
-            )
+            image_spec = "r2d" + self.repo
             # if we are building from a subdirectory include that in the
             # image name so we can tell builds from different sub-directories
             # apart.
             if self.subdir:
-                self.output_image_spec += escapism.escape(
-                    self.subdir, escape_char="-"
-                ).lower()
+                image_spec += self.subdir
             if picked_content_provider.content_id is not None:
-                self.output_image_spec += picked_content_provider.content_id
+                image_spec += picked_content_provider.content_id
             else:
-                self.output_image_spec += str(int(time.time()))
+                image_spec += str(int(time.time()))
+            self.output_image_spec = escapism.escape(
+                image_spec, escape_char="-"
+            ).lower()
 
     def json_excepthook(self, etype, evalue, traceback):
         """Called on an uncaught exception when using json logging
diff --git a/repo2docker/contentproviders/__init__.py b/repo2docker/contentproviders/__init__.py
index ae0b8c27c..6398c233e 100755
--- a/repo2docker/contentproviders/__init__.py
+++ b/repo2docker/contentproviders/__init__.py
@@ -5,3 +5,4 @@
 from .dataverse import Dataverse
 from .hydroshare import Hydroshare
 from .mercurial import Mercurial
+from .swhid import Swhid
diff --git a/repo2docker/contentproviders/swhid.py
new file mode 100644
index 000000000..e20501770
--- /dev/null
+++
b/repo2docker/contentproviders/swhid.py
@@ -0,0 +1,152 @@
+import io
+import os
+import shutil
+import tarfile
+import time
+import re
+
+from os import path
+
+import requests
+
+from .base import ContentProvider, ContentProviderException
+from ..utils import copytree
+from .. import __version__
+
+
+def parse_swhid(swhid):
+    """Parse the core identifier of a SWHID.
+
+    Qualifiers (everything after the first ``;``) are ignored. Returns a dict
+    with the ``version``, ``type`` and ``hash`` fields, or ``None`` when
+    *swhid* is not a well-formed SWHID.
+    """
+    swhid_regexp = r"^swh:(?P<version>\d+):(?P<type>ori|cnt|rev|dir|snp|rel):(?P<hash>[0-9a-f]{40})$"
+    # only parse/check the core identifier of the swhid
+    # see https://docs.softwareheritage.org/devel/swh-model/persistent-identifiers.html
+    m = re.match(swhid_regexp, swhid.split(";")[0])
+    if m:
+        return m.groupdict()
+
+
+class Swhid(ContentProvider):
+    """Provide contents of a repository identified by a SWHID."""
+
+    retry_delay = 5
+
+    def __init__(self):
+        self.swhid = None
+        self.base_url = "https://archive.softwareheritage.org/api/1"
+        self.session = requests.Session()
+        self.session.headers.update(
+            {
+                "user-agent": "repo2docker {}".format(__version__),
+            }
+        )
+
+    def set_auth_token(self, token):
+        """Send *token* as a bearer token with every request of the session."""
+        header = {"Authorization": "Bearer {}".format(token)}
+        self.session.headers.update(header)
+
+    def _request(self, url, method="GET"):
+        """Send a request to *url*, making up to three attempts.
+
+        Returns the first successful response or, failing that, the last
+        response received. Raises ``requests.ConnectionError`` when no
+        attempt received a response at all.
+        """
+        if not url.endswith("/"):
+            url = url + "/"
+
+        resp = None
+        last_error = None
+        for _ in range(3):
+            try:
+                resp = self.session.request(method, url)
+                if resp.ok:
+                    break
+            except requests.ConnectionError as exc:
+                last_error = exc
+                time.sleep(self.retry_delay)
+
+        if resp is None:
+            # no attempt got through, so there is no response to hand back
+            raise last_error
+        return resp
+
+    @property
+    def content_id(self):
+        """The SWHID record ID used for content retrieval"""
+        return self.swhid
+
+    def detect(self, swhid, ref=None, extra_args=None):
+        """Return a fetch spec for a version 1 ``dir`` or ``rev`` SWHID, else None."""
+        swhid_dict = parse_swhid(swhid)
+
+        if (
+            swhid_dict
+            and swhid_dict["type"] in ("dir", "rev")
+            and swhid_dict["version"] == "1"
+        ):
+            return {"swhid": swhid, "swhid_obj": swhid_dict}
+
+    def fetch_directory(self, dir_hash, output_dir):
+        """Request directory *dir_hash* from the vault and unpack it in *output_dir*.
+
+        Yields log messages. Raises ``ContentProviderException`` when the
+        vault reports that preparing the directory failed.
+        """
+        url = "{}/vault/directory/{}/".format(self.base_url, dir_hash)
+        yield "Fetching directory {} from {}\n".format(dir_hash, url)
+        resp = self._request(url, "POST")
+        status = resp.json()["status"]
+        while status not in ("failed", "done"):
+            time.sleep(self.retry_delay)
+            resp = self._request(url)
+            status = resp.json()["status"]
+        if status == "failed":
+            yield "Error preparing the directory for download"
+            raise ContentProviderException(
+                "Preparing directory {} for download failed: {}".format(
+                    dir_hash, resp.json()
+                )
+            )
+        resp = self._request(resp.json()["fetch_url"])
+        with tarfile.open(fileobj=io.BytesIO(resp.content)) as archive:
+            archive.extractall(path=output_dir)
+        # the output_dir should have only one subdir named after the dir_hash
+        # move its content one level up
+        copytree(path.join(output_dir, dir_hash), output_dir)
+        shutil.rmtree(path.join(output_dir, dir_hash))
+        yield "Fetched files: {}\n".format(os.listdir(output_dir))
+
+    def fetch(self, spec, output_dir, yield_output=False):
+        """Fetch the directory described by *spec* into *output_dir*.
+
+        A revision is first resolved to its directory, whose SWHID becomes
+        the content id. Yields log messages.
+        """
+        swhid = spec["swhid"]
+        swhid_obj = spec["swhid_obj"]
+
+        if swhid_obj["type"] == "rev":
+            # need to get the directory for this revision
+            sha1git = swhid_obj["hash"]
+            url = "{}/revision/{}/".format(self.base_url, sha1git)
+            yield "Fetching revision {} from {}\n".format(sha1git, url)
+            resp = self._request(url)
+            if not resp.ok:
+                # keep the session headers out of the error: they may carry
+                # the auth token
+                raise ContentProviderException(
+                    "Fetching revision {} failed with status {}: {}".format(
+                        sha1git, resp.status_code, resp.text
+                    )
+                )
+            directory = resp.json()["directory"]
+            self.swhid = "swh:1:dir:{}".format(directory)
+            yield from self.fetch_directory(directory, output_dir)
+        elif swhid_obj["type"] == "dir":
+            self.swhid = swhid
+            yield from self.fetch_directory(swhid_obj["hash"], output_dir)
diff --git a/setup.py b/setup.py
index 8bfe64ebc..dab829d70 100644
--- a/setup.py
+++ b/setup.py
@@ -56,6 +56,7 @@ def get_identifier(json):
         "ruamel.yaml>=0.15",
         "toml",
         "semver",
+        "requests",
     ],
     python_requires=">=3.6",
     author="Project Jupyter Contributors",
diff --git a/tests/unit/contentproviders/test_swhid.py b/tests/unit/contentproviders/test_swhid.py
new file mode 100644
index 000000000..953218e3c
--- /dev/null
+++ b/tests/unit/contentproviders/test_swhid.py
@@ -0,0 +1,157 @@
+import json
+import os
+import io
+import tarfile
+import shutil
+import re
+import urllib
+import pytest
+import tempfile
+import logging
+import
requests_mock + +from os import makedirs +from os.path import join +from unittest.mock import patch, MagicMock, mock_open +from zipfile import ZipFile + +from repo2docker.contentproviders.swhid import Swhid, parse_swhid +from repo2docker.contentproviders.base import ContentProviderException + + +# this is a slightly stripped down copy of swh.model.cli.swhid_of_dir(). +# We do not use this later to prevent having to depend on swh.model[cli] +def swhid_of_dir(path): + object = Directory.from_disk(path=path).get_data() + return swhid(DIRECTORY, object) + + +def test_content_id(): + swhid = Swhid() + assert swhid.content_id is None + + +swhids_ok = [ + "swh:1:dir:" + "0" * 40, + "swh:1:rev:" + "0" * 40, +] +swhids_invalid = [ + "swh:1:dir:" + "0" * 39, + "swh:2:dir:" + "0" * 40, + "swh:1:rev:" + "0" * 41, + "swh:1:cnt:" + "0" * 40, + "swh:1:ori:" + "0" * 40, + "swh:1:rel:" + "0" * 40, + "swh:1:snp:" + "0" * 40, +] + +detect_values = [ + (swhid, {"swhid": swhid, "swhid_obj": parse_swhid(swhid)}) for swhid in swhids_ok +] + [(swhid, None) for swhid in swhids_invalid] + + +@pytest.mark.parametrize("swhid, expected", detect_values) +def test_detect(swhid, expected): + provider = Swhid() + assert provider.detect(swhid) == expected + + +def fake_urlopen(req): + print(req) + return req.headers + + +def test_unresolving_swhid(): + provider = Swhid() + + # swhid = "0" * 40 + # assert provider.swhid2url(swhid) is swhid + + +NULLID = "0" * 40 + + +@pytest.fixture +def gen_tarfile(tmpdir): + rootdir = join(tmpdir, "tmp") + makedirs(rootdir) + with open(join(rootdir, "file1.txt"), "wb") as fobj: + fobj.write(b"Some content\n") + + # this directory hash can be computed using the swh.model package, but we do + # nto want to depend on this later to limit dependencies and because it + # does not support python 3.6; + dirhash = "89a3bd29a2c5ae0b1465febbe5df09730a8576fe" + buf = io.BytesIO() + tarf = tarfile.open(name=dirhash, fileobj=buf, mode="w") + tarf.add(rootdir, arcname=dirhash) + 
tarf.close() + shutil.rmtree(rootdir) + return dirhash, buf.getvalue() + + +def mocked_provider(tmpdir, dirhash, tarfile_buf): + provider = Swhid() + adapter = requests_mock.Adapter() + provider.base_url = "mock://api/1" + provider.retry_delay = 0.1 + provider.session.mount("mock://", adapter) + + adapter.register_uri( + "GET", + "mock://api/1/revision/{}/".format(NULLID), + json={ + "author": {"fullname": "John Doe "}, + "directory": dirhash, + }, + ) + adapter.register_uri( + "POST", + "mock://api/1/vault/directory/{}/".format(dirhash), + json={ + "fetch_url": "mock://api/1/vault/directory/{}/raw/".format(dirhash), + "status": "new", + }, + ) + adapter.register_uri( + "GET", + "mock://api/1/vault/directory/{}/".format(dirhash), + [ + { + "json": { + "fetch_url": "mock://api/1/vault/directory/{}/raw/".format(dirhash), + "status": "pending", + } + }, + { + "json": { + "fetch_url": "mock://api/1/vault/directory/{}/raw/".format(dirhash), + "status": "done", + } + }, + ], + ) + adapter.register_uri( + "GET", + "mock://api/1/vault/directory/{}/raw/".format(dirhash), + content=tarfile_buf, + ) + return provider + + +def test_fetch_revision(tmpdir, gen_tarfile): + dir_id, tarfile_buf = gen_tarfile + provider = mocked_provider(tmpdir, dir_id, tarfile_buf) + swhid = "swh:1:rev:" + NULLID + for log in provider.fetch(provider.detect(swhid), tmpdir): + print(log) + assert provider.content_id == "swh:1:dir:" + dir_id + + +def test_fetch_directory(tmpdir, gen_tarfile): + dir_id, tarfile_buf = gen_tarfile + provider = mocked_provider(tmpdir, dir_id, tarfile_buf) + swhid = "swh:1:dir:" + dir_id + for log in provider.fetch(provider.detect(swhid), tmpdir): + print(log) + assert provider.content_id == swhid