-
Notifications
You must be signed in to change notification settings - Fork 380
Add a contentprovider for Software Heritage persistent ID (SWHID) #988
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from all commits
Commits
Show all changes
3 commits
Select commit
Hold shift + click to select a range
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,113 @@ | ||
import io | ||
import os | ||
import shutil | ||
import tarfile | ||
import time | ||
import re | ||
|
||
from os import path | ||
|
||
import requests | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We currently don't depend on |
||
|
||
from .base import ContentProvider | ||
from ..utils import copytree | ||
from .. import __version__ | ||
|
||
|
||
def parse_swhid(swhid):
    """Parse the core part of a SWHID string.

    Only the text before the first ";" (the <identifier_core>) is checked;
    any qualifiers after it are ignored. See
    https://docs.softwareheritage.org/devel/swh-model/persistent-identifiers.html

    Returns a dict with the keys "version", "type" and "hash" when the core
    matches, and None otherwise.
    """
    core, _, _qualifiers = swhid.partition(";")
    pattern = r"^swh:(?P<version>\d+):(?P<type>ori|cnt|rev|dir|snp|rel):(?P<hash>[0-9a-f]{40})$"
    matched = re.match(pattern, core)
    return matched.groupdict() if matched else None
|
||
|
||
class Swhid(ContentProvider):
    """Provide contents of a repository identified by a SWHID."""

    # Seconds slept between connection retries and between vault status polls.
    retry_delay = 5

    def __init__(self):
        # Set by fetch() to the SWHID of the directory that was fetched.
        self.swhid = None
        self.base_url = "https://archive.softwareheritage.org/api/1"
        self.session = requests.Session()
        self.session.headers.update(
            {
                "user-agent": "repo2docker {}".format(__version__),
            }
        )

    def set_auth_token(self, token):
        """Send `token` as a Bearer Authorization header on every request."""
        header = {"Authorization": "Bearer {}".format(token)}
        self.session.headers.update(header)

    def _request(self, url, method="GET"):
        """Issue `method` on `url` (a trailing "/" is appended if missing).

        The request is attempted up to three times. A response with an OK
        status is returned immediately; otherwise the most recent response
        obtained is returned so the caller can inspect it.

        Raises requests.ConnectionError if no attempt produced a response.
        """
        if not url.endswith("/"):
            url = url + "/"

        attempts = 3
        resp = None
        last_error = None
        for attempt in range(attempts):
            try:
                resp = self.session.request(method, url)
            except requests.ConnectionError as exc:
                last_error = exc
                # No point in waiting when there is no attempt left.
                if attempt < attempts - 1:
                    time.sleep(self.retry_delay)
                continue
            if resp.ok:
                break

        if resp is None:
            # Every attempt failed to connect: surface the real error
            # instead of failing on an unbound local variable.
            raise last_error
        return resp

    @property
    def content_id(self):
        """The SWHID record ID used for content retrieval"""
        return self.swhid

    def detect(self, swhid, ref=None, extra_args=None):
        """Return a spec dict for version 1 "dir" or "rev" SWHIDs.

        The spec has the keys "swhid" (the string as given) and "swhid_obj"
        (the result of parse_swhid). Any other input yields None.
        """
        swhid_dict = parse_swhid(swhid)

        if (
            swhid_dict
            and swhid_dict["type"] in ("dir", "rev")
            and swhid_dict["version"] == "1"
        ):
            return {"swhid": swhid, "swhid_obj": swhid_dict}

    def fetch_directory(self, dir_hash, output_dir):
        """Download the directory `dir_hash` and unpack it into `output_dir`.

        Generator yielding progress messages. It POSTs to the vault
        endpoint, polls it with GET every `retry_delay` seconds until the
        reported status is "done" or "failed", then downloads the tarball
        from the advertised "fetch_url" and extracts it.

        Raises AssertionError if the initial request is reported as failed
        and Exception if the status turns to "failed" while polling.
        """
        url = "{}/vault/directory/{}/".format(self.base_url, dir_hash)
        yield "Fetching directory {} from {}\n".format(dir_hash, url)
        resp = self._request(url, "POST")
        receipt = resp.json()
        status = receipt["status"]
        assert status != "failed", receipt
        while status not in ("failed", "done"):
            time.sleep(self.retry_delay)
            resp = self._request(url)
            receipt = resp.json()
            status = receipt["status"]
        if status == "failed":
            yield "Error preparing the directory for download"
            raise Exception("Error preparing the directory for download")
        resp = self._request(receipt["fetch_url"])
        # NOTE(review): extractall() on downloaded data trusts the member
        # paths in the archive; consider an extraction filter where the
        # supported Python versions provide one.
        with tarfile.open(fileobj=io.BytesIO(resp.content)) as archive:
            archive.extractall(path=output_dir)
        # the output_dir should have only one subdir named after the dir_hash
        # move its content one level up
        copytree(path.join(output_dir, dir_hash), output_dir)
        shutil.rmtree(path.join(output_dir, dir_hash))
        yield "Fetched files: {}\n".format(os.listdir(output_dir))

    def fetch(self, spec, output_dir, yield_output=False):
        """Fetch the content described by `spec` into `output_dir`.

        `spec` is the dict returned by detect(). For a "rev" SWHID the
        revision is looked up first to find its directory; `self.swhid` is
        then set to the SWHID of the directory actually fetched. Generator
        yielding progress messages.
        """
        swhid = spec["swhid"]
        swhid_obj = spec["swhid_obj"]

        if swhid_obj["type"] == "rev":
            # need to get the directory for this revision
            sha1git = swhid_obj["hash"]
            url = "{}/revision/{}/".format(self.base_url, sha1git)
            yield "Fetching revision {} from {}\n".format(sha1git, url)
            resp = self._request(url)
            assert resp.ok, (resp.content, self.session.headers)
            directory = resp.json()["directory"]
            self.swhid = "swh:1:dir:{}".format(directory)
            yield from self.fetch_directory(directory, output_dir)
        elif swhid_obj["type"] == "dir":
            self.swhid = swhid
            yield from self.fetch_directory(swhid_obj["hash"], output_dir)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,157 @@ | ||
import json | ||
import os | ||
import io | ||
import tarfile | ||
import shutil | ||
import re | ||
import urllib | ||
import pytest | ||
import tempfile | ||
import logging | ||
import requests_mock | ||
|
||
from os import makedirs | ||
from os.path import join | ||
from unittest.mock import patch, MagicMock, mock_open | ||
from zipfile import ZipFile | ||
|
||
from repo2docker.contentproviders.swhid import Swhid, parse_swhid | ||
from repo2docker.contentproviders.base import ContentProviderException | ||
|
||
|
||
# this is a slightly stripped down copy of swh.model.cli.swhid_of_dir(). | ||
# We do not use this later to prevent having to depend on swh.model[cli] | ||
def swhid_of_dir(path):
    """Build a directory identifier for `path` via Directory.from_disk.

    NOTE(review): Directory, swhid and DIRECTORY are not imported in the
    visible part of this module, so calling this helper would presumably
    raise NameError -- confirm whether it is still needed.
    """
    dir_data = Directory.from_disk(path=path).get_data()
    return swhid(DIRECTORY, dir_data)
|
||
|
||
def test_content_id():
    """A freshly constructed provider reports no content id."""
    provider = Swhid()
    assert provider.content_id is None
|
||
|
||
# Identifiers test_detect expects to be accepted: version 1, "dir" or "rev".
swhids_ok = ["swh:1:{}:{}".format(kind, "0" * 40) for kind in ("dir", "rev")]

# Identifiers test_detect expects to be rejected: wrong hash length,
# wrong version, or an object type other than "dir"/"rev".
swhids_invalid = [
    "swh:1:dir:{}".format("0" * 39),
    "swh:2:dir:{}".format("0" * 40),
    "swh:1:rev:{}".format("0" * 41),
    "swh:1:cnt:{}".format("0" * 40),
    "swh:1:ori:{}".format("0" * 40),
    "swh:1:rel:{}".format("0" * 40),
    "swh:1:snp:{}".format("0" * 40),
]

# (input, expected detect() result) pairs consumed by test_detect.
detect_values = [
    (valid, {"swhid": valid, "swhid_obj": parse_swhid(valid)}) for valid in swhids_ok
]
detect_values.extend((invalid, None) for invalid in swhids_invalid)
|
||
|
||
@pytest.mark.parametrize("swhid, expected", detect_values)
def test_detect(swhid, expected):
    """detect() returns the expected spec, or None for rejected input."""
    detected = Swhid().detect(swhid)
    assert detected == expected
|
||
|
||
def fake_urlopen(req):
    """Print `req` and return its `headers` attribute."""
    print(req)
    request_headers = req.headers
    return request_headers
|
||
|
||
def test_unresolving_swhid():
    """Placeholder test: only checks that a provider can be constructed."""
    # NOTE(review): the checks below are disabled; the Swhid class shown
    # in this change defines no swhid2url method itself.
    # swhid = "0" * 40
    # assert provider.swhid2url(swhid) is swhid
    Swhid()
|
||
|
||
# 40-character all-zero hash used as the revision id in the fetch tests.
NULLID = "".join("0" for _ in range(40))
|
||
|
||
@pytest.fixture
def gen_tarfile(tmpdir):
    """Return (dirhash, tar bytes) for a one-file directory.

    A temporary "tmp" directory holding file1.txt is created under
    `tmpdir`, archived in memory under the name of `dirhash`, and removed
    again before returning.
    """
    # this directory hash can be computed using the swh.model package, but we do
    # not want to depend on this later to limit dependencies and because it
    # does not support python 3.6;
    dirhash = "89a3bd29a2c5ae0b1465febbe5df09730a8576fe"

    source_dir = join(tmpdir, "tmp")
    makedirs(source_dir)
    with open(join(source_dir, "file1.txt"), "wb") as handle:
        handle.write(b"Some content\n")

    buffer = io.BytesIO()
    with tarfile.open(name=dirhash, fileobj=buffer, mode="w") as archive:
        archive.add(source_dir, arcname=dirhash)
    shutil.rmtree(source_dir)
    return dirhash, buffer.getvalue()
|
||
|
||
def mocked_provider(tmpdir, dirhash, tarfile_buf):
    """Return a Swhid provider whose HTTP session is served by requests_mock.

    Registered endpoints, all under "mock://api/1":
    - GET on the revision NULLID, answering with `dirhash` as its directory;
    - POST on the vault directory, answering with status "new";
    - GET on the vault directory, answering "pending" and then "done";
    - GET on the raw archive, answering with `tarfile_buf`.
    """
    api_root = "mock://api/1"
    revision_url = "{}/revision/{}/".format(api_root, NULLID)
    vault_url = "{}/vault/directory/{}/".format(api_root, dirhash)
    raw_url = "{}raw/".format(vault_url)

    adapter = requests_mock.Adapter()
    adapter.register_uri(
        "GET",
        revision_url,
        json={
            "author": {"fullname": "John Doe <jdoe@example.com>"},
            "directory": dirhash,
        },
    )
    adapter.register_uri(
        "POST",
        vault_url,
        json={"fetch_url": raw_url, "status": "new"},
    )
    # Successive GETs walk through this list of responses in order.
    adapter.register_uri(
        "GET",
        vault_url,
        [
            {"json": {"fetch_url": raw_url, "status": state}}
            for state in ("pending", "done")
        ],
    )
    adapter.register_uri("GET", raw_url, content=tarfile_buf)

    provider = Swhid()
    provider.base_url = api_root
    # Keep the polling delay short so the tests stay fast.
    provider.retry_delay = 0.1
    provider.session.mount("mock://", adapter)
    return provider
|
||
|
||
def test_fetch_revision(tmpdir, gen_tarfile):
    """Fetching a "rev" SWHID sets content_id to its directory's SWHID."""
    dirhash, archive_bytes = gen_tarfile
    provider = mocked_provider(tmpdir, dirhash, archive_bytes)
    spec = provider.detect("swh:1:rev:" + NULLID)
    for message in provider.fetch(spec, tmpdir):
        print(message)
    assert provider.content_id == "swh:1:dir:" + dirhash
|
||
|
||
def test_fetch_directory(tmpdir, gen_tarfile):
    """Fetching a "dir" SWHID keeps that SWHID as the content_id."""
    dirhash, archive_bytes = gen_tarfile
    provider = mocked_provider(tmpdir, dirhash, archive_bytes)
    directory_swhid = "swh:1:dir:" + dirhash
    spec = provider.detect(directory_swhid)
    for message in provider.fetch(spec, tmpdir):
        print(message)
    assert provider.content_id == directory_swhid
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.