Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add a contentprovider for Software Heritage persistent ID (SWHID) #988

Merged
merged 3 commits into from
Jan 26, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion docs/source/usage.rst
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ Using ``repo2docker``
follows :ref:`specification`. repo2docker is called with the URL of a Git repository,
a `DOI <https://en.wikipedia.org/wiki/Digital_object_identifier>`_ from Zenodo or Figshare,
a `Handle <https://en.wikipedia.org/wiki/Handle_System>`_ or DOI from a Dataverse installation,
a `SWHID`_ of a directory or a revision archived in the
`Software Heritage Archive <https://archive.softwareheritage.org>`_,
or a path to a local directory.

It then performs these steps:
Expand All @@ -36,7 +38,8 @@ repo2docker is called with this command::
where ``<source-repository>`` is:

* a URL of a Git repository (``https://github.com/binder-examples/requirements``),
* a Zenodo DOI (``10.5281/zenodo.1211089``), or
* a Zenodo DOI (``10.5281/zenodo.1211089``),
* a SWHID_ (``swh:1:rev:999dd06c7f679a2714dfe5199bdca09522a29649``), or
* a path to a local directory (``a/local/directory``)

of the source repository you want to build.
Expand Down Expand Up @@ -132,3 +135,4 @@ Command line API


.. _Pytudes: https://github.com/norvig/pytudes
.. _SWHID: https://docs.softwareheritage.org/devel/swh-model/persistent-identifiers.html
32 changes: 24 additions & 8 deletions repo2docker/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,7 @@ def _default_log_level(self):
contentproviders.Figshare,
contentproviders.Dataverse,
contentproviders.Hydroshare,
contentproviders.Swhid,
contentproviders.Mercurial,
contentproviders.Git,
],
Expand Down Expand Up @@ -269,6 +270,18 @@ def _user_name_default(self):
allow_none=True,
)

# Optional Software Heritage API token (configurable, may be None). When it is
# set and the picked content provider is a Swhid instance, fetch() hands it to
# the provider through set_auth_token().
swh_token = Unicode(
None,
help="""
Token to use authenticated SWH API access.

If unset, default to unauthenticated (limited) usage of the Software
Heritage API.
""",
config=True,
allow_none=True,
)

cleanup_checkout = Bool(
False,
help="""
Expand Down Expand Up @@ -395,26 +408,29 @@ def fetch(self, url, ref, checkout_path):
"No matching content provider found for " "{url}.".format(url=url)
)

swh_token = self.config.get("swh_token", self.swh_token)
if swh_token and isinstance(picked_content_provider, contentproviders.Swhid):
picked_content_provider.set_auth_token(swh_token)

for log_line in picked_content_provider.fetch(
spec, checkout_path, yield_output=self.json_logs
):
self.log.info(log_line, extra=dict(phase="fetching"))

if not self.output_image_spec:
self.output_image_spec = (
"r2d" + escapism.escape(self.repo, escape_char="-").lower()
)
image_spec = "r2d" + self.repo
# if we are building from a subdirectory include that in the
# image name so we can tell builds from different sub-directories
# apart.
if self.subdir:
self.output_image_spec += escapism.escape(
self.subdir, escape_char="-"
).lower()
image_spec += self.subdir
if picked_content_provider.content_id is not None:
self.output_image_spec += picked_content_provider.content_id
image_spec += picked_content_provider.content_id
else:
self.output_image_spec += str(int(time.time()))
image_spec += str(int(time.time()))
self.output_image_spec = escapism.escape(
image_spec, escape_char="-"
).lower()
betatim marked this conversation as resolved.
Show resolved Hide resolved

def json_excepthook(self, etype, evalue, traceback):
"""Called on an uncaught exception when using json logging
Expand Down
1 change: 1 addition & 0 deletions repo2docker/contentproviders/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,4 @@
from .dataverse import Dataverse
from .hydroshare import Hydroshare
from .mercurial import Mercurial
from .swhid import Swhid
113 changes: 113 additions & 0 deletions repo2docker/contentproviders/swhid.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
import io
import os
import shutil
import tarfile
import time
import re

from os import path

import requests
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We currently don't depend on requests and have used the standard library urllib to make HTTP requests. We should take a moment to review if we want to take on the additional maintenance cost of a new dependency vs sticking with using urllib.


from .base import ContentProvider, ContentProviderException
from ..utils import copytree
from .. import __version__


def parse_swhid(swhid):
    """Parse the core part of a Software Heritage persistent identifier.

    Only the ``<identifier_core>`` (everything before the first ``;``) is
    checked; qualifiers such as ``;origin=...`` are ignored. See
    https://docs.softwareheritage.org/devel/swh-model/persistent-identifiers.html

    Args:
        swhid: the candidate identifier, as a string.

    Returns:
        A dict with the keys ``version``, ``type`` and ``hash`` (all strings)
        when the core matches ``swh:<version>:<type>:<40 hex digits>``,
        otherwise ``None``.
    """
    swhid_regexp = (
        r"swh:(?P<version>\d+):(?P<type>ori|cnt|rev|dir|snp|rel):(?P<hash>[0-9a-f]{40})"
    )
    identifier_core = swhid.split(";")[0]
    # fullmatch (rather than match with ^...$) so that an identifier followed
    # by a trailing newline is rejected instead of silently accepted.
    m = re.fullmatch(swhid_regexp, identifier_core)
    if m:
        return m.groupdict()
    return None


class Swhid(ContentProvider):
    """Provide contents of a repository identified by a SWHID.

    Only version 1 identifiers of type ``dir`` or ``rev`` are detected. A
    revision is first resolved to its directory; the directory is then
    requested from the vault endpoint of the Software Heritage API,
    downloaded as a tar archive and unpacked into the output directory.
    """

    # Seconds to wait after a connection error and between vault status polls.
    retry_delay = 5

    def __init__(self):
        super().__init__()
        # Set by fetch() to the SWHID of the directory that was retrieved.
        self.swhid = None
        self.base_url = "https://archive.softwareheritage.org/api/1"
        self.session = requests.Session()
        self.session.headers.update(
            {
                "user-agent": "repo2docker {}".format(__version__),
            }
        )

    def set_auth_token(self, token):
        """Send *token* as a bearer token with every subsequent request."""
        header = {"Authorization": "Bearer {}".format(token)}
        self.session.headers.update(header)

    def _request(self, url, method="GET"):
        """Issue a request, retrying up to three times.

        A trailing slash is appended to *url* when missing. The loop stops at
        the first response whose ``ok`` flag is set; otherwise the last
        response received is returned and the caller must check it.

        Raises:
            requests.ConnectionError: if no attempt produced a response.
        """
        if not url.endswith("/"):
            url = url + "/"

        attempts = 3
        resp = None
        last_error = None
        for attempt in range(attempts):
            try:
                resp = self.session.request(method, url)
            except requests.ConnectionError as exc:
                last_error = exc
                # no point in sleeping after the final attempt
                if attempt < attempts - 1:
                    time.sleep(self.retry_delay)
                continue
            if resp.ok:
                break

        if resp is None:
            # Every attempt failed to connect: surface the real error instead
            # of failing on an unbound local variable.
            raise last_error
        return resp

    @staticmethod
    def _ensure_ok(resp, action):
        """Raise ContentProviderException if *resp* is not a success response.

        Request headers are deliberately left out of the message: they may
        contain the bearer token set through set_auth_token().
        """
        if not resp.ok:
            raise ContentProviderException(
                "Software Heritage API error while {}: HTTP {} {!r}".format(
                    action, resp.status_code, resp.content
                )
            )

    @property
    def content_id(self):
        """The SWHID record ID used for content retrieval"""
        return self.swhid

    def detect(self, swhid, ref=None, extra_args=None):
        """Return a fetch spec if *swhid* is a supported SWHID, else None.

        Supported means version ``1`` and type ``dir`` or ``rev``. *ref* and
        *extra_args* are accepted but not used.
        """
        swhid_dict = parse_swhid(swhid)

        if (
            swhid_dict
            and swhid_dict["type"] in ("dir", "rev")
            and swhid_dict["version"] == "1"
        ):
            return {"swhid": swhid, "swhid_obj": swhid_dict}
        return None

    def fetch_directory(self, dir_hash, output_dir):
        """Download directory *dir_hash* from the vault into *output_dir*.

        This is a generator yielding progress messages. It asks the vault to
        prepare the directory, polls every ``retry_delay`` seconds until the
        status is ``done`` or ``failed``, then downloads and unpacks the
        archive.

        Raises:
            ContentProviderException: if the API answers with an error
                response or the vault reports the status ``failed``.
        """
        url = "{}/vault/directory/{}/".format(self.base_url, dir_hash)
        yield "Fetching directory {} from {}\n".format(dir_hash, url)
        resp = self._request(url, "POST")
        self._ensure_ok(resp, "requesting directory {}".format(dir_hash))
        receipt = resp.json()
        status = receipt["status"]
        # NOTE(review): there is no upper bound on how long we poll.
        while status not in ("failed", "done"):
            time.sleep(self.retry_delay)
            resp = self._request(url)
            self._ensure_ok(resp, "polling directory {}".format(dir_hash))
            receipt = resp.json()
            status = receipt["status"]
        if status == "failed":
            yield "Error preparing the directory for download"
            raise ContentProviderException(
                "Software Heritage vault failed to prepare directory {}: {}".format(
                    dir_hash, receipt
                )
            )
        resp = self._request(receipt["fetch_url"])
        self._ensure_ok(resp, "downloading directory {}".format(dir_hash))
        # NOTE(review): the archive comes from a remote service and is
        # extracted without checking member paths.
        with tarfile.open(fileobj=io.BytesIO(resp.content)) as archive:
            archive.extractall(path=output_dir)
        # the output_dir should have only one subdir named after the dir_hash
        # move its content one level up
        extracted_dir = path.join(output_dir, dir_hash)
        copytree(extracted_dir, output_dir)
        shutil.rmtree(extracted_dir)
        yield "Fetched files: {}\n".format(os.listdir(output_dir))

    def fetch(self, spec, output_dir, yield_output=False):
        """Fetch the content described by *spec* into *output_dir*.

        *spec* is the dict returned by detect(). For a ``rev`` identifier the
        revision is resolved to its directory first. ``self.swhid`` is set to
        the SWHID of the directory being fetched. This is a generator
        yielding progress messages; *yield_output* is accepted but not used.

        Raises:
            ContentProviderException: if the API answers with an error
                response or the vault fails to prepare the directory.
        """
        swhid = spec["swhid"]
        swhid_obj = spec["swhid_obj"]

        if swhid_obj["type"] == "rev":
            # need to get the directory for this revision
            sha1git = swhid_obj["hash"]
            url = "{}/revision/{}/".format(self.base_url, sha1git)
            yield "Fetching revision {} from {}\n".format(sha1git, url)
            resp = self._request(url)
            self._ensure_ok(resp, "resolving revision {}".format(sha1git))
            directory = resp.json()["directory"]
            self.swhid = "swh:1:dir:{}".format(directory)
            yield from self.fetch_directory(directory, output_dir)
        elif swhid_obj["type"] == "dir":
            self.swhid = swhid
            yield from self.fetch_directory(swhid_obj["hash"], output_dir)
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ def get_identifier(json):
"ruamel.yaml>=0.15",
"toml",
"semver",
"requests",
],
python_requires=">=3.6",
author="Project Jupyter Contributors",
Expand Down
157 changes: 157 additions & 0 deletions tests/unit/contentproviders/test_swhid.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,157 @@
import json
import os
import io
import tarfile
import shutil
import re
import urllib
import pytest
import tempfile
import logging
import requests_mock

from os import makedirs
from os.path import join
from unittest.mock import patch, MagicMock, mock_open
from zipfile import ZipFile

from repo2docker.contentproviders.swhid import Swhid, parse_swhid
from repo2docker.contentproviders.base import ContentProviderException


# This is a slightly stripped-down copy of swh.model.cli.swhid_of_dir().
# We do not use the latter directly, to avoid depending on swh.model[cli].
def swhid_of_dir(path):
    """Return the SWHID of the directory found on disk at *path*.

    NOTE(review): ``Directory``, ``DIRECTORY`` and ``swhid`` are not imported
    in the visible part of this module (presumably they come from
    ``swh.model``), so calling this helper as-is would raise ``NameError``.
    It looks like it is kept for reference only; confirm before relying on it.
    """
    # named dir_data rather than "object" so the builtin is not shadowed
    dir_data = Directory.from_disk(path=path).get_data()
    return swhid(DIRECTORY, dir_data)


def test_content_id():
    """A provider that has not fetched anything yet has no content id."""
    provider = Swhid()
    assert provider.content_id is None


# Identifiers the provider must accept: version 1, type dir or rev, and a
# 40-character hexadecimal hash.
swhids_ok = ["swh:1:{}:{}".format(kind, "0" * 40) for kind in ("dir", "rev")]

# Identifiers the provider must reject: wrong hash length, unsupported
# version, or an object type other than dir/rev.
swhids_invalid = [
    "swh:{}:{}:{}".format(version, kind, "0" * length)
    for version, kind, length in (
        (1, "dir", 39),
        (2, "dir", 40),
        (1, "rev", 41),
        (1, "cnt", 40),
        (1, "ori", 40),
        (1, "rel", 40),
        (1, "snp", 40),
    )
]

# (input, expected detect() result) pairs: a spec dict for every valid
# identifier, followed by None for every invalid one.
detect_values = [
    *(
        (valid, {"swhid": valid, "swhid_obj": parse_swhid(valid)})
        for valid in swhids_ok
    ),
    *((invalid, None) for invalid in swhids_invalid),
]


@pytest.mark.parametrize("swhid, expected", detect_values)
def test_detect(swhid, expected):
    """detect() returns the expected spec, or None for unsupported input."""
    detected = Swhid().detect(swhid)
    assert detected == expected


def fake_urlopen(req):
    """Print the request and hand back its ``headers`` attribute."""
    print(req)
    headers = req.headers
    return headers


def test_unresolving_swhid():
    """A bare hash that is not a SWHID must not be picked up by the provider."""
    provider = Swhid()

    # The string lacks the "swh:<version>:<type>:" prefix, so detect() has
    # nothing to return for it.
    assert provider.detect("0" * 40) is None


# All-zero 40-character id, used as the revision hash in the mocked API and in
# the revision SWHID fetched by the tests.
NULLID = "0" * 40


@pytest.fixture
def gen_tarfile(tmpdir):
    """Build an in-memory tar archive containing a single small file.

    A scratch directory holding ``file1.txt`` is created under *tmpdir*,
    archived under a top-level entry named after the directory hash, and
    removed again.

    Returns:
        A ``(dirhash, tar_bytes)`` tuple.
    """
    rootdir = join(tmpdir, "tmp")
    makedirs(rootdir)
    with open(join(rootdir, "file1.txt"), "wb") as fobj:
        fobj.write(b"Some content\n")

    # This directory hash can be computed using the swh.model package, but we
    # do not want to depend on the latter, to limit dependencies and because
    # it does not support python 3.6.
    dirhash = "89a3bd29a2c5ae0b1465febbe5df09730a8576fe"
    buf = io.BytesIO()
    # the context manager closes the archive even if adding the directory fails
    with tarfile.open(name=dirhash, fileobj=buf, mode="w") as tarf:
        tarf.add(rootdir, arcname=dirhash)
    shutil.rmtree(rootdir)
    return dirhash, buf.getvalue()


def mocked_provider(tmpdir, dirhash, tarfile_buf):
    """Return a Swhid provider wired to a mocked Software Heritage API.

    The mock serves the NULLID revision (pointing at *dirhash*), a vault
    request answering "new", two status polls answering "pending" then
    "done", and the raw archive *tarfile_buf*. *tmpdir* is accepted but not
    used.
    """
    vault_url = "mock://api/1/vault/directory/{}/".format(dirhash)
    raw_url = "mock://api/1/vault/directory/{}/raw/".format(dirhash)

    def vault_reply(status):
        # JSON body of a vault answer carrying the given status
        return {"fetch_url": raw_url, "status": status}

    adapter = requests_mock.Adapter()
    adapter.register_uri(
        "GET",
        "mock://api/1/revision/{}/".format(NULLID),
        json={
            "author": {"fullname": "John Doe <[email protected]>"},
            "directory": dirhash,
        },
    )
    adapter.register_uri("POST", vault_url, json=vault_reply("new"))
    adapter.register_uri(
        "GET",
        vault_url,
        [{"json": vault_reply("pending")}, {"json": vault_reply("done")}],
    )
    adapter.register_uri("GET", raw_url, content=tarfile_buf)

    provider = Swhid()
    provider.base_url = "mock://api/1"
    # keep the polling loop fast in tests
    provider.retry_delay = 0.1
    provider.session.mount("mock://", adapter)
    return provider


def test_fetch_revision(tmpdir, gen_tarfile):
    """Fetching a revision SWHID records the SWHID of its directory."""
    dir_id, tarfile_buf = gen_tarfile
    provider = mocked_provider(tmpdir, dir_id, tarfile_buf)
    spec = provider.detect("swh:1:rev:" + NULLID)
    for message in provider.fetch(spec, tmpdir):
        print(message)
    expected_id = "swh:1:dir:" + dir_id
    assert provider.content_id == expected_id


def test_fetch_directory(tmpdir, gen_tarfile):
    """Fetching a directory SWHID records that same SWHID as content id."""
    dir_id, tarfile_buf = gen_tarfile
    provider = mocked_provider(tmpdir, dir_id, tarfile_buf)
    directory_swhid = "swh:1:dir:" + dir_id
    spec = provider.detect(directory_swhid)
    for message in provider.fetch(spec, tmpdir):
        print(message)
    assert provider.content_id == directory_swhid