Skip to content

Commit

Permalink
Merge pull request #988 from douardda/swhid
Browse files Browse the repository at this point in the history
  • Loading branch information
betatim authored Jan 26, 2021
2 parents ac41c20 + 5f26710 commit 1140dd1
Show file tree
Hide file tree
Showing 6 changed files with 301 additions and 9 deletions.
6 changes: 5 additions & 1 deletion docs/source/usage.rst
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ Using ``repo2docker``
follows :ref:`specification`. repo2docker is called with the URL of a Git repository,
a `DOI <https://en.wikipedia.org/wiki/Digital_object_identifier>`_ from Zenodo or Figshare,
a `Handle <https://en.wikipedia.org/wiki/Handle_System>`_ or DOI from a Dataverse installation,
a `SWHID`_ of a directory or a revision archived in the
`Software Heritage Archive <https://archive.softwareheritage.org>`_,
or a path to a local directory.

It then performs these steps:
Expand All @@ -36,7 +38,8 @@ repo2docker is called with this command::
where ``<source-repository>`` is:

* a URL of a Git repository (``https://github.com/binder-examples/requirements``),
* a Zenodo DOI (``10.5281/zenodo.1211089``), or
* a Zenodo DOI (``10.5281/zenodo.1211089``),
* a SWHID_ (``swh:1:rev:999dd06c7f679a2714dfe5199bdca09522a29649``), or
* a path to a local directory (``a/local/directory``)

of the source repository you want to build.
Expand Down Expand Up @@ -132,3 +135,4 @@ Command line API


.. _Pytudes: https://github.com/norvig/pytudes
.. _SWHID: https://docs.softwareheritage.org/devel/swh-model/persistent-identifiers.html
32 changes: 24 additions & 8 deletions repo2docker/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,7 @@ def _default_log_level(self):
contentproviders.Figshare,
contentproviders.Dataverse,
contentproviders.Hydroshare,
contentproviders.Swhid,
contentproviders.Mercurial,
contentproviders.Git,
],
Expand Down Expand Up @@ -269,6 +270,18 @@ def _user_name_default(self):
allow_none=True,
)

swh_token = Unicode(
None,
help="""
Token to use authenticated SWH API access.
If unset, default to unauthenticated (limited) usage of the Software
Heritage API.
""",
config=True,
allow_none=True,
)

cleanup_checkout = Bool(
False,
help="""
Expand Down Expand Up @@ -395,26 +408,29 @@ def fetch(self, url, ref, checkout_path):
"No matching content provider found for " "{url}.".format(url=url)
)

swh_token = self.config.get("swh_token", self.swh_token)
if swh_token and isinstance(picked_content_provider, contentproviders.Swhid):
picked_content_provider.set_auth_token(swh_token)

for log_line in picked_content_provider.fetch(
spec, checkout_path, yield_output=self.json_logs
):
self.log.info(log_line, extra=dict(phase="fetching"))

if not self.output_image_spec:
self.output_image_spec = (
"r2d" + escapism.escape(self.repo, escape_char="-").lower()
)
image_spec = "r2d" + self.repo
# if we are building from a subdirectory include that in the
# image name so we can tell builds from different sub-directories
# apart.
if self.subdir:
self.output_image_spec += escapism.escape(
self.subdir, escape_char="-"
).lower()
image_spec += self.subdir
if picked_content_provider.content_id is not None:
self.output_image_spec += picked_content_provider.content_id
image_spec += picked_content_provider.content_id
else:
self.output_image_spec += str(int(time.time()))
image_spec += str(int(time.time()))
self.output_image_spec = escapism.escape(
image_spec, escape_char="-"
).lower()

def json_excepthook(self, etype, evalue, traceback):
"""Called on an uncaught exception when using json logging
Expand Down
1 change: 1 addition & 0 deletions repo2docker/contentproviders/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,4 @@
from .dataverse import Dataverse
from .hydroshare import Hydroshare
from .mercurial import Mercurial
from .swhid import Swhid
113 changes: 113 additions & 0 deletions repo2docker/contentproviders/swhid.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
import io
import os
import shutil
import tarfile
import time
import re

from os import path

import requests

from .base import ContentProvider
from ..utils import copytree
from .. import __version__


def parse_swhid(swhid):
    """Split a SWHID core identifier into its named parts.

    Only the <identifier_core> part (anything before a ";" qualifier) is
    examined; see
    https://docs.softwareheritage.org/devel/swh-model/persistent-identifiers.html

    Returns a dict with "version", "type" and "hash" keys, or None when the
    string is not a well-formed core SWHID.
    """
    core = swhid.split(";")[0]
    matched = re.match(
        r"^swh:(?P<version>\d+):(?P<type>ori|cnt|rev|dir|snp|rel):(?P<hash>[0-9a-f]{40})$",
        core,
    )
    return matched.groupdict() if matched else None


class Swhid(ContentProvider):
    """Provide contents of a directory or revision archived in the
    Software Heritage archive and identified by a SWHID."""

    # seconds to wait between retries and between vault-cooking status polls
    retry_delay = 5

    def __init__(self):
        # set by fetch() to the directory SWHID actually retrieved
        self.swhid = None
        self.base_url = "https://archive.softwareheritage.org/api/1"
        self.session = requests.Session()
        self.session.headers.update(
            {
                "user-agent": "repo2docker {}".format(__version__),
            }
        )

    def set_auth_token(self, token):
        """Use `token` as bearer token for authenticated SWH API access."""
        header = {"Authorization": "Bearer {}".format(token)}
        self.session.headers.update(header)

    def _request(self, url, method="GET"):
        """Perform an HTTP request against the SWH API.

        Retries up to 3 times when the connection fails, sleeping
        `retry_delay` seconds between attempts. Returns the last
        `requests.Response` received; re-raises the `ConnectionError` if no
        response was ever obtained.
        """
        if not url.endswith("/"):
            url = url + "/"

        resp = None
        for retries in range(3):
            try:
                resp = self.session.request(method, url)
                if resp.ok:
                    break
            except requests.ConnectionError:
                # Fix: previously, if every attempt raised ConnectionError,
                # `resp` was unbound and the final `return resp` raised
                # UnboundLocalError instead of surfacing the network error.
                if retries == 2 and resp is None:
                    raise
                time.sleep(self.retry_delay)

        return resp

    @property
    def content_id(self):
        """The SWHID record ID used for content retrieval"""
        return self.swhid

    def detect(self, swhid, ref=None, extra_args=None):
        """Return a spec dict when `swhid` is a version-1 "dir" or "rev"
        SWHID; return None otherwise so the next provider is tried.

        `ref` and `extra_args` are accepted for interface compatibility and
        ignored.
        """
        swhid_dict = parse_swhid(swhid)

        if (
            swhid_dict
            and swhid_dict["type"] in ("dir", "rev")
            and swhid_dict["version"] == "1"
        ):
            return {"swhid": swhid, "swhid_obj": swhid_dict}

    def fetch_directory(self, dir_hash, output_dir):
        """Cook directory `dir_hash` with the SWH vault, download the
        resulting tarball and extract its content into `output_dir`.

        Yields progress messages; raises if the vault cooking fails.
        """
        url = "{}/vault/directory/{}/".format(self.base_url, dir_hash)
        yield "Fetching directory {} from {}\n".format(dir_hash, url)
        # POST starts the cooking job (or returns its current status)
        resp = self._request(url, "POST")
        receipt = resp.json()
        status = receipt["status"]
        assert status != "failed", receipt
        # poll until the vault reports a terminal status
        while status not in ("failed", "done"):
            time.sleep(self.retry_delay)
            resp = self._request(url)
            status = resp.json()["status"]
        if status == "failed":
            yield "Error preparing the directory for download"
            raise Exception()
        resp = self._request(resp.json()["fetch_url"])
        # NOTE(review): extractall() trusts the tarball served by the SWH
        # vault not to contain malicious paths -- confirm this is acceptable
        archive = tarfile.open(fileobj=io.BytesIO(resp.content))
        archive.extractall(path=output_dir)
        # the output_dir should have only one subdir named after the dir_hash
        # move its content one level up
        copytree(path.join(output_dir, dir_hash), output_dir)
        shutil.rmtree(path.join(output_dir, dir_hash))
        yield "Fetched files: {}\n".format(os.listdir(output_dir))

    def fetch(self, spec, output_dir, yield_output=False):
        """Fetch the content identified by the spec returned by detect().

        A "rev" SWHID is first resolved to its directory SWHID via the SWH
        API, so `self.swhid` (exposed as `content_id`) always ends up being
        the directory SWHID that was actually fetched.
        """
        swhid = spec["swhid"]
        swhid_obj = spec["swhid_obj"]

        if swhid_obj["type"] == "rev":
            # need to get the directory for this revision
            sha1git = swhid_obj["hash"]
            url = "{}/revision/{}/".format(self.base_url, sha1git)
            yield "Fetching revision {} from {}\n".format(sha1git, url)
            resp = self._request(url)
            assert resp.ok, (resp.content, self.session.headers)
            directory = resp.json()["directory"]
            self.swhid = "swh:1:dir:{}".format(directory)
            yield from self.fetch_directory(directory, output_dir)
        elif swhid_obj["type"] == "dir":
            self.swhid = swhid
            yield from self.fetch_directory(swhid_obj["hash"], output_dir)
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ def get_identifier(json):
"ruamel.yaml>=0.15",
"toml",
"semver",
"requests",
],
python_requires=">=3.6",
author="Project Jupyter Contributors",
Expand Down
157 changes: 157 additions & 0 deletions tests/unit/contentproviders/test_swhid.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,157 @@
import json
import os
import io
import tarfile
import shutil
import re
import urllib
import pytest
import tempfile
import logging
import requests_mock

from os import makedirs
from os.path import join
from unittest.mock import patch, MagicMock, mock_open
from zipfile import ZipFile

from repo2docker.contentproviders.swhid import Swhid, parse_swhid
from repo2docker.contentproviders.base import ContentProviderException


# this is a slightly stripped down copy of swh.model.cli.swhid_of_dir().
# We do not use this later to prevent having to depend on swh.model[cli]
# NOTE(review): this helper is never called in this module, and the names
# Directory, swhid and DIRECTORY are not imported here, so calling it would
# raise NameError -- it is kept for reference only.
def swhid_of_dir(path):
    object = Directory.from_disk(path=path).get_data()
    return swhid(DIRECTORY, object)


def test_content_id():
    """A freshly constructed provider has no content id yet."""
    provider = Swhid()
    assert provider.content_id is None


# well-formed v1 SWHIDs of the object types the provider supports
swhids_ok = [
    "swh:1:dir:" + 40 * "0",
    "swh:1:rev:" + 40 * "0",
]
# malformed (wrong hash length, wrong version) or unsupported object types
swhids_invalid = [
    "swh:1:dir:" + 39 * "0",
    "swh:2:dir:" + 40 * "0",
    "swh:1:rev:" + 41 * "0",
    "swh:1:cnt:" + 40 * "0",
    "swh:1:ori:" + 40 * "0",
    "swh:1:rel:" + 40 * "0",
]

# (input, expected detect() result) pairs for test_detect below
detect_values = [
    (ok_id, {"swhid": ok_id, "swhid_obj": parse_swhid(ok_id)}) for ok_id in swhids_ok
]
detect_values += [(bad_id, None) for bad_id in swhids_invalid]


@pytest.mark.parametrize("swhid, expected", detect_values)
def test_detect(swhid, expected):
    """detect() returns a spec for supported SWHIDs and None otherwise."""
    assert Swhid().detect(swhid) == expected


def fake_urlopen(req):
    """Stand-in for urlopen: echo the request and hand back its headers."""
    print(req)
    headers = req.headers
    return headers


def test_unresolving_swhid():
    # NOTE(review): this test is currently a stub -- it only constructs the
    # provider; the intended resolution check below is commented out and
    # references a swhid2url() method that does not exist on Swhid.
    provider = Swhid()

    # swhid = "0" * 40
    # assert provider.swhid2url(swhid) is swhid


# all-zero sha1 used as a placeholder revision id in the mocked API
NULLID = 40 * "0"


@pytest.fixture
def gen_tarfile(tmpdir):
    """Build an in-memory tarball holding a single text file.

    Returns (dirhash, tarball_bytes), laid out the way the SWH vault serves
    archives: the content sits under a top-level directory named after the
    directory hash.
    """
    # this directory hash can be computed using the swh.model package, but we
    # do not want to depend on it here to limit dependencies and because it
    # does not support python 3.6
    dirhash = "89a3bd29a2c5ae0b1465febbe5df09730a8576fe"

    rootdir = join(tmpdir, "tmp")
    makedirs(rootdir)
    with open(join(rootdir, "file1.txt"), "wb") as fobj:
        fobj.write(b"Some content\n")

    buf = io.BytesIO()
    with tarfile.open(name=dirhash, fileobj=buf, mode="w") as tarball:
        tarball.add(rootdir, arcname=dirhash)
    shutil.rmtree(rootdir)
    return dirhash, buf.getvalue()


def mocked_provider(tmpdir, dirhash, tarfile_buf):
    """Return a Swhid provider whose HTTP session is served by requests_mock.

    The mocked API resolves the NULLID revision to `dirhash` and replays the
    vault cooking workflow for that directory: POST creates the cooking job,
    the first status GET reports it pending, the second reports it done, and
    the raw endpoint returns `tarfile_buf`.
    """
    provider = Swhid()
    provider.base_url = "mock://api/1"
    provider.retry_delay = 0.1

    adapter = requests_mock.Adapter()
    provider.session.mount("mock://", adapter)

    fetch_url = "mock://api/1/vault/directory/{}/raw/".format(dirhash)

    # revision lookup: maps the null revision to our directory hash
    adapter.register_uri(
        "GET",
        "mock://api/1/revision/{}/".format(NULLID),
        json={
            "author": {"fullname": "John Doe <[email protected]>"},
            "directory": dirhash,
        },
    )
    # vault cooking request
    adapter.register_uri(
        "POST",
        "mock://api/1/vault/directory/{}/".format(dirhash),
        json={"fetch_url": fetch_url, "status": "new"},
    )
    # cooking status poll: pending on the first call, done on the second
    adapter.register_uri(
        "GET",
        "mock://api/1/vault/directory/{}/".format(dirhash),
        [
            {"json": {"fetch_url": fetch_url, "status": "pending"}},
            {"json": {"fetch_url": fetch_url, "status": "done"}},
        ],
    )
    # the cooked tarball itself
    adapter.register_uri("GET", fetch_url, content=tarfile_buf)
    return provider


def test_fetch_revision(tmpdir, gen_tarfile):
    """Fetching a revision SWHID records its resolved directory SWHID."""
    dir_id, tarfile_buf = gen_tarfile
    provider = mocked_provider(tmpdir, dir_id, tarfile_buf)
    for message in provider.fetch(provider.detect("swh:1:rev:" + NULLID), tmpdir):
        print(message)
    assert provider.content_id == "swh:1:dir:" + dir_id


def test_fetch_directory(tmpdir, gen_tarfile):
    """Fetching a directory SWHID records that same SWHID as content id."""
    dir_id, tarfile_buf = gen_tarfile
    provider = mocked_provider(tmpdir, dir_id, tarfile_buf)
    dir_swhid = "swh:1:dir:" + dir_id
    for message in provider.fetch(provider.detect(dir_swhid), tmpdir):
        print(message)
    assert provider.content_id == dir_swhid

0 comments on commit 1140dd1

Please sign in to comment.