Skip to content

Commit

Permalink
Merge pull request #988 from douardda/swhid
Browse files Browse the repository at this point in the history
  • Loading branch information
betatim authored Jan 26, 2021
2 parents ac41c20 + 5f26710 commit 1140dd1
Show file tree
Hide file tree
Showing 6 changed files with 301 additions and 9 deletions.
6 changes: 5 additions & 1 deletion docs/source/usage.rst
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ Using ``repo2docker``
follows :ref:`specification`. repo2docker is called with the URL of a Git repository,
a `DOI <https://en.wikipedia.org/wiki/Digital_object_identifier>`_ from Zenodo or Figshare,
a `Handle <https://en.wikipedia.org/wiki/Handle_System>`_ or DOI from a Dataverse installation,
a `SWHID`_ of a directory or a revision archived in the
`Software Heritage Archive <https://archive.softwareheritage.org>`_,
or a path to a local directory.

It then performs these steps:
Expand All @@ -36,7 +38,8 @@ repo2docker is called with this command::
where ``<source-repository>`` is:

* a URL of a Git repository (``https://github.com/binder-examples/requirements``),
* a Zenodo DOI (``10.5281/zenodo.1211089``), or
* a Zenodo DOI (``10.5281/zenodo.1211089``),
* a SWHID_ (``swh:1:rev:999dd06c7f679a2714dfe5199bdca09522a29649``), or
* a path to a local directory (``a/local/directory``)

of the source repository you want to build.
Expand Down Expand Up @@ -132,3 +135,4 @@ Command line API


.. _Pytudes: https://github.com/norvig/pytudes
.. _SWHID: https://docs.softwareheritage.org/devel/swh-model/persistent-identifiers.html
32 changes: 24 additions & 8 deletions repo2docker/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,7 @@ def _default_log_level(self):
contentproviders.Figshare,
contentproviders.Dataverse,
contentproviders.Hydroshare,
contentproviders.Swhid,
contentproviders.Mercurial,
contentproviders.Git,
],
Expand Down Expand Up @@ -269,6 +270,18 @@ def _user_name_default(self):
allow_none=True,
)

swh_token = Unicode(
None,
help="""
Token to use authenticated SWH API access.
If unset, default to unauthenticated (limited) usage of the Software
Heritage API.
""",
config=True,
allow_none=True,
)

cleanup_checkout = Bool(
False,
help="""
Expand Down Expand Up @@ -395,26 +408,29 @@ def fetch(self, url, ref, checkout_path):
"No matching content provider found for " "{url}.".format(url=url)
)

swh_token = self.config.get("swh_token", self.swh_token)
if swh_token and isinstance(picked_content_provider, contentproviders.Swhid):
picked_content_provider.set_auth_token(swh_token)

for log_line in picked_content_provider.fetch(
spec, checkout_path, yield_output=self.json_logs
):
self.log.info(log_line, extra=dict(phase="fetching"))

if not self.output_image_spec:
self.output_image_spec = (
"r2d" + escapism.escape(self.repo, escape_char="-").lower()
)
image_spec = "r2d" + self.repo
# if we are building from a subdirectory include that in the
# image name so we can tell builds from different sub-directories
# apart.
if self.subdir:
self.output_image_spec += escapism.escape(
self.subdir, escape_char="-"
).lower()
image_spec += self.subdir
if picked_content_provider.content_id is not None:
self.output_image_spec += picked_content_provider.content_id
image_spec += picked_content_provider.content_id
else:
self.output_image_spec += str(int(time.time()))
image_spec += str(int(time.time()))
self.output_image_spec = escapism.escape(
image_spec, escape_char="-"
).lower()

def json_excepthook(self, etype, evalue, traceback):
"""Called on an uncaught exception when using json logging
Expand Down
1 change: 1 addition & 0 deletions repo2docker/contentproviders/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,4 @@
from .dataverse import Dataverse
from .hydroshare import Hydroshare
from .mercurial import Mercurial
from .swhid import Swhid
113 changes: 113 additions & 0 deletions repo2docker/contentproviders/swhid.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
import io
import os
import shutil
import tarfile
import time
import re

from os import path

import requests

from .base import ContentProvider
from ..utils import copytree
from .. import __version__


def parse_swhid(swhid):
    """Split a SWHID core identifier into its named parts.

    Only the <identifier_core> part (anything before a ";" qualifier) is
    examined; see
    https://docs.softwareheritage.org/devel/swh-model/persistent-identifiers.html

    Returns a dict with "version", "type" and "hash" keys, or None when the
    string is not a well-formed core SWHID.
    """
    core = swhid.split(";")[0]
    matched = re.match(
        r"^swh:(?P<version>\d+):(?P<type>ori|cnt|rev|dir|snp|rel):(?P<hash>[0-9a-f]{40})$",
        core,
    )
    return matched.groupdict() if matched else None


class Swhid(ContentProvider):
    """Provide contents of a directory or revision archived in the
    Software Heritage archive and identified by a SWHID."""

    # seconds to wait between retries and between vault-cooking status polls
    retry_delay = 5

    def __init__(self):
        # set by fetch() to the directory SWHID actually retrieved
        self.swhid = None
        self.base_url = "https://archive.softwareheritage.org/api/1"
        self.session = requests.Session()
        self.session.headers.update(
            {
                "user-agent": "repo2docker {}".format(__version__),
            }
        )

    def set_auth_token(self, token):
        """Use `token` as bearer token for authenticated SWH API access."""
        header = {"Authorization": "Bearer {}".format(token)}
        self.session.headers.update(header)

    def _request(self, url, method="GET"):
        """Perform an HTTP request against the SWH API.

        Retries up to 3 times when the connection fails, sleeping
        `retry_delay` seconds between attempts. Returns the last
        `requests.Response` received; re-raises the `ConnectionError` if no
        response was ever obtained.
        """
        if not url.endswith("/"):
            url = url + "/"

        resp = None
        for retries in range(3):
            try:
                resp = self.session.request(method, url)
                if resp.ok:
                    break
            except requests.ConnectionError:
                # Fix: previously, if every attempt raised ConnectionError,
                # `resp` was unbound and the final `return resp` raised
                # UnboundLocalError instead of surfacing the network error.
                if retries == 2 and resp is None:
                    raise
                time.sleep(self.retry_delay)

        return resp

    @property
    def content_id(self):
        """The SWHID record ID used for content retrieval"""
        return self.swhid

    def detect(self, swhid, ref=None, extra_args=None):
        """Return a spec dict when `swhid` is a version-1 "dir" or "rev"
        SWHID; return None otherwise so the next provider is tried.

        `ref` and `extra_args` are accepted for interface compatibility and
        ignored.
        """
        swhid_dict = parse_swhid(swhid)

        if (
            swhid_dict
            and swhid_dict["type"] in ("dir", "rev")
            and swhid_dict["version"] == "1"
        ):
            return {"swhid": swhid, "swhid_obj": swhid_dict}

    def fetch_directory(self, dir_hash, output_dir):
        """Cook directory `dir_hash` with the SWH vault, download the
        resulting tarball and extract its content into `output_dir`.

        Yields progress messages; raises if the vault cooking fails.
        """
        url = "{}/vault/directory/{}/".format(self.base_url, dir_hash)
        yield "Fetching directory {} from {}\n".format(dir_hash, url)
        # POST starts the cooking job (or returns its current status)
        resp = self._request(url, "POST")
        receipt = resp.json()
        status = receipt["status"]
        assert status != "failed", receipt
        # poll until the vault reports a terminal status
        while status not in ("failed", "done"):
            time.sleep(self.retry_delay)
            resp = self._request(url)
            status = resp.json()["status"]
        if status == "failed":
            yield "Error preparing the directory for download"
            raise Exception()
        resp = self._request(resp.json()["fetch_url"])
        # NOTE(review): extractall() trusts the tarball served by the SWH
        # vault not to contain malicious paths -- confirm this is acceptable
        archive = tarfile.open(fileobj=io.BytesIO(resp.content))
        archive.extractall(path=output_dir)
        # the output_dir should have only one subdir named after the dir_hash
        # move its content one level up
        copytree(path.join(output_dir, dir_hash), output_dir)
        shutil.rmtree(path.join(output_dir, dir_hash))
        yield "Fetched files: {}\n".format(os.listdir(output_dir))

    def fetch(self, spec, output_dir, yield_output=False):
        """Fetch the content identified by the spec returned by detect().

        A "rev" SWHID is first resolved to its directory SWHID via the SWH
        API, so `self.swhid` (exposed as `content_id`) always ends up being
        the directory SWHID that was actually fetched.
        """
        swhid = spec["swhid"]
        swhid_obj = spec["swhid_obj"]

        if swhid_obj["type"] == "rev":
            # need to get the directory for this revision
            sha1git = swhid_obj["hash"]
            url = "{}/revision/{}/".format(self.base_url, sha1git)
            yield "Fetching revision {} from {}\n".format(sha1git, url)
            resp = self._request(url)
            assert resp.ok, (resp.content, self.session.headers)
            directory = resp.json()["directory"]
            self.swhid = "swh:1:dir:{}".format(directory)
            yield from self.fetch_directory(directory, output_dir)
        elif swhid_obj["type"] == "dir":
            self.swhid = swhid
            yield from self.fetch_directory(swhid_obj["hash"], output_dir)
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ def get_identifier(json):
"ruamel.yaml>=0.15",
"toml",
"semver",
"requests",
],
python_requires=">=3.6",
author="Project Jupyter Contributors",
Expand Down
157 changes: 157 additions & 0 deletions tests/unit/contentproviders/test_swhid.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,157 @@
import json
import os
import io
import tarfile
import shutil
import re
import urllib
import pytest
import tempfile
import logging
import requests_mock

from os import makedirs
from os.path import join
from unittest.mock import patch, MagicMock, mock_open
from zipfile import ZipFile

from repo2docker.contentproviders.swhid import Swhid, parse_swhid
from repo2docker.contentproviders.base import ContentProviderException


# this is a slightly stripped down copy of swh.model.cli.swhid_of_dir().
# We do not use this later to prevent having to depend on swh.model[cli]
# NOTE(review): this helper is never called in this module, and the names
# Directory, swhid and DIRECTORY are not imported here, so calling it would
# raise NameError -- it is kept for reference only.
def swhid_of_dir(path):
    object = Directory.from_disk(path=path).get_data()
    return swhid(DIRECTORY, object)


def test_content_id():
    """A freshly constructed provider has no content id yet."""
    provider = Swhid()
    assert provider.content_id is None


# well-formed v1 SWHIDs of the object types the provider supports
swhids_ok = [
    "swh:1:dir:" + 40 * "0",
    "swh:1:rev:" + 40 * "0",
]
# malformed (wrong hash length, wrong version) or unsupported object types
swhids_invalid = [
    "swh:1:dir:" + 39 * "0",
    "swh:2:dir:" + 40 * "0",
    "swh:1:rev:" + 41 * "0",
    "swh:1:cnt:" + 40 * "0",
    "swh:1:ori:" + 40 * "0",
    "swh:1:rel:" + 40 * "0",
]

# (input, expected detect() result) pairs for test_detect below
detect_values = [
    (ok_id, {"swhid": ok_id, "swhid_obj": parse_swhid(ok_id)}) for ok_id in swhids_ok
]
detect_values += [(bad_id, None) for bad_id in swhids_invalid]


@pytest.mark.parametrize("swhid, expected", detect_values)
def test_detect(swhid, expected):
    """detect() returns a spec for supported SWHIDs and None otherwise."""
    assert Swhid().detect(swhid) == expected


def fake_urlopen(req):
    """Stand-in for urlopen: echo the request and hand back its headers."""
    print(req)
    headers = req.headers
    return headers


def test_unresolving_swhid():
    # NOTE(review): this test is currently a stub -- it only constructs the
    # provider; the intended resolution check below is commented out and
    # references a swhid2url() method that does not exist on Swhid.
    provider = Swhid()

    # swhid = "0" * 40
    # assert provider.swhid2url(swhid) is swhid


# all-zero sha1 used as a placeholder revision id in the mocked API
NULLID = 40 * "0"


@pytest.fixture
def gen_tarfile(tmpdir):
    """Build an in-memory tarball holding a single text file.

    Returns (dirhash, tarball_bytes), laid out the way the SWH vault serves
    archives: the content sits under a top-level directory named after the
    directory hash.
    """
    # this directory hash can be computed using the swh.model package, but we
    # do not want to depend on it here to limit dependencies and because it
    # does not support python 3.6
    dirhash = "89a3bd29a2c5ae0b1465febbe5df09730a8576fe"

    rootdir = join(tmpdir, "tmp")
    makedirs(rootdir)
    with open(join(rootdir, "file1.txt"), "wb") as fobj:
        fobj.write(b"Some content\n")

    buf = io.BytesIO()
    with tarfile.open(name=dirhash, fileobj=buf, mode="w") as tarball:
        tarball.add(rootdir, arcname=dirhash)
    shutil.rmtree(rootdir)
    return dirhash, buf.getvalue()


def mocked_provider(tmpdir, dirhash, tarfile_buf):
    """Return a Swhid provider whose HTTP session is served by requests_mock.

    The mocked API resolves the NULLID revision to `dirhash` and replays the
    vault cooking workflow for that directory: POST creates the cooking job,
    the first status GET reports it pending, the second reports it done, and
    the raw endpoint returns `tarfile_buf`.
    """
    provider = Swhid()
    provider.base_url = "mock://api/1"
    provider.retry_delay = 0.1

    adapter = requests_mock.Adapter()
    provider.session.mount("mock://", adapter)

    fetch_url = "mock://api/1/vault/directory/{}/raw/".format(dirhash)

    # revision lookup: maps the null revision to our directory hash
    adapter.register_uri(
        "GET",
        "mock://api/1/revision/{}/".format(NULLID),
        json={
            "author": {"fullname": "John Doe <[email protected]>"},
            "directory": dirhash,
        },
    )
    # vault cooking request
    adapter.register_uri(
        "POST",
        "mock://api/1/vault/directory/{}/".format(dirhash),
        json={"fetch_url": fetch_url, "status": "new"},
    )
    # cooking status poll: pending on the first call, done on the second
    adapter.register_uri(
        "GET",
        "mock://api/1/vault/directory/{}/".format(dirhash),
        [
            {"json": {"fetch_url": fetch_url, "status": "pending"}},
            {"json": {"fetch_url": fetch_url, "status": "done"}},
        ],
    )
    # the cooked tarball itself
    adapter.register_uri("GET", fetch_url, content=tarfile_buf)
    return provider


def test_fetch_revision(tmpdir, gen_tarfile):
    """Fetching a revision SWHID records its resolved directory SWHID."""
    dir_id, tarfile_buf = gen_tarfile
    provider = mocked_provider(tmpdir, dir_id, tarfile_buf)
    for message in provider.fetch(provider.detect("swh:1:rev:" + NULLID), tmpdir):
        print(message)
    assert provider.content_id == "swh:1:dir:" + dir_id


def test_fetch_directory(tmpdir, gen_tarfile):
    """Fetching a directory SWHID records that same SWHID as content id."""
    dir_id, tarfile_buf = gen_tarfile
    provider = mocked_provider(tmpdir, dir_id, tarfile_buf)
    dir_swhid = "swh:1:dir:" + dir_id
    for message in provider.fetch(provider.detect(dir_swhid), tmpdir):
        print(message)
    assert provider.content_id == dir_swhid

0 comments on commit 1140dd1

Please sign in to comment.