-
Notifications
You must be signed in to change notification settings - Fork 366
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #988 from douardda/swhid
- Loading branch information
Showing
6 changed files
with
301 additions
and
9 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,113 @@ | ||
import io | ||
import os | ||
import shutil | ||
import tarfile | ||
import time | ||
import re | ||
|
||
from os import path | ||
|
||
import requests | ||
|
||
from .base import ContentProvider | ||
from ..utils import copytree | ||
from .. import __version__ | ||
|
||
|
||
def parse_swhid(swhid):
    """Split a SWHID core identifier into its version, type and hash parts.

    Only the <identifier_core> part (before any ";" qualifier) is checked; see
    https://docs.softwareheritage.org/devel/swh-model/persistent-identifiers.html

    Returns a dict with keys "version", "type" and "hash", or None when the
    identifier does not look like a valid SWHID.
    """
    core = swhid.split(";")[0]
    pattern = r"^swh:(?P<version>\d+):(?P<type>ori|cnt|rev|dir|snp|rel):(?P<hash>[0-9a-f]{40})$"
    match = re.match(pattern, core)
    return match.groupdict() if match else None
|
||
|
||
class Swhid(ContentProvider):
    """Provide contents of a repository identified by a SWHID.

    Resolves version-1 "dir" and "rev" Software Heritage identifiers and
    downloads the corresponding directory through the SWH vault API.
    """

    # seconds to wait between connection retries and vault status polls;
    # tests lower this to keep polling fast
    retry_delay = 5

    def __init__(self):
        self.swhid = None
        self.base_url = "https://archive.softwareheritage.org/api/1"
        self.session = requests.Session()
        self.session.headers.update(
            {"user-agent": "repo2docker {}".format(__version__)}
        )

    def set_auth_token(self, token):
        """Send *token* as a bearer token on all subsequent API requests."""
        self.session.headers.update({"Authorization": "Bearer {}".format(token)})

    def _request(self, url, method="GET"):
        """Issue *method* on *url* (a trailing "/" is enforced).

        Retries up to 3 times on connection errors, sleeping ``retry_delay``
        seconds between attempts; the last connection error is re-raised if
        every attempt fails.  A non-ok response is retried immediately and
        the final response is returned for the caller to inspect.
        """
        if not url.endswith("/"):
            url = url + "/"

        resp = None
        for attempt in range(3):
            try:
                resp = self.session.request(method, url)
                if resp.ok:
                    break
            except requests.ConnectionError:
                # Fix: previously `resp` could be unbound (UnboundLocalError
                # on return) when every attempt raised; now the last
                # connection error propagates instead.
                if attempt == 2:
                    raise
                time.sleep(self.retry_delay)

        return resp

    @property
    def content_id(self):
        """The SWHID record ID used for content retrieval"""
        return self.swhid

    def detect(self, swhid, ref=None, extra_args=None):
        """Return a spec dict when *swhid* is a supported identifier.

        Only version-1 "dir" and "rev" SWHIDs are supported; anything else
        returns None so other content providers can be tried.
        """
        swhid_dict = parse_swhid(swhid)

        if (
            swhid_dict
            and swhid_dict["type"] in ("dir", "rev")
            and swhid_dict["version"] == "1"
        ):
            return {"swhid": swhid, "swhid_obj": swhid_dict}

    def fetch_directory(self, dir_hash, output_dir):
        """Download directory *dir_hash* from the SWH vault into *output_dir*.

        Generator yielding progress messages; raises Exception when the
        vault reports the cooking as failed.
        """
        url = "{}/vault/directory/{}/".format(self.base_url, dir_hash)
        yield "Fetching directory {} from {}\n".format(dir_hash, url)
        resp = self._request(url, "POST")
        # Poll the cooking status until the archive is ready or failed.  An
        # immediately "failed" receipt skips the loop and is handled by the
        # check below (this replaces a former `assert`, which would have been
        # silently stripped when running under ``python -O``).
        status = resp.json()["status"]
        while status not in ("failed", "done"):
            time.sleep(self.retry_delay)
            resp = self._request(url)
            status = resp.json()["status"]
        if status == "failed":
            yield "Error preparing the directory for download"
            raise Exception("Error preparing the directory for download")
        resp = self._request(resp.json()["fetch_url"])
        archive = tarfile.open(fileobj=io.BytesIO(resp.content))
        # NOTE(review): extractall on a server-provided tarball is exposed to
        # path traversal for untrusted archives; the SWH archive is trusted
        # here, but worth confirming.
        archive.extractall(path=output_dir)
        # the output_dir should have only one subdir named after the dir_hash
        # move its content one level up
        copytree(path.join(output_dir, dir_hash), output_dir)
        shutil.rmtree(path.join(output_dir, dir_hash))
        yield "Fetched files: {}\n".format(os.listdir(output_dir))

    def fetch(self, spec, output_dir, yield_output=False):
        """Fetch and unpack the content described by *spec* into *output_dir*.

        For a "rev" SWHID the revision is first resolved to its root
        directory; ``content_id`` is always left as the resolved "dir" SWHID.
        """
        swhid = spec["swhid"]
        swhid_obj = spec["swhid_obj"]

        if swhid_obj["type"] == "rev":
            # need to get the directory for this revision
            sha1git = swhid_obj["hash"]
            url = "{}/revision/{}/".format(self.base_url, sha1git)
            yield "Fetching revision {} from {}\n".format(sha1git, url)
            resp = self._request(url)
            # Explicit check instead of a bare `assert` (stripped under -O).
            if not resp.ok:
                raise Exception(
                    "Failed to fetch revision {}: {}".format(sha1git, resp.content)
                )
            directory = resp.json()["directory"]
            self.swhid = "swh:1:dir:{}".format(directory)
            yield from self.fetch_directory(directory, output_dir)
        elif swhid_obj["type"] == "dir":
            self.swhid = swhid
            yield from self.fetch_directory(swhid_obj["hash"], output_dir)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,157 @@ | ||
import json | ||
import os | ||
import io | ||
import tarfile | ||
import shutil | ||
import re | ||
import urllib | ||
import pytest | ||
import tempfile | ||
import logging | ||
import requests_mock | ||
|
||
from os import makedirs | ||
from os.path import join | ||
from unittest.mock import patch, MagicMock, mock_open | ||
from zipfile import ZipFile | ||
|
||
from repo2docker.contentproviders.swhid import Swhid, parse_swhid | ||
from repo2docker.contentproviders.base import ContentProviderException | ||
|
||
|
||
# this is a slightly stripped down copy of swh.model.cli.swhid_of_dir().
# We do not use this later to prevent having to depend on swh.model[cli]
# NOTE(review): `Directory`, `swhid` and `DIRECTORY` are never imported in
# this module, so calling this helper would raise NameError; it appears to
# be kept only as documentation of how the fixture's dirhash was computed.
def swhid_of_dir(path):
    # Build the swh.model representation of the on-disk tree, then hash it.
    object = Directory.from_disk(path=path).get_data()
    return swhid(DIRECTORY, object)
|
||
|
||
def test_content_id():
    """A freshly constructed provider has no content id yet."""
    provider = Swhid()
    assert provider.content_id is None
|
||
|
||
# Valid v1 dir/rev identifiers, accepted by Swhid.detect()
swhids_ok = ["swh:1:{}:{}".format(objtype, "0" * 40) for objtype in ("dir", "rev")]
# Identifiers detect() must reject: malformed hashes, wrong version,
# or object types the provider does not handle
swhids_invalid = [
    "swh:1:dir:" + "0" * 39,
    "swh:2:dir:" + "0" * 40,
    "swh:1:rev:" + "0" * 41,
    "swh:1:cnt:" + "0" * 40,
    "swh:1:ori:" + "0" * 40,
    "swh:1:rel:" + "0" * 40,
    "swh:1:snp:" + "0" * 40,
]

# (swhid, expected detect() result) pairs: valid ids map to a spec dict,
# invalid ones to None
detect_values = [
    (swhid, {"swhid": swhid, "swhid_obj": parse_swhid(swhid)}) for swhid in swhids_ok
] + [(swhid, None) for swhid in swhids_invalid]
|
||
|
||
@pytest.mark.parametrize("swhid, expected", detect_values)
def test_detect(swhid, expected):
    """detect() accepts v1 dir/rev SWHIDs and rejects everything else."""
    assert Swhid().detect(swhid) == expected
|
||
|
||
def fake_urlopen(req):
    """Stand-in for urlopen: log the request and hand back its headers."""
    print(req)
    return req.headers
|
||
|
||
def test_unresolving_swhid():
    """Smoke test: constructing the provider must not raise."""
    provider = Swhid()  # noqa

    # swhid = "0" * 40
    # assert provider.swhid2url(swhid) is swhid
|
||
|
||
NULLID = "0" * 40 | ||
|
||
|
||
@pytest.fixture
def gen_tarfile(tmpdir):
    """Build an in-memory tarball of a one-file directory.

    Returns a ``(dirhash, tar_bytes)`` pair where *dirhash* is the SWH
    directory hash of the archived tree and *tar_bytes* the tarball content.
    """
    # this directory hash can be computed using the swh.model package, but we
    # do not want to depend on it here, to limit dependencies and because it
    # does not support python 3.6
    dirhash = "89a3bd29a2c5ae0b1465febbe5df09730a8576fe"

    rootdir = join(tmpdir, "tmp")
    makedirs(rootdir)
    with open(join(rootdir, "file1.txt"), "wb") as fobj:
        fobj.write(b"Some content\n")

    buf = io.BytesIO()
    with tarfile.open(name=dirhash, fileobj=buf, mode="w") as tarf:
        tarf.add(rootdir, arcname=dirhash)
    shutil.rmtree(rootdir)
    return dirhash, buf.getvalue()
|
||
|
||
def mocked_provider(tmpdir, dirhash, tarfile_buf):
    """Return a Swhid provider wired to a requests_mock fake of the SWH API."""
    provider = Swhid()
    provider.base_url = "mock://api/1"
    provider.retry_delay = 0.1

    adapter = requests_mock.Adapter()
    provider.session.mount("mock://", adapter)

    vault_url = "mock://api/1/vault/directory/{}/".format(dirhash)
    fetch_url = "mock://api/1/vault/directory/{}/raw/".format(dirhash)

    # the revision lookup resolves to our directory hash
    adapter.register_uri(
        "GET",
        "mock://api/1/revision/{}/".format(NULLID),
        json={
            "author": {"fullname": "John Doe <[email protected]>"},
            "directory": dirhash,
        },
    )
    # POST starts the vault cooking
    adapter.register_uri(
        "POST",
        vault_url,
        json={"fetch_url": fetch_url, "status": "new"},
    )
    # status polling answers "pending" once, then "done"
    adapter.register_uri(
        "GET",
        vault_url,
        [
            {"json": {"fetch_url": fetch_url, "status": "pending"}},
            {"json": {"fetch_url": fetch_url, "status": "done"}},
        ],
    )
    # the cooked tarball itself
    adapter.register_uri("GET", fetch_url, content=tarfile_buf)
    return provider
|
||
|
||
def test_fetch_revision(tmpdir, gen_tarfile):
    """Fetching a rev SWHID resolves it to (and reports) the dir SWHID."""
    dir_id, tarball = gen_tarfile
    provider = mocked_provider(tmpdir, dir_id, tarball)
    for message in provider.fetch(provider.detect("swh:1:rev:" + NULLID), tmpdir):
        print(message)
    assert provider.content_id == "swh:1:dir:" + dir_id
|
||
|
||
def test_fetch_directory(tmpdir, gen_tarfile):
    """Fetching a dir SWHID reports that same SWHID as content_id."""
    dir_id, tarball = gen_tarfile
    provider = mocked_provider(tmpdir, dir_id, tarball)
    swhid = "swh:1:dir:" + dir_id
    for message in provider.fetch(provider.detect(swhid), tmpdir):
        print(message)
    assert provider.content_id == swhid