Skip to content

Commit

Permalink
[MRG] Add Figshare content provider (#788)
Browse files Browse the repository at this point in the history
[MRG] Add Figshare content provider
  • Loading branch information
betatim authored Sep 11, 2019
2 parents 57919b9 + a40e179 commit e33d5f8
Show file tree
Hide file tree
Showing 10 changed files with 439 additions and 156 deletions.
1 change: 0 additions & 1 deletion docs/source/contributing/roadmap.md
Original file line number Diff line number Diff line change
Expand Up @@ -83,4 +83,3 @@ time there is no active plan for an item. The project would like to find the
resources and time to discuss and then execute these ideas.
* support execution on a remote host (with more resources than available locally) via the command-line
* add support for using ZIP files as the repo (`repo2docker https://example.com/an-archive.zip`) this will give us access to several archives (like Zenodo) that expose things as ZIP files.
* add support for Zenodo (`repo2docker 10.5281/zenodo.1476680`) so Zenodo software archives can be used as the source in addition to a git repository
2 changes: 1 addition & 1 deletion docs/source/usage.rst
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ Using ``repo2docker``

``repo2docker`` can build a reproducible computational environment for any repository that
follows :ref:`specification`. repo2docker is called with the URL of a Git repository,
a Zenodo DOI or a path to a local directory. It then
a DOI from Zenodo or Figshare, or a path to a local directory. It then
performs these steps:

1. Inspects the repository for :ref:`configuration files <config-files>`. These will be used to build
Expand Down
7 changes: 6 additions & 1 deletion repo2docker/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,12 @@ def _default_log_level(self):
# detecting if something will successfully `git clone` is very hard if all
# you can do is look at the path/URL to it.
content_providers = List(
[contentproviders.Local, contentproviders.Zenodo, contentproviders.Git],
[
contentproviders.Local,
contentproviders.Zenodo,
contentproviders.Figshare,
contentproviders.Git,
],
config=True,
help="""
Ordered list by priority of ContentProviders to try in turn to fetch
Expand Down
1 change: 1 addition & 0 deletions repo2docker/contentproviders/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from .git import Git
from .base import Local
from .zenodo import Zenodo
from .figshare import Figshare
90 changes: 90 additions & 0 deletions repo2docker/contentproviders/doi.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
import os
import json
import shutil
import logging

from os import makedirs
from os import path
from urllib import request # urlopen, Request
from urllib.error import HTTPError
from zipfile import ZipFile, is_zipfile

from .base import ContentProvider
from ..utils import copytree, deep_get
from ..utils import normalize_doi, is_doi
from .. import __version__


class DoiProvider(ContentProvider):
    """Provide contents of a repository identified by a DOI and some helper functions."""

    def urlopen(self, req, headers=None):
        """A urlopen() helper.

        Accepts either a URL string or a ``urllib.request.Request``, always
        sets a repo2docker User-Agent, and applies any extra ``headers``.

        Returns the open response object from ``urllib.request.urlopen``.
        """
        # someone passed a string, not a request
        if not isinstance(req, request.Request):
            req = request.Request(req)

        req.add_header("User-Agent", "repo2docker {}".format(__version__))
        if headers is not None:
            for key, value in headers.items():
                req.add_header(key, value)

        return request.urlopen(req)

    def doi2url(self, doi):
        """Resolve a DOI to the URL it redirects to.

        If ``doi`` is not a DOI it is assumed to already be a URL and is
        returned unchanged. If the DOI does not resolve (HTTP error from
        doi.org) the normalized DOI string is returned as-is.
        """
        if is_doi(doi):
            doi = normalize_doi(doi)

            try:
                resp = self.urlopen("https://doi.org/{}".format(doi))
            # If the DOI doesn't resolve, just return URL
            except HTTPError:
                return doi
            return resp.url
        else:
            # Just return what is actually just a URL
            return doi

    def fetch_file(self, file_ref, host, output_dir, unzip=False):
        """Download one file of a record into ``output_dir``.

        ``file_ref`` is one entry of the record's file list; ``host`` maps
        logical keys ("download", "filename") to paths inside that entry.
        Yields human-readable progress messages (this is a generator).

        The assumption is that ``unzip=True`` means that this is the only
        file related to a record: the archive is extracted in place, the
        downloaded zip removed, and a single wrapping directory flattened.
        """
        file_url = deep_get(file_ref, host["download"])
        fname = deep_get(file_ref, host["filename"])
        logging.debug("Downloading file {} as {}\n".format(file_url, fname))
        with self.urlopen(file_url) as src:
            if path.dirname(fname):
                sub_dir = path.join(output_dir, path.dirname(fname))
                if not path.exists(sub_dir):
                    yield "Creating {}\n".format(sub_dir)
                    makedirs(sub_dir, exist_ok=True)

            dst_fname = path.join(output_dir, fname)
            with open(dst_fname, "wb") as dst:
                yield "Fetching {}\n".format(fname)
                shutil.copyfileobj(src, dst)
            # first close the newly written file, then continue
            # processing it
            if unzip and is_zipfile(dst_fname):
                yield "Extracting {}\n".format(fname)
                # context manager guarantees the archive handle is closed
                # even if extraction raises
                with ZipFile(dst_fname) as zfile:
                    zfile.extractall(path=output_dir)

                # delete downloaded file ...
                os.remove(dst_fname)
                # ... and any directories we might have created,
                # in which case sub_dir will be defined
                if path.dirname(fname):
                    shutil.rmtree(sub_dir)

                new_subdirs = os.listdir(output_dir)
                # if there is only one new subdirectory move its contents
                # to the top level directory
                if len(new_subdirs) == 1:
                    d = new_subdirs[0]
                    copytree(path.join(output_dir, d), output_dir)
                    shutil.rmtree(path.join(output_dir, d))

        yield "Fetched files: {}\n".format(os.listdir(output_dir))
97 changes: 97 additions & 0 deletions repo2docker/contentproviders/figshare.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
import os
import re
import json
import shutil

from os import makedirs
from os import path
from urllib.request import Request
from urllib.error import HTTPError
from zipfile import is_zipfile

from .doi import DoiProvider
from ..utils import copytree, deep_get


class Figshare(DoiProvider):
    """Provide contents of a Figshare article.
    See https://docs.figshare.com/#public_article for API docs.
    Examples:
    - https://doi.org/10.6084/m9.figshare.9782777
    - https://doi.org/10.6084/m9.figshare.9782777.v2
    - https://figshare.com/articles/binder-examples_requirements/9784088 (only one zipfile, no DOI)
    """

    # matches .../articles/<title>/<article id>[/<version>]
    url_regex = re.compile(r"(.*)/articles/([^/]+)/(\d+)(/)?(\d+)?")

    def __init__(self):
        # Known hosts: URL prefixes where articles live, the metadata API
        # endpoint, and where in the metadata to find the file list, each
        # file's name and its download URL.
        self.hosts = [
            {
                "hostname": [
                    "https://figshare.com/articles/",
                    "http://figshare.com/articles/",
                    "https://figshare.com/account/articles/",
                ],
                "api": "https://api.figshare.com/v2/articles/",
                "filepath": "files",
                "filename": "name",
                "download": "download_url",
            }
        ]

    def detect(self, doi, ref=None, extra_args=None):
        """Trigger this provider for things that resolve to a Figshare article"""
        url = self.doi2url(doi)

        for host in self.hosts:
            if not any(url.startswith(prefix) for prefix in host["hostname"]):
                continue
            match = self.url_regex.match(url)
            if match is None:
                return None
            self.article_id = match.group(3)
            # no explicit version in the URL means version 1
            self.article_version = match.group(5) or "1"
            return {
                "article": self.article_id,
                "host": host,
                "version": self.article_version,
            }

    def fetch(self, spec, output_dir, yield_output=False):
        """Fetch and unpack a Figshare article"""
        article_id = spec["article"]
        article_version = spec["version"]
        host = spec["host"]

        yield "Fetching Figshare article {} in version {}.\n".format(
            article_id, article_version
        )
        api_url = "{}{}/versions/{}".format(host["api"], article_id, article_version)
        req = Request(api_url, headers={"accept": "application/json"})
        resp = self.urlopen(req)
        article = json.loads(resp.read().decode("utf-8"))

        # only fetch files where is_link_only: False
        files = [entry for entry in deep_get(article, host["filepath"])
                 if not entry["is_link_only"]]
        single_file = len(files) == 1
        for file_ref in files:
            unzip = single_file and file_ref["name"].endswith(".zip")
            yield from self.fetch_file(file_ref, host, output_dir, unzip)

    @property
    def content_id(self):
        """The Figshare article ID"""
        return "{}.v{}".format(self.article_id, self.article_version)
93 changes: 13 additions & 80 deletions repo2docker/contentproviders/zenodo.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,54 +4,21 @@

from os import makedirs
from os import path
from urllib.request import urlopen, Request
from urllib.request import Request
from urllib.error import HTTPError
from zipfile import ZipFile, is_zipfile

from .base import ContentProvider
from .doi import DoiProvider
from ..utils import copytree, deep_get
from ..utils import normalize_doi, is_doi
from .. import __version__


class Zenodo(ContentProvider):
class Zenodo(DoiProvider):
"""Provide contents of a Zenodo deposit."""

def _urlopen(self, req, headers=None):
"""A urlopen() helper"""
# someone passed a string, not a request
if not isinstance(req, Request):
req = Request(req)

req.add_header("User-Agent", "repo2docker {}".format(__version__))
if headers is not None:
for key, value in headers.items():
req.add_header(key, value)

return urlopen(req)

def _doi2url(self, doi):
# Transform a DOI to a URL
# If not a doi, assume we have a URL and return
if is_doi(doi):
doi = normalize_doi(doi)

try:
resp = self._urlopen("https://doi.org/{}".format(doi))
# If the DOI doesn't resolve, just return URL
except HTTPError:
return doi
return resp.url
else:
# Just return what is actulally just a URL
return doi

def detect(self, doi, ref=None, extra_args=None):
"""Trigger this provider for things that resolve to a Zenodo/Invenio record"""
def __init__(self):
# We need the hostname (url where records are), api url (for metadata),
# filepath (path to files in metadata), filename (path to filename in
# metadata), download (path to file download URL), and type (path to item type in metadata)
hosts = [
self.hosts = [
{
"hostname": ["https://zenodo.org/record/", "http://zenodo.org/record/"],
"api": "https://zenodo.org/api/records/",
Expand All @@ -73,9 +40,11 @@ def detect(self, doi, ref=None, extra_args=None):
},
]

url = self._doi2url(doi)
def detect(self, doi, ref=None, extra_args=None):
"""Trigger this provider for things that resolve to a Zenodo/Invenio record"""
url = self.doi2url(doi)

for host in hosts:
for host in self.hosts:
if any([url.startswith(s) for s in host["hostname"]]):
self.record_id = url.rsplit("/", maxsplit=1)[1]
return {"record": self.record_id, "host": host}
Expand All @@ -90,53 +59,17 @@ def fetch(self, spec, output_dir, yield_output=False):
"{}{}".format(host["api"], record_id),
headers={"accept": "application/json"},
)
resp = self._urlopen(req)
resp = self.urlopen(req)

record = json.loads(resp.read().decode("utf-8"))

def _fetch(file_ref, unzip=False):
# the assumption is that `unzip=True` means that this is the only
# file related to the zenodo record
with self._urlopen(deep_get(file_ref, host["download"])) as src:
fname = deep_get(file_ref, host["filename"])
if path.dirname(fname):
sub_dir = path.join(output_dir, path.dirname(fname))
if not path.exists(sub_dir):
yield "Creating {}\n".format(sub_dir)
makedirs(sub_dir, exist_ok=True)

dst_fname = path.join(output_dir, fname)
with open(dst_fname, "wb") as dst:
yield "Fetching {}\n".format(fname)
shutil.copyfileobj(src, dst)
# first close the newly written file, then continue
# processing it
if unzip and is_zipfile(dst_fname):
yield "Extracting {}\n".format(fname)
zfile = ZipFile(dst_fname)
zfile.extractall(path=output_dir)
zfile.close()

# delete downloaded file ...
os.remove(dst_fname)
# ... and any directories we might have created,
# in which case sub_dir will be defined
if path.dirname(fname):
shutil.rmtree(sub_dir)

new_subdirs = os.listdir(output_dir)
# if there is only one new subdirectory move its contents
# to the top level directory
if len(new_subdirs) == 1:
d = new_subdirs[0]
copytree(path.join(output_dir, d), output_dir)
shutil.rmtree(path.join(output_dir, d))

is_software = deep_get(record, host["type"]).lower() == "software"
files = deep_get(record, host["filepath"])
only_one_file = len(files) == 1
for file_ref in files:
for line in _fetch(file_ref, unzip=is_software and only_one_file):
for line in self.fetch_file(
file_ref, host, output_dir, is_software and only_one_file
):
yield line

@property
Expand Down
41 changes: 41 additions & 0 deletions tests/unit/contentproviders/test_doi.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
import json
import os
import re
import urllib
import pytest
import tempfile
import logging

from unittest.mock import patch, MagicMock, mock_open
from zipfile import ZipFile

from repo2docker.contentproviders.doi import DoiProvider
from repo2docker.contentproviders.base import ContentProviderException


def test_content_id():
    """A bare DoiProvider exposes no content id."""
    provider = DoiProvider()
    assert provider.content_id is None


def fake_urlopen(req):
    """Test double for urlopen(): log the request and echo back its headers."""
    print(req)
    return req.headers


@patch("urllib.request.urlopen", fake_urlopen)
def test_url_headers():
    """Custom headers plus the default User-Agent all end up on the request."""
    doi = DoiProvider()

    headers = {"test1": "value1", "Test2": "value2"}
    result = doi.urlopen("https://mybinder.org", headers=headers)
    assert "Test1" in result
    assert "Test2" in result
    # User-agent is also set, so three headers in total. Use ``==``: ``is``
    # on an int literal is an identity check that only happens to pass via
    # CPython small-int caching and raises SyntaxWarning on Python 3.8+.
    assert len(result) == 3


def test_unresolving_doi():
    """doi2url() hands back a DOI unchanged when it cannot be resolved."""
    doi = DoiProvider()

    fakedoi = "10.1/1234"
    # Compare values with ``==``; ``is`` tests object identity, which is an
    # implementation detail of doi2url returning the same string object.
    assert doi.doi2url(fakedoi) == fakedoi
Loading

0 comments on commit e33d5f8

Please sign in to comment.